1. Set up 3 virtual machines
2. Create the account and establish SSH trust between the nodes (passwordless SSH; a sketch follows)
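A rough sketch of what this usually involves, assuming a dedicated spark account (the user name is my assumption) and the three IPs used throughout this guide:

# on every node: create the account that will run Hadoop and Spark
useradd spark
passwd spark
# on 10.0.0.5, as that user: generate a key and push it to all three nodes
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa
ssh-copy-id spark@10.0.0.5
ssh-copy-id spark@10.0.0.6
ssh-copy-id spark@10.0.0.7
ssh spark@10.0.0.6 hostname   # should not ask for a password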
3. Install Java
wget jdk-xxx
rpm -i jdk-xxx
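Optional sanity check that the JDK landed where the JAVA_HOME set in step 4 expects it:

/usr/java/jdk1.8.0_141/bin/java -version   # should print 1.8.0_141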
4. Add the environment variables (on all machines)
export JAVA_HOME=/usr/java/jdk1.8.0_141
export JRE_HOME=$JAVA_HOME/jre
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/jre/bin
export CLASSPATH=$CLASSPATH:.:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
export HADOOP_HOME=/data/spark/bin/hadoop
export PATH=$PATH:$HADOOP_HOME/bin/:$HADOOP_HOME/sbin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/data/spark/bin/spark
export PATH=$PATH:$SPARK_HOME/bin
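These exports only last for the current shell. One way to make them permanent on every machine (my assumption, not specified in the original) is a profile snippet:

vi /etc/profile.d/spark-env.sh              # paste the export lines above (file name is arbitrary)
source /etc/profile.d/spark-env.sh
echo $JAVA_HOME $HADOOP_HOME $SPARK_HOME    # quick check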
5. Set up Hadoop
1>vi $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_141
2>vi $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
    <property><name>fs.defaultFS</name><value>hdfs://10.0.0.5:9000</value></property>
    <property><name>hadoop.tmp.dir</name><value>/data/spark/bin/hadoop/tmp</value></property>
</configuration>
3>vi $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
    <property><name>dfs.namenode.name.dir</name><value>file:///data/spark/hdfs/name</value></property>
    <property><name>dfs.datanode.data.dir</name><value>file:///data1/hdfs-ext,file:///data2/hdfs-ext,file:///data3/hdfs-ext</value></property>
    <property><name>dfs.namenode.checkpoint.dir</name><value>/data/spark/hdfs/namesecondary</value></property>
    <property><name>dfs.namenode.http-address</name><value>0.0.0.0:50070</value></property>
    <property><name>dfs.namenode.secondary.http-address</name><value>0.0.0.0:50090</value></property>
    <property><name>dfs.datanode.http.address</name><value>0.0.0.0:50075</value></property>
    <property><name>dfs.namenode.datanode.registration.ip-hostname-check</name><value>false</value></property>
</configuration>
4>vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
    <property><name>yarn.resourcemanager.hostname</name><value>10.0.0.5</value></property>
    <property><name>yarn.nodemanager.local-dirs</name><value>/data/spark/hdfs/nm-local-dir</value></property>
    <property><name>yarn.nodemanager.aux-services</name><value>mapreduce_shuffle</value></property>
    <property><name>yarn.nodemanager.resource.memory-mb</name><value>8192</value></property>
    <property><name>yarn.nodemanager.resource.cpu-vcores</name><value>4</value></property>
    <property><name>yarn.resourcemanager.webapp.address</name><value>0.0.0.0:8088</value></property>
    <property><name>yarn.nodemanager.webapp.address</name><value>0.0.0.0:8042</value></property>
    <property><name>yarn.nodemanager.pmem-check-enabled</name><value>false</value></property>
    <property><name>yarn.nodemanager.vmem-check-enabled</name><value>false</value></property>
    <property><name>yarn.nodemanager.vmem-pmem-ratio</name><value>5</value></property>
</configuration>
5>vi $HADOOP_HOME/etc/hadoop/slaves
10.0.0.5
10.0.0.6
10.0.0.7
6> Copy the Hadoop directory to every slave node and set PATH there (one way is sketched below)
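A sketch, assuming the same /data/spark/bin layout on every node and the passwordless SSH from step 2:

for h in 10.0.0.6 10.0.0.7; do
    rsync -a /data/spark/bin/hadoop/ $h:/data/spark/bin/hadoop/   # or: scp -r
    scp /etc/profile.d/spark-env.sh $h:/etc/profile.d/            # reuse the env file sketched in step 4 (assumed name)
done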
7> Format the HDFS namenode
hdfs namenode -format
8> Start HDFS and check the logs
start-dfs.sh
9> Start YARN and check the logs
start-yarn.sh
10> Check the daemon processes on every node; be sure to read the logs
jps
Again: be sure to read the logs (a quick cross-node check is sketched below).
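Rough expectation from the configuration above: 10.0.0.5 should typically show NameNode, SecondaryNameNode and ResourceManager, and all three nodes (they are all listed in slaves) should show DataNode and NodeManager. A sketch of the check, assuming passwordless SSH and Hadoop's default log location:

for h in 10.0.0.5 10.0.0.6 10.0.0.7; do echo "== $h =="; ssh $h "$JAVA_HOME/bin/jps"; done
# if a daemon is missing, its log under $HADOOP_HOME/logs says why, e.g.:
tail -n 100 $HADOOP_HOME/logs/hadoop-*-namenode-*.log
tail -n 100 $HADOOP_HOME/logs/yarn-*-resourcemanager-*.log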
11> Run a test and check the logs
cd /xxx
echo "this is a test for hdfs" > 1.txt
hadoop fs -mkdir /spark
hadoop fs -mkdir /spark/test
hadoop fs -appendToFile 1.txt hdfs://10.0.0.5:9000/spark/test/1.txt
hadoop fs -cat hdfs://10.0.0.5:9000/spark/test/1.txt
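Two more checks that use only what was configured above (port 50070 from hdfs-site.xml, 8088 from yarn-site.xml):

hdfs dfsadmin -report    # should report 3 live datanodes
yarn node -list          # should report 3 node managers
# web UIs: http://10.0.0.5:50070 (HDFS) and http://10.0.0.5:8088 (YARN)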
6. Set up Spark
1> Edit spark-env.sh
mv $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
vi $SPARK_HOME/conf/spark-env.sh
export SPARK_HOME=/data/spark/bin/spark
export JAVA_HOME=/usr/java/jdk1.8.0_141
export HADOOP_HOME=/data/spark/bin/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_MASTER_IP=10.0.0.5
export SPARK_LOCAL_DIRS=/data/spark/bin/spark
export SPARK_LIBRARY_PATH=.:$JAVA_HOME/lib:$JAVA_HOME/jre/lib:$HADOOP_HOME/lib/native
export SPARK_LOG_DIR=/data/spark/bin/spark/logs
2> Edit spark-defaults.conf
mv $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
vi $SPARK_HOME/conf/spark-defaults.conf
spark.yarn.jars hdfs://10.0.0.5:9000/spark/jars/*
3> Upload the jars (verification sketched below)
cd $SPARK_HOME/jars
hadoop fs -mkdir /spark/jars
hadoop fs -put * hdfs://10.0.0.5:9000/spark/jars/
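Quick check that the path in spark.yarn.jars will actually resolve:

hadoop fs -ls hdfs://10.0.0.5:9000/spark/jars/ | wc -l   # should roughly match: ls $SPARK_HOME/jars | wc -l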
4> Edit slaves (not really needed here: this file only matters for Spark standalone mode, and this setup runs on YARN)
mv $SPARK_HOME/conf/slaves.template $SPARK_HOME/conf/slaves
vi $SPARK_HOME/conf/slaves
10.0.0.5
10.0.0.6
10.0.0.7
5> Local interactive test
pyspark --master local[4]
6> Cluster interactive test (YARN)
pyspark --master yarn --deploy-mode client
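While this shell is running, the application should be visible to the ResourceManager; a sanity check from another terminal:

yarn application -list    # a RUNNING application (named PySparkShell by default) should appear
# or open http://10.0.0.5:8088 (yarn.resourcemanager.webapp.address above)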
7> Create a test script: vi test.py
from __future__ import print_function

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("PythonPi") \
        .getOrCreate()

    # read the file written in the HDFS test above, count its lines and show the first one
    lines = spark.sparkContext.textFile("hdfs://10.0.0.5:9000/spark/test/1.txt")
    num = lines.count()
    p_str = lines.first()
    print("--------------------" + str(num) + "---------------------")
    print("--------------------" + p_str + "---------------------")

    spark.stop()
8> Local job test
spark-submit --master local[4] test.py
9> Cluster job test (YARN)
spark-submit --master yarn --deploy-mode cluster test.py
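In cluster mode the driver runs inside YARN, so the two print() lines from test.py do not come back to the submitting terminal. If log aggregation is enabled (it is not set in the yarn-site.xml above), they can be pulled back roughly like this; otherwise look in the NodeManager container logs on the nodes:

yarn application -list -appStates FINISHED    # find the application id of the PythonPi run
yarn logs -applicationId application_XXXXXXXXXXXXX_XXXX | grep -- "--------------------"   # id is a placeholder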