
Following the Hadoop environment preparation covered earlier, we can now build the Hadoop cluster itself. A quick overview of my environment: five virtual machines, node1 through node5.

node1 and node2 are the NameNodes

node3 through node5 are the DataNodes
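
Everything below refers to the machines by hostname, so each node must be able to resolve node1 through node5. A minimal /etc/hosts sketch (the addresses are placeholders; substitute your own):

# append on every node; example IPs only
192.168.1.101 node1
192.168.1.102 node2
192.168.1.103 node3
192.168.1.104 node4
192.168.1.105 node5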

Download the installation package: I'm using the 2.7.7 release.
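
If you still need the archive, it can be pulled from the Apache archive (URL assumes the standard layout for the 2.7.7 release):

wget -P /data/soft/new/hadoop https://archive.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz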

Configuration

cd /data/soft/new/hadoop
# extract the archive
tar -zxvf hadoop-2.7.7.tar.gz
cd /data/soft/new/hadoop/hadoop-2.7.7/etc/hadoop

In the Hadoop configuration directory we need to edit a handful of files. I'll paste the files from my environment for reference; you can replace yours outright, since the default files are essentially empty and replacing them has no side effects.

core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
	<!-- The NameService of the distributed file system (HDFS) is cluster1; this URI identifies the NameNode pair -->
	<property>
		<name>fs.defaultFS</name>
		<value>hdfs://cluster1</value>
	</property>
	<!-- Buffer size for sequence files. It should be a multiple of the hardware page size and determines how much data is buffered during read and write operations -->
	<property>
		<name>io.file.buffer.size</name>
		<value>131072</value>
	</property>
	<!-- Hadoop temporary directory -->
	<property>
		<name>hadoop.tmp.dir</name>
		<value>/data/soft/new/tmp</value>
	</property>
	<!-- Allow the hadoop proxy user to connect from any host -->
	<property>
		<name>hadoop.proxyuser.hadoop.hosts</name>
		<value>*</value>
	</property>
	<!-- Allow the hadoop proxy user to impersonate users from any group -->
	<property>
		<name>hadoop.proxyuser.hadoop.groups</name>
		<value>*</value>
	</property>
	<!-- ZooKeeper quorum addresses -->
	<property>
		<name>ha.zookeeper.quorum</name>
		<value>node3:2181,node4:2181,node5:2181</value>
	</property>
</configuration>
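
hadoop.tmp.dir above points at /data/soft/new/tmp. Hadoop will usually create it on demand, but pre-creating it on every node avoids permission surprises; a small sketch, assuming the hadoop user runs the daemons:

# run on all five nodes
mkdir -p /data/soft/new/tmp
chown -R hadoop:hadoop /data/soft/new/tmp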

hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
	<!-- The HDFS NameService is cluster1; must match core-site.xml -->
	<property>
		<name>dfs.nameservices</name>
		<value>cluster1</value>
	</property>
	<!-- cluster1 has two NameNodes: node1 (nna) and node2 (nns) -->
	<property>
		<name>dfs.ha.namenodes.cluster1</name>
		<value>node1,node2</value>
	</property>
	<!-- RPC address of the nna node -->
	<property>
		<name>dfs.namenode.rpc-address.cluster1.node1</name>
		<value>node1:9000</value>
	</property>
	<!-- RPC address of the nns node -->
	<property>
		<name>dfs.namenode.rpc-address.cluster1.node2</name>
		<value>node2:9000</value>
	</property>
	<!-- HTTP address of the nna node -->
	<property>
		<name>dfs.namenode.http-address.cluster1.node1</name>
		<value>node1:50070</value>
	</property>
	<!-- HTTP address of the nns node -->
	<property>
		<name>dfs.namenode.http-address.cluster1.node2</name>
		<value>node2:50070</value>
	</property>
	<!-- Where the NameNodes write their shared edit log on the JournalNodes -->
	<property>
		<name>dfs.namenode.shared.edits.dir</name>
		<value>qjournal://node3:8485;node4:8485;node5:8485/cluster1</value>
	</property>
	<!-- Proxy provider that lets clients fail over automatically -->
	<property>
		<name>dfs.client.failover.proxy.provider.cluster1</name>
		<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
	</property>
	<!-- Fencing method -->
	<property>
		<name>dfs.ha.fencing.methods</name>
		<value>sshfence</value>
	</property>
	<!-- sshfence requires passwordless SSH between the NameNodes -->
	<property>
		<name>dfs.ha.fencing.ssh.private-key-files</name>
		<value>/home/hadoop/.ssh/id_rsa</value>
	</property>
	<!-- Local directory on each JournalNode where the edits are stored -->
	<property>
		<name>dfs.journalnode.edits.dir</name>
		<value>/data/soft/new/tmp/journal</value>
	</property>
	<!-- Enable automatic failover -->
	<property>
		<name>dfs.ha.automatic-failover.enabled</name>
		<value>true</value>
	</property>
	<!-- Storage directory for the NameNode namespace (fsimage) -->
	<property>
		<name>dfs.namenode.name.dir</name>
		<value>/data/soft/new/dfs/name</value>
	</property>
	<!-- Storage directory for DataNode blocks -->
	<property>
		<name>dfs.datanode.data.dir</name>
		<value>/data/soft/new/dfs/data</value>
	</property>
	<!-- Replication factor -->
	<property>
		<name>dfs.replication</name>
		<value>3</value>
	</property>
	<!-- Allow access to HDFS directories over the web (WebHDFS) -->
	<property>
		<name>dfs.webhdfs.enabled</name>
		<value>true</value>
	</property>
	<!-- Bind the JournalNodes to 0.0.0.0 so they are reachable on both internal and external addresses -->
	<property>
		<name>dfs.journalnode.http-address</name>
		<value>0.0.0.0:8480</value>
	</property>
	<property>
		<name>dfs.journalnode.rpc-address</name>
		<value>0.0.0.0:8485</value>
	</property>
	<!-- ZooKeeper quorum used by the ZKFailoverController for automatic failover -->
	<property>
		<name>ha.zookeeper.quorum</name>
		<value>node3:2181,node4:2181,node5:2181</value>
	</property>
</configuration>
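
Note that sshfence only works if the user running the NameNodes can SSH from each NameNode to the other without a password, using the private key configured above. If that isn't set up yet, a minimal sketch, assuming the hadoop user on node1 and node2:

# as the hadoop user on node1 (then repeat from node2 toward node1)
ssh-keygen -t rsa -f ~/.ssh/id_rsa -N ""
ssh-copy-id hadoop@node2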

 

yarn-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
 <!-- Retry interval (ms) before reconnecting to the RM (ResourceManager) after losing contact -->
  <property>
    <name>yarn.resourcemanager.connect.retry-interval.ms</name>
    <value>2000</value>
  </property>
  <!-- Enable ResourceManager HA; default is false -->
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <!-- Logical IDs for the ResourceManagers -->
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>node3:2181,node4:2181,node5:2181</value>
  </property>
  <!-- Enable automatic failover -->
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <!-- rm1 configuration start -->
  <!-- Hostname behind the rm1 alias (the initially active ResourceManager) -->
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>node1</value>
  </property>
  <!-- Hostname behind the rm2 alias (the standby ResourceManager) -->
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>node2</value>
  </property>
  <!-- Set this to rm1 on the nna node and rm2 on the nns node; after copying the files to the other machines, be sure to change it on the other YARN node -->
  <property>
    <name>yarn.resourcemanager.ha.id</name>
    <value>rm1</value>
  </property>
  <!-- Enable automatic recovery -->
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <!-- ZooKeeper connection address for the state store -->
  <property>
    <name>yarn.resourcemanager.zk-state-store.address</name>
    <value>node3:2181,node4:2181,node5:2181</value>
  </property>
  <!-- Class used to persist RM (ResourceManager) state, backed by ZooKeeper -->
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
  </property>
  <!-- ZooKeeper address used by the RM (ResourceManager) for state storage and HA -->
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>node3:2181,node4:2181,node5:2181</value>
  </property>
  <!-- Cluster ID -->
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>cluster1-yarn</value>
  </property>
  <!-- How long (ms) the ApplicationMaster waits between attempts to reconnect to the scheduler -->
  <property>
    <name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
    <value>5000</value>
  </property>
  <!-- rm1 client/application management address -->
  <property>
    <name>yarn.resourcemanager.address.rm1</name>
    <value>node1:8132</value>
  </property>
  <!-- Scheduler address -->
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm1</name>
    <value>node1:8130</value>
  </property>
  <!-- RM web UI address -->
  <property>
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>node1:8188</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
    <value>node1:8131</value>
  </property>
  <!-- RM admin address -->
  <property>
    <name>yarn.resourcemanager.admin.address.rm1</name>
    <value>node1:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.admin.address.rm1</name>
    <value>node1:23142</value>
  </property>
  <!-- rm1 configuration end -->
  <!-- rm2 configuration start -->
  <!-- rm2 mirrors the rm1 settings, with the nna hostname replaced by the nns hostname -->
  <property>
    <name>yarn.resourcemanager.address.rm2</name>
    <value>node2:8132</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm2</name>
    <value>node2:8130</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>node2:8188</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
    <value>node2:8131</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address.rm2</name>
    <value>node2:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.admin.address.rm2</name>
    <value>node2:23142</value>
  </property>
  <!-- rm2 configuration end -->
  <!-- Auxiliary service of the NM (NodeManager); must be set to mapreduce_shuffle to run MapReduce jobs -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <!-- Shuffle handler class -->
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <!-- NM (NodeManager) local file paths -->
  <property>
    <name>yarn.nodemanager.local-dirs</name>
    <value>/data/soft/new/yarn/local</value>
  </property>
  <!-- NM (NodeManager) log paths -->
  <property>
    <name>yarn.nodemanager.log-dirs</name>
    <value>/data/soft/new/log/yarn</value>
  </property>
  <!-- Port on which the ShuffleHandler serves map output to requesting reducers -->
  <property>
    <name>mapreduce.shuffle.port</name>
    <value>23080</value>
  </property>
  <!-- Failover proxy provider class -->
  <property>
    <name>yarn.client.failover-proxy-provider</name>
    <value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
  </property>
  <!-- ZooKeeper base path for automatic failover -->
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
    <value>/yarn-leader-election</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.address</name>
    <value>node1:9001</value>
  </property>
 <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
 </property>
 <property>
     <name>yarn.nodemanager.remote-app-log-dir</name>
     <value>/tmp/logs</value>
 </property>
 <property>
     <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
     <value>logs</value>     
  </property>
 <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>259200</value>
 </property>
 <property>
    <name>yarn.log-aggregation.retain-check-interval-seconds</name>
    <value>3600</value>
 </property>
<property>
    <name>yarn.web-proxy.address</name>
    <value>node1:8090</value>
</property> 
<!-- CLASSPATH for YARN applications -->
 <property>
     <description>
       CLASSPATH for YARN applications. A comma-separated list
       of CLASSPATH entries. When this value is empty, the following default
       CLASSPATH for YARN applications would be used. 
       For Linux:
       HADOOP_CONF_DIR,
       $HADOOP_COMMON_HOME/share/hadoop/common/*,
       $HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
       $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
       $HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
       $HADOOP_YARN_HOME/share/hadoop/yarn/*,
       $HADOOP_YARN_HOME/share/hadoop/yarn/lib/*
    </description>
    <name>yarn.application.classpath</name>
    <value>/data/soft/new/hadoop/hadoop-2.7.7/etc/hadoop,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/common/*,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/common/lib/*,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/hdfs/*,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/hdfs/lib/*,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/yarn/*,/data/soft/new/hadoop/hadoop-2.7.7/share/hadoop/yarn/lib/*</value>
</property>

<!-- Configure the Fair scheduling policy -->
<property>
     <name>yarn.resourcemanager.scheduler.class</name>
     <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
    <name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
    <value>true</value>
</property>
<!-- Fair Scheduler allocation (queue) file -->
<property>
    <name>yarn.scheduler.fair.allocation.file</name>
    <value>/data/soft/new/hadoop/hadoop-2.7.7/etc/hadoop/fair-scheduler.xml</value>
</property>
<property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1024</value>
</property>
<property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>1</value>
</property>
<property>
  <name>yarn.nodemanager.vmem-pmem-ratio</name>
  <value>4.2</value>
</property>
</configuration>
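
One gotcha with yarn.web-proxy.address: it configures a standalone web application proxy that start-yarn.sh does not launch, so it has to be started separately on node1 once everything else is up:

yarn-daemon.sh start proxyserver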

mapred-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
	<!-- Execution framework for MapReduce jobs -->
	<property>
		<name>mapreduce.framework.name</name>
		<value>yarn</value>
	</property>
	<!-- MapReduce JobHistory Server address; default port 10020 -->
	<property>
		<name>mapreduce.jobhistory.address</name>
		<value>0.0.0.0:10020</value>
	</property>
	<!-- MapReduce JobHistory Server web UI address; default port 19888 -->
	<property>
		<name>mapreduce.jobhistory.webapp.address</name>
		<value>node1:19888</value>
	</property>
	<property>
		<name>yarn.app.mapreduce.am.resource.mb</name>
		<value>512</value>
	</property>
	<property>
		<name>mapreduce.map.memory.mb</name>
		<value>512</value>
	</property>
	<property>
		<name>mapreduce.map.java.opts</name>
		<value>-Xmx512M</value>
	</property>
	<property>
		<name>mapreduce.reduce.memory.mb</name>
		<value>512</value>
	</property>
	<property>
		<name>mapreduce.reduce.java.opts</name>
		<value>-Xmx512M</value>
	</property>
	<property>
		<name>mapred.child.java.opts</name>
		<value>-Xmx512M</value>
	</property>
</configuration>
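
The two jobhistory addresses only take effect if the JobHistory Server is actually running; start-dfs.sh and start-yarn.sh do not start it. Once the cluster is up, on node1:

mr-jobhistory-daemon.sh start historyserver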

That completes the four main Hadoop configuration files. Each carries brief comments; it's worth looking up the individual properties to understand them fully.

The reference book also includes a fair scheduling policy, which I haven't tested yet; it is wired up in yarn-site.xml (yarn.scheduler.fair.allocation.file) and looks like this:

fair-scheduler.xml

<?xml version="1.0"?>
<allocations>
	<queue name="root">
		<aclSubmitApps>hadoop</aclSubmitApps>
		<aclAdministerApps>hadoop</aclAdministerApps>
		
		<queue name="default">
			<maxRunningApps>10</maxRunningApps>
			<minResources>1024mb,1vcores</minResources>
			<maxResources>2048mb,2vcores</maxResources>
			<schedulingPolicy>fair</schedulingPolicy>
			<weight>1.0</weight>
			<aclSubmitApps>hadoop</aclSubmitApps>
			<aclAdministerApps>hadoop</aclAdministerApps>
		</queue>
		
		<queue name="hadoop">
			<maxRunningApps>10</maxRunningApps>
			<minResources>1024mb,1vcores</minResources>
			<maxResources>3072mb,3vcores</maxResources>
			<schedulingPolicy>fair</schedulingPolicy>
			<weight>1.0</weight>
			<aclSubmitApps>hadoop</aclSubmitApps>
			<aclAdministerApps>hadoop</aclAdministerApps>
		</queue>
		
		<queue name="queue_1024_01">
			<maxRunningApps>10</maxRunningApps>
			<minResources>1000mb,1vcores</minResources>
			<maxResources>2048mb,2vcores</maxResources>
			<schedulingPolicy>fair</schedulingPolicy>
			<weight>1.0</weight>
			<aclSubmitApps>hadoop,hduser1024</aclSubmitApps>
			<aclAdministerApps>hadoop,hduser1024</aclAdministerApps>
		</queue>
	</queue>
	
	<fairSharePreemptionTimeout>600000</fairSharePreemptionTimeout>
	<defaultMinSharePreemptionTimeout>600000</defaultMinSharePreemptionTimeout>
</allocations>
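
To check that the queues take effect once the cluster is running, you can submit a test job into one of them; a sketch using the examples jar bundled with the distribution (root.hadoop is the queue defined above):

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar pi -Dmapreduce.job.queuename=root.hadoop 2 10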

Next come the environment variables. JAVA_HOME has to be set in hadoop-env.sh and yarn-env.sh (the scripts are supposed to pick it up from the shell environment, but for whatever reason mine didn't, so I hard-coded the JDK directory):

vim hadoop-env.sh
# set JAVA_HOME
export JAVA_HOME=/data/soft/new/jdk/jdk1.8.0_11
:wq
vim yarn-env.sh
# set JAVA_HOME
export JAVA_HOME=/data/soft/new/jdk/jdk1.8.0_11
:wq
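
A quick way to confirm the scripts now pick up the JDK is to run the hadoop command directly; it sources hadoop-env.sh and will complain immediately if JAVA_HOME is still unset:

/data/soft/new/hadoop/hadoop-2.7.7/bin/hadoop version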

The last file to edit is slaves, which lists all the DataNode hosts.

Replace its contents with the following:

node3
node4
node5

At this point every file is configured. Check that none are missing:

core-site.xml

hdfs-site.xml

yarn-site.xml

mapred-site.xml

fair-scheduler.xml

hadoop-env.sh

yarn-env.sh

slaves

Copy everything to the other four nodes:

# copy the directory to the other nodes
scp -r hadoop-2.7.7 hadoop@node2:/data/soft/new/hadoop

# on node2, edit yarn-site.xml
# and change yarn.resourcemanager.ha.id to rm2
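
The same copy has to reach node3 through node5 as well, and node2 needs its HA id flipped. A sketch (the sed one-liner is just a convenience; editing the file by hand works equally well):

# from node1
for h in node3 node4 node5; do scp -r hadoop-2.7.7 hadoop@$h:/data/soft/new/hadoop; done
# on node2: flip the HA id from rm1 to rm2
sed -i 's#<value>rm1</value>#<value>rm2</value>#' /data/soft/new/hadoop/hadoop-2.7.7/etc/hadoop/yarn-site.xml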

Configure the environment variables:

vim /etc/profile
export HADOOP_HOME=/data/soft/new/hadoop/hadoop-2.7.7
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
:wq
source /etc/profile
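
With PATH updated (do this on all five nodes), the Hadoop commands should resolve from anywhere:

hadoop version
which hdfs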

Startup

1. First bring up ZooKeeper (on the three ZooKeeper nodes):

zkServer.sh start 
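
Before moving on it's worth confirming the ensemble actually formed; one node should report leader and the other two follower:

zkServer.sh status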

2. Start the JournalNodes on the DataNode hosts (node3-node5):

hadoop-daemon.sh start journalnode

3. On the first startup of the cluster, format the NameNode (on node1):

hdfs namenode -format

4. Initialize the HA znode in ZooKeeper:

hdfs zkfc -formatZK

5. Start the distributed file system:

start-dfs.sh

6. Start the YARN services:

start-yarn.sh

7. On node2, sync the metadata from node1:

hdfs namenode -bootstrapStandby

8. Running jps on node2 at this point shows only the DFSZKFailoverController process, so start the NameNode and ResourceManager there manually:

hadoop-daemon.sh start namenode
yarn-daemon.sh start resourcemanager
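
For reference, roughly what jps should report on each node once everything is up, plus a check of which NameNode is active (the process lists assume this exact node layout; one NameNode should report active, the other standby):

# node1 / node2: NameNode, ResourceManager, DFSZKFailoverController
# node3 - node5: DataNode, NodeManager, JournalNode, QuorumPeerMain
jps
hdfs haadmin -getServiceState node1
hdfs haadmin -getServiceState node2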
 

At this point the Hadoop cluster is up and running.

Let's verify:

http://node1:50070

http://node1:8188

 

Working with HDFS

A few basic commands:

List a directory in HDFS:

hdfs dfs -ls /

Upload a file to HDFS:

hdfs dfs -put file /file

Create a directory in HDFS:

hdfs dfs -mkdir /home

Download a file from HDFS:

hdfs dfs -get /file ./file
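
As a quick end-to-end smoke test, push a file into HDFS and run the bundled wordcount job over it (paths here are just an example):

echo "hello hadoop" > /tmp/words.txt
hdfs dfs -mkdir -p /test
hdfs dfs -put /tmp/words.txt /test/
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar wordcount /test/words.txt /test/out
hdfs dfs -cat /test/out/part-r-00000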