1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
| [root@bigdata-* soft]# echo -e "# hadoop\nexport HADOOP_HOME=/opt/hadoop\nexport HADOOP_PREFIX=\$HADOOP_HOME\nexport HADOOP_COMMON_HOME=\$HADOOP_PREFIX\nexport HADOOP_CONF_DIR=\$HADOOP_PREFIX/etc/hadoop\nexport HADOOP_HDFS_HOME=\$HADOOP_PREFIX\nexport HADOOP_MAPRED_HOME=\$HADOOP_PREFIX\nexport HADOOP_YARN_HOME=\$HADOOP_PREFIX\nexport PATH=\$PATH:\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin" >> /etc/profile
[root@bigdata-* soft]# source /etc/profile
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/jdk
export HADOOP_SSH_OPTS="-p 22"
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/mapred-env.sh
export JAVA_HOME=/opt/jdk
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/yarn-env.sh
export JAVA_HOME=/opt/jdk
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
  <!-- core-site.xml: settings shared by the HDFS, YARN and MapReduce daemons and clients. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://ns1</value>
    <description>Default file system URI. It names the logical HA nameservice (ns1, declared by dfs.nameservices in hdfs-site.xml) rather than a single NameNode host and port, so clients reach whichever NameNode is currently active.</description>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop/tmp</value>
    <description>Base directory for other temporary directories.</description>
  </property>
  <property>
    <name>hadoop.http.staticuser.user</name>
    <value>root</value>
    <description>User name the web UIs act as when rendering content.</description>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>bigdata-01:2181,bigdata-02:2181,bigdata-03:2181</value>
    <description>Comma-separated list of ZooKeeper server addresses (host:port) used for automatic failover.</description>
  </property>
  <property>
    <name>ha.zookeeper.session-timeout.ms</name>
    <value>10000</value>
    <description>ZooKeeper session timeout, in milliseconds.</description>
  </property>
  <property>
    <name>fs.trash.checkpoint.interval</name>
    <value>1440</value>
    <description>Interval, in minutes, between trash checkpoints.</description>
  </property>
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
    <description>Allowed values are simple (no authentication) or kerberos (a secure authentication system).</description>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>1440</value>
    <description>Time, in minutes, after which data in the trash is deleted. If set to 0, the trash feature is disabled.</description>
  </property>
</configuration>
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
  <!-- hdfs-site.xml: HDFS storage settings plus NameNode HA (nameservice ns1, NameNodes nn1 and nn2) backed by the Quorum Journal Manager. -->
  <property>
    <name>dfs.replication</name>
    <value>2</value>
    <description>Default number of replicas kept for each block.</description>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
    <description>Default block size for new files, in bytes (134217728 bytes = 128 MiB).</description>
  </property>
  <property>
    <name>dfs.permissions.enabled</name>
    <value>true</value>
    <description>true: permission checking is on; false: permission checking is off, other behavior is unchanged. Switching from one value to the other does not change the mode, owner or group of files or directories.</description>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/data/bigdata/hdfs/name</value>
    <description>Local path where the namespace and transaction logs are stored persistently.</description>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/data/bigdata/hdfs/data</value>
    <description>Path where the DataNode stores its data. Configured per node; multiple directories are separated by commas.</description>
  </property>
  <property>
    <name>dfs.datanode.max.transfer.threads</name>
    <value>16384</value>
    <description>Maximum number of threads used to transfer block data between DataNodes.</description>
  </property>
  <property>
    <name>dfs.datanode.balance.bandwidthPerSec</name>
    <value>52428800</value>
    <description>Specifies the maximum amount of bandwidth that each datanode can utilize for the balancing purpose in term of the number of bytes per second.</description>
  </property>
  <property>
    <name>dfs.datanode.balance.max.concurrent.moves</name>
    <value>50</value>
    <description>Raises the upper limit on the number of Xceivers that move blocks on a DataNode.</description>
  </property>
  <property>
    <name>dfs.nameservices</name>
    <value>ns1</value>
    <description>Logical name of the HDFS nameservice. It can be chosen freely, for example mycluster. Note that this name is used by systems built on HDFS, such as HBase. If you want to enable HDFS Federation, several logical names can be given in this parameter, separated by commas.</description>
  </property>
  <property>
    <name>dfs.ha.namenodes.ns1</name>
    <value>nn1,nn2</value>
    <description>IDs of the NameNodes that belong to nameservice ns1.</description>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn1</name>
    <value>bigdata-01:8020</value>
    <description>RPC address of nn1, i.e. the host where nn1 runs.</description>
  </property>
  <property>
    <name>dfs.namenode.http-address.ns1.nn1</name>
    <value>bigdata-01:50070</value>
    <description>HTTP address of nn1, used for external access.</description>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn2</name>
    <value>bigdata-02:8020</value>
    <description>RPC address of nn2, i.e. the host where nn2 runs.</description>
  </property>
  <property>
    <name>dfs.namenode.http-address.ns1.nn2</name>
    <value>bigdata-02:50070</value>
    <description>HTTP address of nn2, used for external access.</description>
  </property>
  <!-- Helper property: it is only referenced through variable expansion by dfs.namenode.shared.edits.dir below, so the hosts must be the real JournalNode hosts of this cluster. -->
  <property>
    <name>dfs.namenode.journalnode</name>
    <value>bigdata-01:8485;bigdata-02:8485;bigdata-03:8485</value>
    <description>JournalNodes synchronize the NameNode metadata in order to remove the Hadoop single point of failure. Use an odd number of them, usually 3 or 5.</description>
  </property>
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://${dfs.namenode.journalnode}/ns1</value>
    <description>Location on the JournalNodes where the NameNode metadata is stored (JournalNodes are usually deployed together with ZooKeeper).</description>
  </property>
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/data/hadoop/journal</value>
    <description>Location on the local disk where a JournalNode stores its data.</description>
  </property>
  <property>
    <name>dfs.client.failover.proxy.provider.ns1</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    <description>Clients access the NameNode and the file system through a proxy. This is the Java class that HDFS clients use to communicate with the Active node and to determine whether the Active node is alive.</description>
  </property>
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence</value>
    <description>Fencing method used during automatic failover. Several methods are available; see the official documentation for details. This one logs in remotely and kills the process. The value can take several forms; shell(/bin/true) can also be used: that script does nothing and returns 0.</description>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
    <description>Passwordless SSH login only needs to be configured when the sshfence fencing mechanism is used.</description>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.connect-timeout</name>
    <value>30000</value>
    <description>Timeout of the sshfence fencing mechanism. As with the property above, this probably does not need to be set if failover is done with a script.</description>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
    <description>Enables automatic failover. If you do not use automatic failover, this can be left unset for now.</description>
  </property>
</configuration>
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/mapred-site.xml
<configuration>
  <!-- mapred-site.xml: MapReduce framework selection and JobHistory server endpoints. -->

  <!-- Run MapReduce jobs on YARN. -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>

  <!-- JobHistory server address (host:port). -->
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>bigdata-01:10020</value>
  </property>

  <!-- JobHistory server web UI address (host:port). -->
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>bigdata-01:19888</value>
  </property>
</configuration>
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
  <!-- yarn-site.xml: ResourceManager HA (rm1 on bigdata-01, rm2 on bigdata-02) with ZooKeeper-backed state recovery, plus NodeManager and log aggregation settings. -->
  <property>
    <name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
    <value>5000</value>
    <description>Time to wait for a connection when the scheduler is unreachable.</description>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.connect.retry-interval.ms</name>
    <value>5000</value>
    <description>How often to try connecting to the ResourceManager.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
    <description>Whether RM HA is enabled. Defaults to false (disabled).</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
    <value>true</value>
    <description>Whether automatic failover is enabled. By default it is enabled when HA is enabled.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
    <value>true</value>
    <description>Enables the embedded automatic failover. By default it is enabled when HA is enabled.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>cluster1</value>
    <description>ID of the cluster. The elector uses this value to make sure the RM does not become active for another cluster.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
    <description>List of logical RM IDs (RM is the ResourceManager). Usually two are configured: one is active and the others are standby. Comma-separated, for example rm1,rm2.</description>
  </property>

  <!-- Endpoints of rm1. -->
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>bigdata-01</value>
    <description>Hostname of RM node 1.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8030</value>
    <description>Address the RM exposes to ApplicationMasters. AMs use it to request resources, release resources, and so on.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8031</value>
    <description>Address the RM exposes to NodeManagers. NMs use it to send heartbeats to the RM, fetch tasks, and so on.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8032</value>
    <description>Address the RM exposes to clients. Clients use it to submit applications to the RM, and so on.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8033</value>
    <description>Address the RM exposes to administrators. Administrators use it to send management commands to the RM, and so on.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8088</value>
    <description>Web HTTP address the RM exposes. Users can view cluster information in a browser at this address.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address.rm1</name>
    <value>${yarn.resourcemanager.hostname.rm1}:8090</value>
    <description>The https address of the RM web application.</description>
  </property>

  <!-- Endpoints of rm2: same property set and ports as rm1, on the second ResourceManager host. -->
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>bigdata-02</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address.rm2</name>
    <value>${yarn.resourcemanager.hostname.rm2}:8090</value>
  </property>

  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
    <description>Defaults to false, which means that if the ResourceManager goes down, the applications that were running cannot be restarted after the RM recovers.</description>
  </property>
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    <description>Class used for state storage.</description>
  </property>
  <!-- Referenced through variable expansion by yarn.resourcemanager.zk-address below; it must list the cluster's real ZooKeeper hosts, the same ensemble as ha.zookeeper.quorum in core-site.xml. -->
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>bigdata-01:2181,bigdata-02:2181,bigdata-03:2181</value>
  </property>
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>${ha.zookeeper.quorum}</value>
    <description>Addresses (host:port) of the ZooKeeper servers, used both for state storage and for the embedded leader election.</description>
  </property>
  <property>
    <name>yarn.nodemanager.address</name>
    <value>${yarn.nodemanager.hostname}:8041</value>
    <description>The address of the container manager in the NM.</description>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>106800</value>
    <description>How long aggregated logs are kept, in seconds (106800 seconds is roughly 29.7 hours).</description>
  </property>
</configuration>
[root@bigdata-01 soft]# vim $HADOOP_HOME/etc/hadoop/slaves
bigdata-01
bigdata-02
bigdata-03
|