安装篇-Hadoop3.2.1集群安装分享

最新推荐文章于 2025-01-09 11:08:45 发布

原创最新推荐文章于 2025-01-09 11:08:45 发布 · 403 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#hadoop #大数据 #big data

1.预备知识

1.1 linux须知

linux微内核的特性，vmware安装linux
centos：稳定
linux常操作目录：/bin,/usr,/etc
xshell:使用xshell操作centos
linux记事本：vi/vim

1.2 常用命令

帮助：man
目录：mkdir，rmdir，mv,ls,rm -rf,cd
文件：touch/vi,cat,cp,rm,more，grep
搜索：which,whereis,find
时间：date ,date -s
用户和组管理：useradd…,groupadd…
进程：ps -ef，kill -9 进程id，pkill -P 父进程id 或者 pkill -f 进程名
网络：netstat -anp
磁盘:df
压缩和解压：zip，unzip，tar

tar -zcvf 压缩
tar -zxvf 解压

软件：yum
- yum list
- yum install
- yum remove
- rpm -ivh,evh:了解
上传、下载（lrzsz）：rz，sz
定时任务：crontab -e
- min,h,d,m,week
- crontab -l
- crontab -r:删除

1.3 shell脚本

变量：

x,$x
运算符：

$[3+6]
判断:

if [];then fi
循环：

for(()) 或者 for x in list

do

done

while [ ]

do

done
函数:

function fun(){} fun

2.hadoop之windows配置

解压hadoop压缩文件
指定HADOOP_HOME
指定path：/bin，/sbin
测试：

hadoop version

3.在linux上搭建hadoop集群

集群成员：

主机 hdfs yarn

master namenode ,secondarynamenode resourcemanager

slave1 datanode nodemanager

slave2 datanode nodemanager

3.1 安装jdk8，hadoop3.2.1

上传压缩文件并解压(/usr)
设置环境变量（/etc/profile）

export JAVA_HOME=/usr/jdk8
export HADOOP_HOME=/usr/hadoop321
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
激活配置文件

. /etc/profile
测试：

hadoop version

3.2 hdfs配置

core-site.xml

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>
hdfs-site.xml

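<configuration>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.http.address</name>
        <value>0.0.0.0:5700</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:///root/hadoop/dfs/namenode</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:///root/hadoop/dfs/datanode</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
</configuration>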
初始化namenode

hdfs namenode -format
start-dfs.sh,stop-dfs.sh

#设置用户
HDFS_NAMENODE_USER=root
HDFS_DATANODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
hadoop-env.sh

export JAVA_HOME=/usr/jdk8

3.3 集群成员配置

域名与ip绑定(/etc/hosts)

192.168.85.129 master
192.168.85.130 slave1
192.168.85.131 slave2
配置workers（工作节点）(/usr/hadoop321/etc/hadoop/workers)

slave1

slave2
修改副本数量(数据节点数量)(hdfs-site.xml)

<property>
    <name>dfs.replication</name>
    <value>2</value>
</property>

3.4 yarn配置

yarn-site.xml

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>master:8088</value>
    </property>
    <property>
        <name>yarn.application.classpath</name>
        <value>/usr/hadoop321/etc/hadoop:/usr/hadoop321/share/hadoop/common/lib/*:/usr/hadoop321/share/hadoop/common/*:/usr/hadoop321/share/hadoop/hdfs:/usr/hadoop321/share/hadoop/hdfs/lib/*:/usr/hadoop321/share/hadoop/hdfs/*:/usr/hadoop321/share/hadoop/mapreduce/lib/*:/usr/hadoop321/share/hadoop/mapreduce/*:/usr/hadoop321/share/hadoop/yarn:/usr/hadoop321/share/hadoop/yarn/lib/*:/usr/hadoop321/share/hadoop/yarn/*</value>
    </property>
</configuration>
mapred-site.xml

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
start-yarn.sh,stop-yarn.sh

YARN_RESOURCEMANAGER_USER=root
YARN_NODEMANAGER_USER=root

3.5 centos克隆

修改主机名

hostnamectl set-hostname 主机名
删除/tmp目录下的文件，使数据节点在浏览器端能看见(注意：防火墙关闭)

systemctl disable firewalld(开机不自启)

3.6 master免密登录slave

在root目录创建密钥：

ssh-keygen
authorized_keys拷贝到slave上

cat id_rsa.pub >> authorized_keys

scp 拷贝到slave的.ssh文件夹下

scp authorized_keys root@slave1:/root/.ssh

3.7 启动hadoop集群

在master上启动

start-all.sh
测试

jps
查看节点

hdfs dfsadmin -report

4. mapreduce实例

4.1 单词统计(入门)

/**
 * Word count for English text: splits every input line into
 * whitespace-separated words and sums the occurrences of each word.
 */
public class WordCounter {
    /**
     * Tokenizing mapper: emits {@code (word, 1)} for every word of a line.
     */
    public static class MyMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
        // Output key; refilled with set() for each word instead of allocating a new Text.
        public static Text text = new Text();
        // Constant output value: every occurrence counts as one.
        public static IntWritable intWritable = new IntWritable(1);

        /**
         * Splits one line into words and writes a count of one for each.
         *
         * @param key     input key (not used)
         * @param value   one line of input text
         * @param context sink for the {@code (word, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on runs of any whitespace. Splitting on a single " " produced empty
            // tokens for repeated or leading spaces and did not separate tab-delimited words.
            String[] words = value.toString().trim().split("\\s+");
            for (String word : words) {
                // A blank line still yields one empty token; never count it as a word.
                if (word.isEmpty()) {
                    continue;
                }
                text.set(word);
                context.write(text,intWritable);
            }
        }
    }

    /**
     * Summing reducer: adds up all counts received for one word.
     */
    public static class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        /**
         * @param key     the word
         * @param values  the partial counts emitted for that word
         * @param context sink for the {@code (word, total)} pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count+=value.get();
            }
            context.write(key,new IntWritable(count));
        }
    }

    /**
     * Configures and runs the job: reads from "data6", writes to "dTemp".
     * Any failure is printed to stderr.
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            // Job definition
            Job job = Job.getInstance(conf);
            job.setJobName("firstJob");
            job.setJarByClass(WordCounter.class);
            // Mapper and reducer
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // Output record types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Input and output directories
            FileInputFormat.setInputPaths(job,"data6");
            FileOutputFormat.setOutputPath(job,new Path("dTemp"));
            // Run to completion, then release the job
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

4.2 中文分词统计（ik）

/**
 * Word count for Chinese text: segments each line with IKSegmenter,
 * sums the occurrences of every term and writes them sorted in descending order.
 */
public class CNWordCounter {
    /**
     * Segmenting mapper: emits {@code (term, 1)} for every lexeme IKSegmenter returns.
     */
    public static class MyMapper extends Mapper<LongWritable, Text,Text, IntWritable>{

        // Output key; refilled with set() for each term.
        public static Text text = new Text();
        // Constant output value: every occurrence counts as one.
        public static IntWritable intWritable = new IntWritable(1);

        /**
         * @param key     input key (not used)
         * @param value   one line of input text
         * @param context sink for the {@code (term, 1)} pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Feed the decoded characters straight to the segmenter. The previous
            // String -> byte[] -> InputStreamReader round trip used the platform default
            // charset, which turns Chinese characters into '?' on nodes whose default
            // charset cannot represent them.
            java.io.Reader lineReader = new java.io.StringReader(value.toString());
            IKSegmenter ikSegmenter = new IKSegmenter(lineReader, true);

            Lexeme lexeme=null;
            while ((lexeme=ikSegmenter.next())!=null){
                String word = lexeme.getLexemeText();
                text.set(word);
                context.write(text,intWritable);
            }
        }
    }

    /**
     * Summing reducer that buffers every (term, total) pair and writes them,
     * sorted, only in {@link #cleanup}.
     */
    public static class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

        // Output key; refilled with set() for each record written in cleanup().
        public static Text text = new Text();
        // All totals seen by this reducer; held in memory until cleanup().
        public static List<Record> list =new ArrayList<Record>();

        /**
         * Sums the counts of one term and buffers the result instead of writing it.
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count=0;
            for (IntWritable value : values) {
                count+=value.get();
            }
            Record record = new Record(key.toString(), count);
            list.add(record);
        }

        /**
         * Sorts the buffered records by Record's natural order, reverses them,
         * and writes them out.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            Collections.sort(list);
            Collections.reverse(list);
            for (Record record : list) {
                text.set(record.getWord());
                context.write(text,new IntWritable(record.getCount()));
            }
        }
    }


    /**
     * Configures and runs the job: reads "/test99/data2", writes "/test99/out".
     * Any failure is printed to stderr.
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf);
            job.setJobName("secondJob");
            job.setJarByClass(CNWordCounter.class);
            // Mapper and reducer
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // Output record types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Input and output directories
            FileInputFormat.setInputPaths(job,"/test99/data2");
            FileOutputFormat.setOutputPath(job,new Path("/test99/out"));
            // Run to completion, then release the job
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

    }
}

4.3 数据清洗（去重、去空、去非）

/**
 * Driver for the data-cleaning examples (blank, duplicate and illegal records).
 * As configured here it runs RemoveReplyMapper, i.e. the de-duplication step.
 */
public class DataClear {

    /**
     * Builds the "clearJob" job (input "data4", output "out"), runs it and
     * prints any failure to stderr.
     */
    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            Job cleanJob = Job.getInstance(configuration);
            cleanJob.setJarByClass(DataClear.class);
            cleanJob.setJobName("clearJob");

            // Where the records come from and where the cleaned ones go.
            FileInputFormat.setInputPaths(cleanJob, "data4");
            FileOutputFormat.setOutputPath(cleanJob, new Path("out"));

            // The cleaning mapper and the record types it produces.
            cleanJob.setMapperClass(RemoveReplyMapper.class);
            cleanJob.setOutputKeyClass(Text.class);
            cleanJob.setOutputValueClass(NullWritable.class);

            cleanJob.waitForCompletion(true);
            cleanJob.close();
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

/**
 * Mapper that drops records containing a blank field and passes all other
 * lines through unchanged.
 */
class RemoveNullMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
    /**
     * Writes the line only when none of its fields is blank.
     *
     * @param key     input key (not used)
     * @param value   one input record
     * @param context sink for the surviving records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        if (!isEmpty(value.toString())){
            context.write(value,NullWritable.get());
        }
    }

    /**
     * Reports whether any field of the record is blank.
     *
     * @param v the raw record, fields separated by two spaces
     * @return true if at least one field is empty or whitespace-only
     */
    private boolean isEmpty(String v) {
        // Limit -1 keeps trailing empty strings. Without it String.split silently
        // drops them, so a record whose last column is empty was accepted.
        String[] split = v.split("  ", -1);
        for (String field : split) {
            if (field.trim().isEmpty()){
                return true;
            }
        }
        return false;
    }
}

/**
 * De-duplicating mapper: remembers every name already seen in a set and
 * forwards only the first record carrying each name.
 */
class RemoveReplyMapper extends Mapper<LongWritable,Text,Text,NullWritable>{

    // Names seen so far. Static, so the set is per JVM rather than per job.
    public static Set<String> names = new HashSet<>();

    /**
     * Forwards the record unless its name field has been seen before.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        if (isDuplicate(value.toString())) {
            return;
        }
        context.write(value, NullWritable.get());
    }

    /**
     * Registers the record's name (first field, fields separated by two spaces).
     *
     * @return true when that name was already registered
     */
    private boolean isDuplicate(String line) {
        String name = line.split("  ")[0];
        // Set.add returns false when the element was already present.
        return !names.add(name);
    }
}

/**
 * Mapper that drops records containing an illegal score and passes all other
 * lines through unchanged.
 */
class RemoveIllegalMapper extends Mapper<LongWritable,Text,Text,NullWritable>{

    /**
     * Writes the line only when every score in it is legal.
     *
     * @param key     input key (not used)
     * @param value   one input record
     * @param context sink for the surviving records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String v = value.toString();
        boolean flag = isIllegal(v);
        // Only legal records are written
        if (!flag){
            context.write(value,NullWritable.get());
        }
    }

    /**
     * Reports whether any score field is illegal. Field 0 is skipped; every
     * later whitespace-separated field is treated as a score.
     *
     * @param v the raw record
     * @return true if a score is not an integer or lies outside 0..100
     */
    private boolean isIllegal(String v) {
        String[] split = v.split("\\s+");
        for (int i = 1; i < split.length; i++) {
            int score;
            try {
                score = Integer.parseInt(split[i]);
            } catch (NumberFormatException notANumber) {
                // A score that is not an integer is dirty data too: filter the record
                // instead of letting the exception fail the whole task.
                return true;
            }
            if (score>100 || score<0){
                return true;
            }
        }
        return false;
    }
}

4.4 序列化和反序列化（writable）

/**
 * Writable carrying a phone bill: base charge and data (flow) charge.
 * Only {@code base} and {@code flow} are serialized; {@code num} is a plain
 * in-memory field that write/readFields do not touch.
 */
public class PhoneWritable implements Writable {

    private String num;
    private Double base;
    private Double flow;

    /** No-argument constructor; leaves every field null. */
    public PhoneWritable() {
    }

    /**
     * @param base base charge
     * @param flow data (flow) charge
     */
    public PhoneWritable(Double base, Double flow) {
        this.base = base;
        this.flow = flow;
    }

    public String getNum() {
        return this.num;
    }

    public void setNum(String num) {
        this.num = num;
    }

    public Double getBase() {
        return this.base;
    }

    public void setBase(Double base) {
        this.base = base;
    }

    public Double getFlow() {
        return this.flow;
    }

    public void setFlow(Double flow) {
        this.flow = flow;
    }

    /** Serializes base then flow as doubles; both must be non-null. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeDouble(this.base.doubleValue());
        out.writeDouble(this.flow.doubleValue());
    }

    /** Restores base then flow, in the order write() emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.base = Double.valueOf(in.readDouble());
        this.flow = Double.valueOf(in.readDouble());
    }

    /** Text form listing base and flow (num is not included). */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("PhoneWritable{");
        sb.append("base=").append(this.base);
        sb.append(", flow=").append(this.flow);
        sb.append('}');
        return sb.toString();
    }
}

4.5 数据排序（WritableComparable）

/**
 * Sortable (key, value) record. Orders by {@code value} in descending order.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class SortRecord implements WritableComparable<SortRecord> {

    private String key;
    private Integer value;

    /** Text form: key, two spaces, value. */
    @Override
    public String toString() {
        return key+"  "+value;
    }


    /**
     * Descending order by value.
     *
     * @param o the record to compare with
     * @return negative when this record's value is larger, positive when smaller, 0 when equal
     */
    @Override
    public int compareTo(SortRecord o) {
        // Integer.compare instead of subtraction: o.value - this.value overflows for
        // operands of opposite sign and large magnitude and then reports the wrong order.
        return Integer.compare(o.getValue(), this.getValue());
    }

    /** Serializes key (UTF) then value (int). */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(key);
        out.writeInt(value);
    }

    /** Restores key then value, in the order write() emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.key=in.readUTF();
        this.value=in.readInt();
    }
}

4.6 数据压缩（map，reduce）

// Map-side compression: turn on compression of the map output and select BZip2 as its codec.
// NOTE(review): these are written into conf, so presumably they must run before the Job is created from conf — confirm placement.
conf.setBoolean("mapreduce.map.output.compress",true);
conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
// Reduce-side compression: compress the job's final output files with BZip2.
FileOutputFormat.setCompressOutput(job,true);
FileOutputFormat.setOutputCompressorClass(job,BZip2Codec.class);

4.7 连接查询（mapJoin,reduceJoin）

reduce端连接

/**
 * Reduce-side join mapper. Tags every line from either input file with its
 * product id so that orders and the matching product meet in one reduce call.
 */
class JoinMapper extends Mapper<LongWritable, Text,Text,Record>{

    // Output value, refilled for every line.
    Record record=new Record();
    // Output key (the product id), refilled for every line.
    Text text =new Text();

    /**
     * Files whose name starts with "order" are parsed as
     * {@code orderId productId amount}; any other file as {@code productId productName}.
     * Fields that do not exist for a record kind are filled with "" or 0.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
        String[] fields = value.toString().split("\\s+");

        final String joinKey;
        if (fileName.startsWith("order")) {
            // Order line: the product id is the second column.
            joinKey = fields[1];
            record.setOrderid(fields[0]);
            record.setPid(joinKey);
            record.setNum(Integer.parseInt(fields[2]));
            record.setPname("");
        } else {
            // Product line: the product id is the first column.
            joinKey = fields[0];
            record.setOrderid("");
            record.setPid(joinKey);
            record.setPname(fields[1]);
            record.setNum(0);
        }

        text.set(joinKey);
        context.write(text, record);
    }
}

/**
 * Reduce-side join reducer. For one product id it receives that product's
 * record plus all orders referring to it, and writes one
 * {@code orderId productName amount} line per order.
 */
class JoinReducer extends Reducer<Text,Record,Text, NullWritable>{

    // Output key, refilled for every joined line.
    Text text=new Text();

    /**
     * @param key     the product id shared by all values
     * @param values  the order records (empty product name) and the product record
     * @param context sink for the joined lines
     */
    @Override
    protected void reduce(Text key, Iterable<Record> values, Context context) throws IOException, InterruptedException {
        List<Record> orders =new ArrayList<>();
        // Stays null when no product record arrives; the output then contains "null".
        String productName = null;
        for (Record record : values) {
            if (StringUtils.isEmpty(record.getPname())){
                // Order record: buffer a copy, since the same Record instance may be
                // handed out again for the next element. The copy is explicit rather than
                // reflective: the earlier BeanUtils call swallowed copy failures and then
                // buffered a blank record, producing "null ... null" lines.
                Record order = new Record();
                order.setOrderid(record.getOrderid());
                order.setPid(record.getPid());
                order.setPname(record.getPname());
                order.setNum(record.getNum());
                orders.add(order);
            }else {
                productName = record.getPname();
            }
        }
        for (Record order : orders) {
            String res =order.getOrderid()+" "+productName+" "+order.getNum();
            text.set(res);
            context.write(text,NullWritable.get());
        }
    }
}
/**
 * Driver for the reduce-side join example.
 */
public class ReduceJoin {
    /**
     * Builds the "reduceJoin" job (input "data", output "out"), runs it and
     * prints any failure to stderr.
     */
    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            Job joinJob = Job.getInstance(configuration, "reduceJoin");
            joinJob.setJarByClass(ReduceJoin.class);

            // Map side: product id -> tagged record.
            joinJob.setMapperClass(JoinMapper.class);
            joinJob.setMapOutputKeyClass(Text.class);
            joinJob.setMapOutputValueClass(Record.class);

            // Reduce side: joined line with no value.
            joinJob.setReducerClass(JoinReducer.class);
            joinJob.setOutputKeyClass(Text.class);
            joinJob.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(joinJob, "data");
            FileOutputFormat.setOutputPath(joinJob, new Path("out"));

            joinJob.waitForCompletion(true);
            joinJob.close();
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }
}

map端连接

自定义序列化类

/**
 * Join record holding the columns of both inputs: order id, product id,
 * product name and ordered amount.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Record implements Writable {
    private String orderid;
    private String pid;
    private String pname;
    private Integer num;

    /** Serializes orderid, pid, pname (UTF) and num (int), in that order; all must be non-null. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderid);
        out.writeUTF(this.pid);
        out.writeUTF(this.pname);
        out.writeInt(this.num);
    }

    /** Restores the four fields in the order write() emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderid = in.readUTF();
        this.pid = in.readUTF();
        this.pname = in.readUTF();
        this.num = in.readInt();
    }
}

map任务

/**
 * Map-side join mapper. Loads the product table into memory once and joins
 * every order line against it, so no reduce step is needed for the join.
 */
class MyMapper extends Mapper<LongWritable, Text,Text, NullWritable>{
    // Product lookup table: product id -> product name.
    Map<String,String> map =new HashMap<>();

    /**
     * Reads "data/pd.txt" (one {@code productId productName} pair per line)
     * into the lookup table.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // try-with-resources: the reader and the underlying file stream were never
        // closed before, leaking a file handle per mapper.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("data/pd.txt")))) {
            String str;
            while ((str=reader.readLine())!=null){
                String[] split = str.split("\\s+");
                map.put(split[0],split[1]);
            }
        }
    }

    /**
     * For lines of files whose name starts with "order"
     * ({@code orderId productId amount}) writes {@code orderId productName amount};
     * lines of any other file are ignored.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        FileSplit fileSplit= (FileSplit) inputSplit;
        String name = fileSplit.getPath().getName();
        if (name.startsWith("order")){
            String[] split = value.toString().split("\\s+");
            // An unknown product id yields null from the table and therefore "null" in the output.
            String res =split[0]+" "+map.get(split[1])+" "+split[2];
            context.write(new Text(res),NullWritable.get());
        }
    }
}

/**
 * Driver for the map-side join example.
 */
public class MapJoin {
    /**
     * Builds the "mapJoin" job (input "data", output "out"), runs it and
     * prints any failure to stderr.
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "mapJoin");
            // Identify the job jar, as the other drivers in this document do. Without it
            // the mapper class cannot be located when the job is submitted to a cluster.
            job.setJarByClass(MapJoin.class);
            job.setMapperClass(MyMapper.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job,"data");
            FileOutputFormat.setOutputPath(job,new Path("out"));
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

4.8 自定义分区（partitioner）

自定义分区类

/**
 * Custom partitioner that routes records by their key: "01", "02" and "03"
 * each get their own partition; every other key shares the partition of "01".
 */
class MyPartition extends Partitioner<Text,Record>{

    /**
     * @param text   the record key
     * @param record the record value (not used)
     * @param i      the number of partitions, i.e. reduce tasks
     * @return a partition index in the range {@code 0 .. i-1}
     */
    @Override
    public int getPartition(Text text, Record record, int i) {
        // Partition indexes are zero-based. The previous 1/2/3 mapping left partition 0
        // unused and returned 3 for key "03", which is out of range with three reduce tasks.
        int partition;
        switch (text.toString()){
            case "02":
                partition = 1;
                break;
            case "03":
                partition = 2;
                break;
            case "01":
            default:
                partition = 0;
                break;
        }
        // Stay in range even when the job runs with fewer than three reduce tasks.
        return partition % i;
    }
}

job中指定

// Route map output through MyPartition and run three reduce tasks, one per partition.
// The partitioner must only return indexes in 0 .. (number of reduce tasks - 1).
job.setPartitionerClass(MyPartition.class);
job.setNumReduceTasks(3);

5. hadoop优化

选用高性能机器
map之前预处理:小文件合并成大文件
map阶段：combine，局部汇总
reduce阶段:设置reduce buff参数
数据倾斜
- 自定义分区
- mapJoin

6.zookeeper的使用

下载文件解压(/usr)
配置环境变量（/etc/profile）

export ZK_HOME=/usr/zk
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$ZK_HOME/bin
配置运行参数(/usr/zk/conf/zoo.cfg)

dataDir=/root/zk/data
dataLogDir=/root/zk/log
启动zk服务端

zkServer.sh start
打开zk客户端

zkCli.sh
关闭zk服务端

zkServer.sh stop

7.mapreduce进阶案例

7.1 多mr顺序执行

// Runs two MapReduce jobs back to back: Mr1 first, then Mr2.
// NOTE(review): presumably each exec method blocks until its job has finished, so that
// Mr2 can read what Mr1 wrote — confirm against Mr1/Mr2.
public static void main(String[] args) {
    Mr1.execMr1();
    Mr2.execMr2();
}

7.2 mapreduce的链式执行

// Map chain: MyMapper1 -> MyMapper2 run in the map phase; each mapper's output
// types are the next one's input types.
ChainMapper.addMapper(job,MyMapper1.class,LongWritable.class,Text.class,Text.class,IntWritable.class,cfg);
ChainMapper.addMapper(job,MyMapper2.class,Text.class,IntWritable.class,Text.class,IntWritable.class,cfg);
// The single reducer of the chain.
ChainReducer.setReducer(job,MyReducer.class,Text.class,IntWritable.class,Text.class,IntWritable.class,cfg);
// MyMapper3 post-processes the reducer output, so it must be registered through
// ChainReducer. ChainMapper.addMapper would append it to the map-phase chain instead.
ChainReducer.addMapper(job,MyMapper3.class,Text.class,IntWritable.class,Text.class,IntWritable.class,cfg);

7.3 气温指数分析

自定义天气类

/**
 * One weather reading: year, month, day and temperature (wd).
 * Usable as a MapReduce key; see {@link #compareTo} for the sort order.
 */
@Data@NoArgsConstructor@AllArgsConstructor
public class TianQi implements WritableComparable<TianQi> {
    private Integer year;
    private Integer month;
    private Integer day;
    private Integer wd;

    /** Tab-separated year, month, day and temperature, the latter suffixed with "c". */
    @Override
    public String toString() {
        StringBuilder line = new StringBuilder();
        line.append(year).append("\t");
        line.append(month).append("\t");
        line.append(day).append("\t");
        line.append(wd).append("c");
        return line.toString();
    }

    /**
     * Orders by year ascending, then month ascending, then temperature
     * descending, then day ascending.
     */
    @Override
    public int compareTo(TianQi o) {
        int byYear = Integer.compare(this.getYear(), o.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(this.getMonth(), o.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        // Arguments swapped on purpose: hotter readings sort first.
        int byTemperatureDesc = Integer.compare(o.getWd(), this.getWd());
        if (byTemperatureDesc != 0) {
            return byTemperatureDesc;
        }
        return Integer.compare(this.getDay(), o.getDay());
    }

    /** Serializes year, month, day and temperature as ints, in that order. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.year);
        out.writeInt(this.month);
        out.writeInt(this.day);
        out.writeInt(this.wd);
    }

    /** Restores the four fields in the order write() emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.month = in.readInt();
        this.day = in.readInt();
        this.wd = in.readInt();
    }
}

自定义分组类

/**
 * Grouping comparator: two TianQi keys belong to the same reduce group when
 * their year and month are equal.
 */
public class TianQiGroupComparator extends WritableComparator {
    /** Registers TianQi as the key class and asks the base class to create key instances. */
    public TianQiGroupComparator() {
        super(TianQi.class,true);
    }

    /**
     * Compares by year, then by month; day and temperature are ignored.
     *
     * @return 0 when both keys fall in the same year and month
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TianQi aa= (TianQi) a;
        TianQi bb = (TianQi) b;
        // Integer.compare instead of subtraction: consistent with TianQi.compareTo
        // and free of integer overflow.
        int y = Integer.compare(aa.getYear(), bb.getYear());
        if (y==0){
            return Integer.compare(aa.getMonth(), bb.getMonth());
        }
        return y;
    }
}

编写mr程序

/**
 * Finds, for every month, the two days with the highest temperature.
 */
public class TianQiClient {
    /**
     * Parses one reading per line ({@code yyyy-MM-dd HH:mm:ss <temperature><unit char>})
     * into a TianQi key.
     */
    public static class TianQiMapper extends Mapper<LongWritable, Text,TianQi, NullWritable>{
        // Built once per mapper instance instead of once per record. Kept as an
        // instance field, not static, because SimpleDateFormat is not thread-safe.
        private final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**
         * Emits a TianQi key for the line. A line whose timestamp cannot be
         * parsed is reported on stderr and skipped.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\\s+");
            String time =split[0]+" "+split[1];
            // Drop the trailing unit character of the temperature column.
            int wd = Integer.parseInt(split[2].substring(0, split[2].length()-1));
            try {
                Date date = simpleDateFormat.parse(time);
                Calendar calendar = Calendar.getInstance();
                calendar.setTime(date);
                int year =calendar.get(Calendar.YEAR);
                // Calendar.MONTH is zero-based.
                int month =calendar.get(Calendar.MONTH)+1;
                int day =calendar.get(Calendar.DAY_OF_MONTH);
                TianQi tianQi = new TianQi(year, month, day, wd);
                context.write(tianQi,NullWritable.get());
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Partitions by year, so all readings of one year reach the same reducer.
     */
    public static class TianQiPartitioner extends Partitioner<TianQi, NullWritable>{

        @Override
        public int getPartition(TianQi tianQi, NullWritable nullWritable, int numPartitions) {
            // Masking with MAX_VALUE clears the sign bit, so the result is never negative.
            return (tianQi.getYear()&Integer.MAX_VALUE)%numPartitions;
        }
    }

    /**
     * Writes the two hottest days of each month. One reduce call covers one
     * (year, month) group, whose keys arrive ordered by temperature descending.
     */
    public static class TianQiReducer extends Reducer<TianQi,NullWritable,TianQi,NullWritable>{
        /**
         * The loop relies on {@code key} reflecting the current record while the
         * values are iterated: the first record is the hottest, and the next
         * record with a different day is the second hottest day.
         */
        @Override
        protected void reduce(TianQi key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            int flag = 0;
            int day = 0;
            for (NullWritable nullWritable : values) {
                // First record: the highest temperature of the month
                if(flag == 0){
                    context.write(key, NullWritable.get());
                    flag ++;
                    // Remember its day so that later readings of the same day are skipped
                    day = key.getDay();
                }
                // First record from another day: the second-highest day
                if(key.getDay() != day){
                    context.write(key, NullWritable.get());
                    break;
                }
            }
        }
    }


    /**
     * Builds the "tianqi" job (input "data2", output "out", three reducers),
     * runs it and prints any failure to stderr.
     */
    public static void main(String[] args) {
        Configuration cfg = new Configuration();
        try {
            Job job = Job.getInstance(cfg,"tianqi");
            // Identify the job jar, as the earlier drivers in this document do. Without it
            // the nested mapper/reducer classes cannot be located on a cluster.
            job.setJarByClass(TianQiClient.class);
            job.setMapperClass(TianQiMapper.class);
            job.setReducerClass(TianQiReducer.class);

            job.setMapOutputKeyClass(TianQi.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(TianQi.class);
            job.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(job,"data2");
            FileOutputFormat.setOutputPath(job,new Path("out"));

            job.setPartitionerClass(TianQiPartitioner.class);
            job.setNumReduceTasks(3);
            job.setGroupingComparatorClass(TianQiGroupComparator.class);

            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

7.4 好友推荐

/**
 * Friend recommendation: finds pairs of people who are not friends yet but
 * share common friends, and counts how many friends they share.
 */
public class FriendClient {
    /**
     * Reads lines of the form {@code person:friend1 friend2 ...} and emits
     * every (person, friend) pair with 0 (direct friends) and every
     * (friend, friend) pair with 1 (indirect friends via this person).
     */
    public static class FriendMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            // A line without a friend list has nothing to contribute; indexing
            // split[1] on it used to throw.
            if (split.length < 2) {
                return;
            }
            String left =split[0];
            // Trim first: whitespace right after the ':' otherwise makes split("\\s+")
            // return a leading empty string, i.e. a "friend" with an empty name.
            String friendList = split[1].trim();
            if (friendList.isEmpty()) {
                return;
            }
            String[] rights = friendList.split("\\s+");
            for (int i = 0; i < rights.length; i++) {
                // Direct friends
                context.write(new Text(unit(left,rights[i])),new IntWritable(0));
                for (int j = i+1; j < rights.length; j++) {
                    // Indirect friends: both are friends of 'left'
                    context.write(new Text(unit(rights[i],rights[j])),new IntWritable(1));
                }
            }
        }
    }

    /**
     * Writes (pair, number of common friends) for pairs that are never
     * reported as direct friends.
     */
    public static class FriendReducer extends Reducer<Text, IntWritable,Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                // Already direct friends: nothing to recommend, drop the pair
                if (value.get() == 0) {
                    return;
                }
                count++;
            }
            context.write(key, new IntWritable(count));
        }
    }

    /**
     * Builds an order-independent key for a pair: the lexicographically
     * larger name comes first, so (a, b) and (b, a) map to the same key.
     */
    private static String unit(String left, String right) {
        return left.compareTo(right)>0?left+":"+right:right+":"+left;
    }


    /**
     * Builds the job (input "data4", output "out"), removing an existing
     * output directory first, runs it and prints any failure to stderr.
     */
    public static void main(String[] args) {
        Configuration cfg = new Configuration();
        try {
            Job job = Job.getInstance(cfg,"fried");
            // Identify the job jar, as the earlier drivers in this document do. Without it
            // the nested mapper/reducer classes cannot be located on a cluster.
            job.setJarByClass(FriendClient.class);
            job.setMapperClass(FriendMapper.class);
            job.setReducerClass(FriendReducer.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            FileInputFormat.setInputPaths(job,"data4");
            Path out = new Path("out");
            // The job refuses to start when the output directory already exists.
            FileSystem fs = out.getFileSystem(cfg);
            if (fs.exists(out)){
                fs.delete(out,true);
            }
            FileOutputFormat.setOutputPath(job,out);
            job.waitForCompletion(true);
            job.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

8 部署高可用Hadoop集群

集群成员：

主机 hdfs yarn

master namenode resourcemanager

slave1 namenode,datanode resourcemanager，nodemanager

slave2 datanode nodemanager

8.1 在master上安装zk

8.2 修改core-site.xml

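<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://cluster</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/root/hadoop/tmp</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>master:2181</value>
    </property>
</configuration>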

8.3 修改hdfs-site.xml

<configuration>
<!--指定hdfs的nameservice为cluster，需要和core-site.xml中的保持一致 -->
<property>
      <name>dfs.nameservices</name>
      <value>cluster</value>
</property>
<!-- cluster下面有两个NameNode，分别是master，slave1 -->
<property>
      <name>dfs.ha.namenodes.cluster</name>
      <value>master,slave1</value>
</property>
<!-- master的RPC通信地址 -->
<property>
      <name>dfs.namenode.rpc-address.cluster.master</name>
      <value>master:9000</value>
</property>

<!-- master的http通信地址 -->
<property>
      <name>dfs.namenode.http-address.cluster.master</name>
      <value>master:50070</value>
</property>

<!-- slave1的RPC通信地址 -->
<property>
      <name>dfs.namenode.rpc-address.cluster.slave1</name>
      <value>slave1:9000</value>
</property>

<!-- slave1的http通信地址 -->
<property>
      <name>dfs.namenode.http-address.cluster.slave1</name>
      <value>slave1:50070</value>
</property>

<!-- 指定NameNode的元数据在JournalNode上的存放位置 -->
<property>
      <name>dfs.namenode.shared.edits.dir</name>
      <value>qjournal://master:8485;slave1:8485;slave2:8485/cluster</value>
</property>

<!-- 指定JournalNode在本地磁盘存放数据的位置 -->
<property>
      <name>dfs.journalnode.edits.dir</name>
      <value>/root/hadoop/journalData</value>
</property>

<!-- 开启NameNode失败自动切换 -->
<property>
      <name>dfs.ha.automatic-failover.enabled</name>
      <value>true</value>
</property>

<!-- Failover proxy provider that HDFS clients use to find the active NameNode.
     The suffix of the property name must be the nameservice ID declared in
     dfs.nameservices ("cluster"); the former suffix "myNameService1" matched no
     nameservice in this file. -->
<property>
      <name>dfs.client.failover.proxy.provider.cluster</name>
      <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>

<!-- 配置隔离机制方法，Failover后防止停掉的Namenode启动，造成两个服务,多个机制用换行分割，即每个机制暂用一行-->
<property>
      <name>dfs.ha.fencing.methods</name>
      <value>
              sshfence
              shell(/bin/true)
      </value>
</property>

<!-- 使用sshfence隔离机制时需要ssh免登陆，注意换成自己的用户名 -->
<property>
      <name>dfs.ha.fencing.ssh.private-key-files</name>
      <value>/root/.ssh/id_rsa</value>
</property>

<!-- 配置sshfence隔离机制超时时间 -->
<property>
      <name>dfs.ha.fencing.ssh.connect-timeout</name>
      <value>30000</value>
</property>
<property>
      <name>dfs.replication</name>
      <value>2</value>
</property>
<property>
      <name>dfs.namenode.name.dir</name>
      <value>/root/hadoop/dfs/namenode</value>
</property>
<property>
      <name>dfs.datanode.data.dir</name>
      <value>/root/hadoop/dfs/datanode</value>
</property>
<property>
       <name>dfs.webhdfs.enabled</name>
       <value>true</value>
</property>
<property>
      <name>dfs.permissions</name>
      <value>false</value>
</property>
<property> 
       <name>dfs.client.failover.proxy.provider.cluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
</configuration>

8.4 修改mapred-site.xml

<configuration>
        <!-- 指定mr框架为yarn方式 -->
        <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
        </property>
        <!-- 配置 MapReduce JobHistory Server 地址 ，默认端口10020 -->
        <property>
                <name>mapreduce.jobhistory.address</name>
                <value>master:10020</value>
        </property>
        <!-- 配置 MapReduce JobHistory Server web ui 地址， 默认端口19888 -->
        <property>
                <name>mapreduce.jobhistory.webapp.address</name>
                <value>master:19888</value>
        </property>
        <property>
                <name>mapreduce.application.classpath</name>             			<value>/usr/hadoop321/share/hadoop/mapreduce/*,/usr/hadoop321/share/hadoop/mapreduce/lib/*</value>
        </property>
</configuration>

8.5 修改yarn-site.xml

<configuration>
        <!-- 开启RM高可用 -->
        <property>
                <name>yarn.resourcemanager.ha.enabled</name>
                <value>true</value>
        </property>
        <!-- 指定RM的cluster id -->
        <property>
                <name>yarn.resourcemanager.cluster-id</name>
                <value>yrc</value>
        </property>
        <!-- 指定RM的名字 -->
        <property>
                <name>yarn.resourcemanager.ha.rm-ids</name>
                <value>rm1,rm2</value>
        </property>
        <!-- 分别指定RM的地址 -->
        <property>
                <name>yarn.resourcemanager.hostname.rm1</name>
                <value>master</value>
        </property>
        <property>
                <name>yarn.resourcemanager.hostname.rm2</name>
                <value>slave1</value>
        </property>
		<!-- RM对外暴露的web http地址，用户可通过该地址在浏览器中查看集群信息 -->
        <property>
                <name>yarn.resourcemanager.webapp.address.rm1</name>
                <value>master:8088</value>
        </property>
        <property>
                <name>yarn.resourcemanager.webapp.address.rm2</name>
                <value>slave1:8088</value>
        </property>
        <!-- 指定zookeeper集群地址 -->
        <property>
                <name>yarn.resourcemanager.zk-address</name>
                <value>master:2181</value>
        </property>
        <property>
                <name>yarn.nodemanager.aux-services</name>
                <value>mapreduce_shuffle</value>
        </property>
        <property>
                <name>yarn.application.classpath</name>             	<value>/usr/hadoop321/etc/hadoop:/usr/hadoop321/share/hadoop/common/lib/*:/usr/hadoop321/share/hadoop/common/*:/usr/hadoop321/share/hadoop/hdfs:/usr/hadoop321/share/hadoop/hdfs/lib/*:/usr/hadoop321/share/hadoop/hdfs/*:/usr/hadoop321/share/hadoop/mapreduce/lib/*:/usr/hadoop321/share/hadoop/mapreduce/*:/usr/hadoop321/share/hadoop/yarn:/usr/hadoop321/share/hadoop/yarn/lib/*:/usr/hadoop321/share/hadoop/yarn/*</value>
        </property>
</configuration>

8.6 修改hadoop-env.sh

export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_ZKFC_USER=root
export HDFS_JOURNALNODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

8.7 设置多机互相免密登录

在各机器上生成密钥

ssh-keygen
都把公钥放到authorized_keys文件中

cat id_rsa.pub>>authorized_keys

cat id_rsa.pub.s1>>authorized_keys

cat id_rsa.pub.s2>>authorized_keys
发送到每台机器上

scp authorized_keys root@slave1:/root/.ssh

scp authorized_keys root@slave2:/root/.ssh

8.8 启动设置

三台机上都启动journalnode

hdfs --daemon start journalnode
在master上：

hdfs namenode -format #格式化namenode
zkServer.sh start #启动zk
hdfs zkfc -formatZK #格式化zk
scp -r /root/hadoop root@slave1:/root #同步两个namenode
start-all.sh #启动服务
jps查看进程，浏览器访问namenode