MR 案例分析：求每个月温度最高的两天

最新推荐文章于 2022-12-04 17:02:17 发布

原创最新推荐文章于 2022-12-04 17:02:17 发布 · 817 阅读

2 ·

本内容遵循CC 4.0 BY-SA版权协议

bigdata 专栏收录该内容

19 篇文章

订阅专栏

本文介绍如何使用MapReduce处理气象数据，目标是找出每个月温度最高的两天。做法是自定义key对象HotDay和作业类HotTwoDay：按year分区，按year、month正序及温度倒序排序，再按year、month自定义分组，使同一年月内温度较高的记录排在前面、优先被处理。

最近看到一个MR（MapReduce）的例子，感觉有些收获，在这里分享一下。

需求：求每个月温度最高的两天。每条数据的格式为：1949-10-01 14:21:02 34c（日期时间与温度之间以制表符分隔）。

思路一：

      (1) map 阶段将 key 封装为 year-month 的格式，传递给 reduce；
      (2) reduce 中遍历每个 key 对应的所有值，取出温度进行比较，遍历完成之后输出温度最高的两天。

思路二：

    (1) 利用 key 的排序并配合自定义分组：按 year 分区，按 year、month 正序、温度倒序排序，并按 year、month 分组。这样同一年月下，温度高的排在前面。
    (2) 这样每个 reduce 分组处理时，第一条数据就符合要求；再取出 day 与第一条不同的下一条数据就可以了。

下面代码实现的是第二个思路：

自定义对象：

/**
 * Composite MapReduce key for one temperature reading: the calendar date
 * (year, month, day) plus the temperature ({@code wd}) recorded on that date.
 *
 * <p>Used by the job that finds, for every month, the two days with the
 * highest temperature.
 *
 * <p>Created 2019/10/23.
 *
 * @author fnb (nebofeng@gmail.com)
 */
public class HotDay  implements WritableComparable<HotDay> {
    private int year;
    private int month;
    private int day;
    /** Temperature of the reading. */
    private int wd;

    /** @return the year of the reading */
    public int getYear() {
        return year;
    }

    /** @param year the year of the reading */
    public void setYear(int year) {
        this.year = year;
    }

    /** @return the month of the reading */
    public int getMonth() {
        return month;
    }

    /** @param month the month of the reading */
    public void setMonth(int month) {
        this.month = month;
    }

    /** @return the day of month of the reading */
    public int getDay() {
        return day;
    }

    /** @param day the day of month of the reading */
    public void setDay(int day) {
        this.day = day;
    }

    /** @return the temperature of the reading */
    public int getWd() {
        return wd;
    }

    /** @param wd the temperature of the reading */
    public void setWd(int wd) {
        this.wd = wd;
    }

    /**
     * Natural order: ascending by year, then month, then day. The temperature
     * does not take part in this ordering.
     *
     * @param   o the object to be compared.
     * @return  a negative integer, zero, or a positive integer as this object
     *          is less than, equal to, or greater than the specified object.
     */
    @Override
    public int compareTo(HotDay o) {
        int byYear = Integer.compare(this.getYear(), o.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(this.getMonth(), o.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        return Integer.compare(this.getDay(), o.getDay());
    }

    /**
     * Serializes the four fields as ints in the order year, month, day, wd.
     * {@link #readFields(DataInput)} must read them back in the same order.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(month);
        out.writeInt(day);
        out.writeInt(wd);
    }

    /**
     * Restores the four fields from ints in the order year, month, day, wd,
     * mirroring {@link #write(DataOutput)}.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.month = in.readInt();
        this.day = in.readInt();
        this.wd = in.readInt();
    }
}

/**
 * @ author fnb
 * @ email nebofeng@gmail.com
 * @ date  2019/10/24
 * @ des : 找到一组格式为：1949-10-01 14:21:02	34c
 *  找出每个月温度最大的两天
 */
public class HotTwoDay {

/*
  自定义 mapper ： 利用key的排序， 实现自定义分组 。根据 ，year、month、温度 来分区排序。同样日期下，温度大的排前面。
        这样每个reduce处理的时候，第一个数据符合要求， 再取出 year、month 、day 与第一个不同的第二个数据就可以了
 */
    /**
     * Parses one input line of the form {@code 1949-10-01 14:21:02<TAB>34c}
     * into a {@link HotDay} key (year, month, day, temperature) and emits it
     * together with the temperature as a text value.
     *
     * <p>Lines that cannot be parsed (no tab-separated temperature field,
     * unparseable date, non-numeric temperature) are skipped and counted in
     * the {@code HotDay / MALFORMED_RECORDS} counter instead of failing the task.
     */
    static class  HotDayMapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, HotDay,Text>{

        private static final String COUNTER_GROUP = "HotDay";
        private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

        // Deliberately per-instance rather than static: SimpleDateFormat and
        // Calendar are not thread-safe.
        private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        private final Calendar calendar = Calendar.getInstance();

        // Output objects are reused across map() calls to avoid one allocation per record.
        private final HotDay outKey = new HotDay();
        private final Text outValue = new Text();

        @Override
        protected void map(LongWritable key, Text value,Context context)
                throws IOException, InterruptedException {

            //value:  1949-10-01 14:21:02	34c
            String[] fields = StringUtils.split(value.toString(), '\t');
            if (fields == null || fields.length < 2) {
                countMalformed(context);
                return;
            }
            String reading = fields[1].trim();
            if (reading.isEmpty()) {
                countMalformed(context);
                return;
            }

            Date date;
            int temperature;
            try {
                // The pattern only covers the date; the trailing time of day is ignored.
                date = dateFormat.parse(fields[0]);
                // Drop the trailing unit character ("34c" -> 34).
                temperature = Integer.parseInt(reading.substring(0, reading.length() - 1));
            } catch (ParseException | NumberFormatException e) {
                countMalformed(context);
                return;
            }

            calendar.setTime(date);
            outKey.setYear(calendar.get(Calendar.YEAR));
            // Calendar.MONTH is zero-based (0-11), so shift to the usual 1-12.
            outKey.setMonth(calendar.get(Calendar.MONTH) + 1);
            outKey.setDay(calendar.get(Calendar.DAY_OF_MONTH));
            outKey.setWd(temperature);
            outValue.set(temperature + "");
            context.write(outKey, outValue);
        }

        /** Records one skipped input line in the job counters. */
        private void countMalformed(Context context) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
        }
    }

  / *
    自定义reducer ，这里需要注意，如果温度最高的2个是同一天，取第1个温度就可以了
  */
    static  class  HotDayReducer extends Reducer<HotDay,Text,Text,Text>{

        Text rkey = new Text();
        Text rval = new Text();

        @Override
        protected void reduce(HotDay key, Iterable<Text> vals, Context context)
                throws IOException, InterruptedException {

            int flg=0;
            int day=0;
            for (Text v : vals) {
                if(flg==0){
                    day=key.getDay();
                    rkey.set(key.getYear()+"-"+key.getMonth()+"-"+key.getDay());
                    rval.set(key.getWd()+"");
                    context.write(rkey,rval );
                    flg++;
                }
                if(flg!=0 && day != key.getDay()){
                    rkey.set(key.getYear()+"-"+key.getMonth()+"-"+key.getDay());
                    rval.set(key.getWd()+"");
                    context.write(rkey,rval );
                    break;
                    /*
    reduce 总结： 这里我之前一直以为，reduce 是一个分组调用的， 每个分组的key 是唯一的。但是发现自己的理解还是比较浅显的。
总的来说就是 ReduceContextImpl  中有一个 迭代器模式的内部类 ValueIterator  ， reduce 中 当你遍历值的 时候，key也会变化， 
因为key只是传入的一 个地址，获取KV的迭代器的获取下一个KV值之后，把K值和V值放到之前传入我们自己写的Reduce类的方法
中那个输入参数的地址上，使得key也会变化。
   详细过程可以查看文末的参考博客。
                 */
                }
             }
        }
    }

   /*
 自定义分区
  */
    /**
     * Routes every record of the same year to the same partition, so all
     * months of a year are handled by one reducer.
     */
    static class HotDayPartitioner  extends Partitioner<HotDay, Text> {

        /**
         * @param key           map output key; only its year is used
         * @param value         map output value (unused)
         * @param numPartitions number of partitions
         * @return a partition index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(HotDay key, Text value, int numPartitions) {
            // Clear the sign bit first: a plain % would yield a negative
            // (illegal) partition index for a negative year.
            return (key.getYear() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    /**
     * Sort comparator for the shuffle: year ascending, then month ascending,
     * then temperature descending, so the hottest reading of a month comes
     * first. The day is not part of the ordering.
     */
    static  class HotDaySortComparator  extends WritableComparator {

        public  HotDaySortComparator(){
            // true: let WritableComparator create HotDay instances, so that
            // compare(WritableComparable, WritableComparable) receives
            // deserialized keys.
            super(HotDay.class,true);
        }

        /**
         * @param a left key, must be a {@link HotDay}
         * @param b right key, must be a {@link HotDay}
         * @return negative, zero or positive as {@code a} sorts before, equal
         *         to, or after {@code b}
         */
        @Override
        @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
        public int compare(WritableComparable a, WritableComparable b) {
            HotDay left = (HotDay) a;
            HotDay right = (HotDay) b;

            int byYear = Integer.compare(left.getYear(), right.getYear());
            if (byYear != 0) {
                return byYear;
            }
            int byMonth = Integer.compare(left.getMonth(), right.getMonth());
            if (byMonth != 0) {
                return byMonth;
            }
            // Operands swapped on purpose: higher temperature sorts first.
            return Integer.compare(right.getWd(), left.getWd());
        }
    }

    /**
     * Grouping comparator: keys with the same year and month are treated as
     * equal, so all readings of one month reach a single {@code reduce} call.
     * Day and temperature are ignored here; the order inside a group is
     * decided by the sort comparator.
     */
    static  class HotDayGroupingComparator  extends WritableComparator {

        public  HotDayGroupingComparator(){
            // true: let WritableComparator create HotDay instances, so that
            // compare(WritableComparable, WritableComparable) receives
            // deserialized keys.
            super(HotDay.class,true);
        }

        /**
         * @param a left key, must be a {@link HotDay}
         * @param b right key, must be a {@link HotDay}
         * @return zero when both keys share year and month; otherwise the
         *         ascending order by year, then month
         */
        @Override
        @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
        public int compare(WritableComparable a, WritableComparable b) {
            HotDay left = (HotDay) a;
            HotDay right = (HotDay) b;

            int byYear = Integer.compare(left.getYear(), right.getYear());
            if (byYear != 0) {
                return byYear;
            }
            return Integer.compare(left.getMonth(), right.getMonth());
        }

    }


    /**
     * Configures and runs the job: reads {@code /DATA/hotday.txt}, writes the
     * result to {@code /DATA/hotdayout} (an existing output directory is
     * deleted first) and wires in the custom partitioner, sort comparator and
     * grouping comparator.
     *
     * <p>If the job fails, the JVM exits with status 1 so that calling
     * scripts can detect the failure.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args)  throws  Exception{
        //1,conf
        Configuration conf = new Configuration(true);
        //2,job
        Job job=Job.getInstance(conf);
        job.setJarByClass(HotTwoDay.class);
        //3,input,output
        Path input =new Path("/DATA/hotday.txt");
        Path output = new Path("/DATA/hotdayout");
        FileInputFormat.addInputPath(job,input);
        // The job refuses to start if the output directory already exists.
        if(output.getFileSystem(conf).exists(output)){
            output.getFileSystem(conf).delete(output,true);
        }
        FileOutputFormat.setOutputPath(job, output );
        //4,map
        job.setMapperClass(HotDayMapper.class);
        job.setMapOutputKeyClass(HotDay.class);
        job.setMapOutputValueClass(Text.class);
        //5,reduce
        job.setReducerClass(HotDayReducer.class);
        // Declare the final output types to match what HotDayReducer emits.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //6,other:sort,part..,group...
        job.setPartitionerClass(HotDayPartitioner.class);
        job.setSortComparatorClass(HotDaySortComparator.class);
        job.setGroupingComparatorClass(HotDayGroupingComparator.class);
        //7,submit
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            // Propagate the failure instead of exiting with status 0.
            System.exit(1);
        }

    }
}

以上源码： github地址

参考博客：MapReduce中一次reduce方法的调用中key的值不断变化分析及源码解析