MapReduce2中自定义排序分组

username2

浏览: 722434 次
性别:
来自: 黑龙江

最近访客更多访客>>

dsh_oliver

杭州007

loginboot

xmmdream

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Hadoop学习笔记

1 Map 、Reduce和主类

package com.wzt.mapreduce.secondsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.wzt.mapreduce.wordcount.WCRunner;

/**
 * Driver for the secondary-sort example job.
 *
 * <p>Wires together the mapper, the reducer, the custom partitioner, the
 * sort comparator and the grouping comparator, then submits the job and
 * waits for it to finish.
 */
public class SecSortMain {

	/**
	 * Turns each input line of the form {@code "<first> <second>"} into a
	 * composite key ({@code first}, {@code second}) with {@code second} as the value.
	 */
	public static class SecSortMapper extends Mapper<LongWritable, Text, FirstSortEntity, IntWritable> {

		/**
		 * Parses one input line and emits a single key/value pair.
		 *
		 * @param key     the input key supplied by the input format (not used here)
		 * @param value   one line of input; split on a single space
		 * @param context job context used to emit the output pair
		 * @throws IOException          if writing the output fails
		 * @throws InterruptedException if the task is interrupted while writing
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = value.toString();
			String[] spilted = line.split(" ");

			// Parse the numeric field once; it is used for both the key and the value.
			int second = Integer.parseInt(spilted[1]);

			// Print the mapper output pair so the data flow can be followed in the task logs.
			System.out.println("Mapper输出<" + spilted[0] + "," + spilted[1] + ">"+this);
			context.write(new FirstSortEntity(spilted[0], second), new IntWritable(second));
		}

	}

	/**
	 * Receives the grouped keys and writes one output record per reduce call.
	 */
	public static class SecSortReducer extends Reducer<FirstSortEntity, IntWritable , FirstSortEntity, IntWritable> {

		/**
		 * Logs every value in the group, then writes the key together with
		 * the key's own second field.
		 *
		 * @param key     the composite key for this group
		 * @param values  all values that were grouped under {@code key}
		 * @param context job context used to emit the output pair
		 * @throws IOException          if writing the output fails
		 * @throws InterruptedException if the task is interrupted while writing
		 */
		@Override
		protected void reduce(
				FirstSortEntity key,
				Iterable<IntWritable> values,
				Context context)
				throws IOException, InterruptedException {

			// Printed once per reduce() call, i.e. once per group.
			System.out.println("Reducer输入分组<" + key+ ",N(N>=1)>"+this);

			// Purely local accumulator, so the unsynchronized builder is sufficient.
			StringBuilder sb = new StringBuilder();
			for (IntWritable value : values) {
				// Printed once per input key/value pair of this group.
				sb.append( value+" , " ) ;
				System.out.println("Reducer输入键值对<" + key.toString() + "," + value.get() + ">  组"+sb.toString() );
			}

			// The key is written as it stands after the values have been iterated.
			context.write(key, key.getSecondkey());
		}

	}

	/**
	 * Configures and runs the job, exiting with status 0 on success and 1 on failure.
	 *
	 * @param args command-line arguments (not used; paths are hard-coded)
	 * @throws IOException            if the job cannot be created or submitted
	 * @throws ClassNotFoundException if a configured class cannot be loaded
	 * @throws InterruptedException   if waiting for the job is interrupted
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// Locate the job jar through a class that belongs to THIS job.
		job.setJarByClass(SecSortMain.class);

		job.setMapperClass(SecSortMapper.class);
		job.setMapOutputKeyClass(FirstSortEntity.class);
		job.setMapOutputValueClass(IntWritable.class);

		// Custom partitioner. As the original author noted, it has no effect
		// while the number of reduce tasks is 1.
		job.setPartitionerClass(SSPartintioner.class);
		job.setNumReduceTasks(1);

		// Overall sort order of the map output keys.
		job.setSortComparatorClass(MySSSortComparator.class);
		// Decides which consecutive sorted keys share one reduce() call.
		job.setGroupingComparatorClass(GroupComparator.class);

		job.setReducerClass(SecSortReducer.class);
		job.setOutputKeyClass(FirstSortEntity.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, "/sort/input");
		FileOutputFormat.setOutputPath(job, new Path("/sort/output1"));

		// Surface job failure to the caller through the process exit status.
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}

2 自定义组合key

package com.wzt.mapreduce.secondsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite map-output key made of a text field and an integer field.
 *
 * <p>The natural ordering ({@link #compareTo}) looks at the first key only;
 * the integer field does not take part in it.
 *
 * @author root
 */
public class FirstSortEntity implements WritableComparable<FirstSortEntity>{

	// Initialised eagerly so that an instance created through the no-arg
	// constructor can be written or compared without a NullPointerException.
	private Text firstkey = new Text();
	private IntWritable secondkey = new IntWritable();

	/** Creates a key with an empty first field and a second field of 0. */
	public FirstSortEntity( ) {
	}

	/**
	 * Creates a key that holds the given objects directly (no copy is made).
	 *
	 * @param firstkey  the text field
	 * @param secondkey the integer field
	 */
	public FirstSortEntity(Text firstkey, IntWritable secondkey) {
		this.firstkey = firstkey;
		this.secondkey = secondkey;
	}

	/**
	 * Creates a key from plain Java values.
	 *
	 * @param firstkey  the text field
	 * @param secondkey the integer field
	 */
	public FirstSortEntity(String firstkey, int secondkey) {
		this.firstkey = new Text(firstkey);
		this.secondkey = new IntWritable(secondkey);
	}

	/** @return the text field */
	public Text getFirstkey() {
		return firstkey;
	}

	/** @param firstkey the new text field */
	public void setFirstkey(Text firstkey) {
		this.firstkey = firstkey;
	}

	/** @return the integer field */
	public IntWritable getSecondkey() {
		return secondkey;
	}

	/** @param secondkey the new integer field */
	public void setSecondkey(IntWritable secondkey) {
		this.secondkey = secondkey;
	}

	/**
	 * Serializes this key: the first field as a UTF string, then the second field as an int.
	 *
	 * @param out the sink to write to
	 * @throws IOException if writing fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(firstkey.toString());
		out.writeInt(secondkey.get());
	}

	/**
	 * Deserializes this key, reading the fields in the order {@link #write} emits them.
	 * Fresh field objects are created on every call.
	 *
	 * @param in the source to read from
	 * @throws IOException if reading fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		firstkey = new Text(in.readUTF());
		secondkey = new IntWritable(in.readInt());
	}

	/**
	 * Compares two keys by their first field only, in ascending order.
	 * Swapping receiver and argument in the comparison would give descending order.
	 *
	 * @param entity the key to compare against
	 * @return a negative, zero or positive value as defined by {@code Text.compareTo}
	 */
	@Override
	public int compareTo(FirstSortEntity entity) {
		return this.firstkey.compareTo(entity.getFirstkey());
	}

	/** @return the first field, a space, the second field, then three spaces */
	@Override
	public String toString() {
		return this.getFirstkey() +" "+this.getSecondkey()+ "   "  ;
	}

}

3 自定义分区

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Custom partitioner.
public class SSPartintioner extends Partitioner<FirstSortEntity, IntWritable> {

	/**
	 * Picks a partition from the hash of the key's first field, so records
	 * whose first field hashes the same land in the same partition.
	 *
	 * @param key           the map output key
	 * @param value         the map output value (only used for logging)
	 * @param numPartitions the total number of partitions
	 * @return a partition index in the range {@code [0, numPartitions)}
	 */
	@Override
	public int getPartition(FirstSortEntity key, IntWritable value, int numPartitions) {
		// Clear the sign bit first so the remainder can never be negative.
		int nonNegativeHash = key.getFirstkey().hashCode() & Integer.MAX_VALUE;
		int partition = nonNegativeHash % numPartitions;

		System.out.println("Partitioner  key:" + key.getFirstkey() + "  value:" + value + "  " + partition + "   " + this);

		return partition;
	}

}

个人理解：以上步骤都是在 Map 阶段进行的（即本地操作），以下步骤则是在 Map 到 Reduce 之间进行的。

4 自定义整体排序

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;


/**
 * Sort comparator for {@link FirstSortEntity} keys: orders by the first
 * field, and by the second field when the first fields are equal.
 *
 * @author root
 */
public class MySSSortComparator extends WritableComparator{

	/** Registers the key type handled by this comparator. */
	public MySSSortComparator() {
		super(FirstSortEntity.class,true);
	}

	/**
	 * Compares two keys by first field, then by second field (ascending).
	 *
	 * @param a the left key; must be a {@code FirstSortEntity}
	 * @param b the right key; must be a {@code FirstSortEntity}
	 * @return a negative, zero or positive value for less-than, equal, greater-than
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {

		FirstSortEntity e1 = (FirstSortEntity)a;
		FirstSortEntity e2 = (FirstSortEntity)b;
		System.out.println( e1.getFirstkey()+"==MySSSortComparator 排序 。。 "+e2.getFirstkey());

		// Keys with different first fields are ordered by that field alone.
		if (!e1.getFirstkey().equals(e2.getFirstkey())) {
			return e1.getFirstkey().compareTo(e2.getFirstkey());
		}

		// Same first field: order by the second field. Integer.compare is used
		// instead of subtraction, which overflows for operands of opposite sign
		// and large magnitude and would then report the wrong order.
		return Integer.compare(e1.getSecondkey().get(), e2.getSecondkey().get());
	}
}

5 自定义分组

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;


/**
 * Grouping comparator for {@link FirstSortEntity} keys: two keys compare as
 * equal when their first fields are equal, regardless of the second field.
 */
public class GroupComparator extends WritableComparator{

	/** Registers the key type handled by this comparator. */
	public GroupComparator() {
		super(FirstSortEntity.class,true ) ;
	}

	/**
	 * Compares two keys by their first field only.
	 *
	 * @param a the left key; must be a {@code FirstSortEntity}
	 * @param b the right key; must be a {@code FirstSortEntity}
	 * @return zero when both first fields are equal, otherwise the result of
	 *         comparing the two first fields
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		FirstSortEntity e1 = (FirstSortEntity)a;
		FirstSortEntity e2 = (FirstSortEntity)b;
		System.out.println( e1.getFirstkey()+"==GroupComparator = 分组=="+e2.getFirstkey());

		// Compare the Text fields directly, the same way the sort comparator
		// does, instead of converting both to String for every comparison.
		return e1.getFirstkey().compareTo(e2.getFirstkey());
	}
}

再往后，就是由主类中的 reduce 对数据进行处理。

下面这个类作为自己的记录，这里没用：

package com.wzt.mapreduce.secondsort;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;

/**
 * Alternative grouping comparator implemented directly on top of
 * {@link RawComparator}. Per the original author's note this class is kept
 * for reference only: it is not wired into the job and has not been tested.
 *
 * <p>Both overloads compare the first field of the key, so that the raw
 * (serialized) comparison and the object comparison agree with each other.
 */
public class SSGroupComparator implements RawComparator<FirstSortEntity>{

	/**
	 * Compares two deserialized keys by their first field.
	 *
	 * @param o1 the left key
	 * @param o2 the right key
	 * @return zero when both first fields are equal, otherwise the result of
	 *         comparing the two first fields
	 */
	@Override
	public int compare(FirstSortEntity o1, FirstSortEntity o2) {
		return o1.getFirstkey().compareTo(o2.getFirstkey());
	}

	/**
	 * Compares two serialized keys by deserializing each one from its own
	 * slice of the buffer and delegating to the object comparison.
	 *
	 * @param b1 buffer holding the first key
	 * @param s1 offset of the first key in {@code b1}
	 * @param l1 length of the first key in bytes
	 * @param b2 buffer holding the second key
	 * @param s2 offset of the second key in {@code b2}
	 * @param l2 length of the second key in bytes
	 * @return the result of comparing the two deserialized keys
	 * @throws IllegalStateException if either key cannot be deserialized
	 */
	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		System.out.println( "SSGroupComparator   自定义分组 =" );

		FirstSortEntity entity1 = deserialize(b1, s1, l1);
		FirstSortEntity entity2 = deserialize(b2, s2, l2);

		return compare(entity1, entity2);
	}

	/** Reads one key from {@code length} bytes of {@code buffer} starting at {@code offset}. */
	private static FirstSortEntity deserialize(byte[] buffer, int offset, int length) {
		FirstSortEntity entity = new FirstSortEntity();
		try {
			// Restrict the stream to the key's own slice; the key does not
			// necessarily start at index 0 of the buffer.
			entity.readFields(new DataInputStream(new ByteArrayInputStream(buffer, offset, length)));
		} catch (IOException e) {
			// Fail loudly instead of continuing with a half-initialised key.
			throw new IllegalStateException("Failed to deserialize FirstSortEntity key", e);
		}
		return entity;
	}

}

hadoop_test1.zip (90 KB)
下载次数: 0

分享到：

Hadoop2.x动态添加或删除datanode | MapReduce中自定义Combiner

2016-01-28 19:18
浏览 746
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

MapReduce2中自定义排序分组

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

MapReduce2中自定义排序分组

评论

发表评论

相关推荐

strom使用示例

Hadoop2.x动态添加或删除datanode

MapReduce中自定义Combiner

2.x MapReduce的测试类

Sqoop

kafka使用与安装

storm 的安装使用

Hbase 的Java API 操作

Hbase 的java API 操作

Hbase集群安装

HIVE的安装与使用

HA 下执行JAVA操作hdfs

hadoop 2.x集群安装与配置

zookeeper安装

hadoop 2.x wordcount练习

Hadoop 2.x单节点部署学习。

SequenceFile和MapFile使用

重新编译Hadoop

Hadoop 中数据的序列化与反序列化

Hadoop基于文件的数据结构

最近访客更多访客>>