`
cakin24
  • 浏览: 1334856 次
  • 性别: Icon_minigender_1
  • 来自: 西安
社区版块
存档分类
最新评论

MapReduce之WordCount单词计数

阅读更多
一 代码
WordCount.java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 
public class WordCount {
public static class WordCountMap extends
Mapper<LongWritable, Text, Text, IntWritable> {
private final IntWritable one = new IntWritable(1);
private Text word = new Text();
 
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
StringTokenizer token = new StringTokenizer(line);
while (token.hasMoreTokens()) {
word.set(token.nextToken());
context.write(word, one);
}
}
}
 
public static class WordCountReduce extends
Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
 
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(WordCount.class);
job.setJobName("wordcount");
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(WordCountMap.class);
job.setReducerClass(WordCountReduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
 
二 构建运行
1、编译
[root@localhost word_count]# ll
total 4
drwxr-xr-x. 2 root root 101 Aug 20 14:27 word_count_class
-rwxr-xr-x. 1 root root 2132 Aug 20 14:22 WordCount.java
[root@localhost word_count]# javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d word_count_class/ WordCount.java
[root@localhost word_count]# cd word_count_class/
[root@localhost word_count_class]# ls
WordCount.class WordCount$WordCountMap.class WordCount$WordCountReduce.class
2、打包
[root@localhost word_count_class]# jar -cvf wordcount.jar *.class
added manifest
adding: WordCount.class(in = 1539) (out= 772)(deflated 49%)
adding: WordCount$WordCountMap.class(in = 1829) (out= 767)(deflated 58%)
adding: WordCount$WordCountReduce.class(in = 1645) (out= 687)(deflated 58%)
[root@localhost word_count_class]# ls
WordCount.class wordcount.jar WordCount$WordCountMap.class WordCount$WordCountReduce.class
3、准备输入文件file1和输入文件file2
[root@localhost input]# ls
file1 file2
file1的内容:
hello world
hello hadoop
hadoop file system
hadoop java api
hello java
file2的内容:
new file
hadoop file
hadoop new world
hadoop free home
hadoop free school
4、将输入文件上传到HDFS
[root@localhost word_count]# hadoop fs -mkdir input_wordcount
Warning: $HADOOP_HOME is deprecated.
 
[root@localhost word_count]# hadoop fs -put input/* input_wordcount/
Warning: $HADOOP_HOME is deprecated.
[root@localhost word_count]# hadoop fs -ls
Warning: $HADOOP_HOME is deprecated.
 
Found 2 items
drwxr-xr-x - root supergroup 0 2017-08-20 12:44 /user/root/input
drwxr-xr-x - root supergroup 0 2017-08-20 14:41 /user/root/input_wordcount
[root@localhost word_count]# hadoop fs -ls input_wordcount
Warning: $HADOOP_HOME is deprecated.
 
Found 2 items
-rw-r--r-- 3 root supergroup 71 2017-08-20 14:41 /user/root/input_wordcount/file1
-rw-r--r-- 3 root supergroup 74 2017-08-20 14:41 /user/root/input_wordcount/file2
[root@localhost word_count]# hadoop fs -cat input_wordcount/file1
Warning: $HADOOP_HOME is deprecated.
 
hello world
hello hadoop
hadoop file system
hadoop java api
hello java
5、任务提交
[root@localhost word_count]# hadoop jar word_count_class/wordcount.jar WordCount input_wordcount output_wordcount
Warning: $HADOOP_HOME is deprecated.
 
17/08/20 14:50:30 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
17/08/20 14:50:31 INFO input.FileInputFormat: Total input paths to process : 2
17/08/20 14:50:31 INFO util.NativeCodeLoader: Loaded the native-hadoop library
17/08/20 14:50:31 WARN snappy.LoadSnappy: Snappy native library not loaded
17/08/20 14:50:33 INFO mapred.JobClient: Running job: job_201708201140_0001
17/08/20 14:50:34 INFO mapred.JobClient: map 0% reduce 0%
17/08/20 14:51:20 INFO mapred.JobClient: map 100% reduce 0%
17/08/20 14:51:45 INFO mapred.JobClient: map 100% reduce 100%
17/08/20 14:51:51 INFO mapred.JobClient: Job complete: job_201708201140_0001
17/08/20 14:51:52 INFO mapred.JobClient: Counters: 29
17/08/20 14:51:52 INFO mapred.JobClient: Job Counters
17/08/20 14:51:52 INFO mapred.JobClient: Launched reduce tasks=1
17/08/20 14:51:52 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=81389
17/08/20 14:51:52 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
17/08/20 14:51:52 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
17/08/20 14:51:52 INFO mapred.JobClient: Launched map tasks=2
17/08/20 14:51:52 INFO mapred.JobClient: Data-local map tasks=2
17/08/20 14:51:52 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=24253
17/08/20 14:51:52 INFO mapred.JobClient: File Output Format Counters
17/08/20 14:51:52 INFO mapred.JobClient: Bytes Written=83
17/08/20 14:51:52 INFO mapred.JobClient: FileSystemCounters
17/08/20 14:51:52 INFO mapred.JobClient: FILE_BYTES_READ=301
17/08/20 14:51:52 INFO mapred.JobClient: HDFS_BYTES_READ=381
17/08/20 14:51:52 INFO mapred.JobClient: FILE_BYTES_WRITTEN=156847
17/08/20 14:51:52 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=83
17/08/20 14:51:52 INFO mapred.JobClient: File Input Format Counters
17/08/20 14:51:52 INFO mapred.JobClient: Bytes Read=145
17/08/20 14:51:52 INFO mapred.JobClient: Map-Reduce Framework
17/08/20 14:51:52 INFO mapred.JobClient: Map output materialized bytes=307
17/08/20 14:51:52 INFO mapred.JobClient: Map input records=10
17/08/20 14:51:52 INFO mapred.JobClient: Reduce shuffle bytes=307
17/08/20 14:51:52 INFO mapred.JobClient: Spilled Records=50
17/08/20 14:51:52 INFO mapred.JobClient: Map output bytes=245
17/08/20 14:51:52 INFO mapred.JobClient: Total committed heap usage (bytes)=246751232
17/08/20 14:51:52 INFO mapred.JobClient: CPU time spent (ms)=5290
17/08/20 14:51:52 INFO mapred.JobClient: Combine input records=0
17/08/20 14:51:52 INFO mapred.JobClient: SPLIT_RAW_BYTES=236
17/08/20 14:51:52 INFO mapred.JobClient: Reduce input records=25
17/08/20 14:51:52 INFO mapred.JobClient: Reduce input groups=11
17/08/20 14:51:52 INFO mapred.JobClient: Combine output records=0
17/08/20 14:51:52 INFO mapred.JobClient: Physical memory (bytes) snapshot=382996480
17/08/20 14:51:52 INFO mapred.JobClient: Reduce output records=11
17/08/20 14:51:52 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2590666752
17/08/20 14:51:52 INFO mapred.JobClient: Map output records=25
6、查看结果
[root@localhost word_count]# hadoop fs -ls output_wordcount
Warning: $HADOOP_HOME is deprecated.
 
Found 3 items
-rw-r--r-- 3 root supergroup 0 2017-08-20 14:51 /user/root/output_wordcount/_SUCCESS
drwxr-xr-x - root supergroup 0 2017-08-20 14:50 /user/root/output_wordcount/_logs
-rw-r--r-- 3 root supergroup 83 2017-08-20 14:51 /user/root/output_wordcount/part-r-00000
[root@localhost word_count]# hadoop fs -cat output_wordcount/part-r-00000
Warning: $HADOOP_HOME is deprecated.
 
api 1
file 3
free 2
hadoop 7
hello 3
home 1
java 2
new 2
school 1
system 1
world 2
 

 

 
分享到:
评论

相关推荐

    MapReduce编程实例:单词计数

    单词计数(WordCount)的任务是对一组输入文档中的单词进行分别计数。假设文件的量比较大,每个文档又包含大量的单词,则无法使用传统的线性程序进行处理,而这类问题正是 MapReduce 可以发挥优势的地方。 在前面...

    实验项目 MapReduce 编程

    3. 查看 Hadoop 自带的 MR-App 单词计数源代码 WordCount.java,在 Eclipse 项目 MapReduceExample 下建立新包 com.xijing.mapreduce,模仿内置的 WordCount 示例,自己编写一个 WordCount 程序,最后打包成 JAR ...

    mapreduce wc单词计数 自定义分区 自定义排序实现

    实现mr的wordcount功能和自定义分区的功能、自定义排序功能;com.ellis.mr1为类似wc功能,com.ellis.mr2为自定义分区功能,com.ellis.mr3为自定义排序功能

    wordcount:使用Java的Hadoop MapReduce单词计数

    字数 使用Java的Hadoop MapReduce字数统计 运行: hadoop jar wordcount.jar "input_folder" "output_folder" “ input_folder”和“ output_folder”是HDFS上的文件夹。

    Hadoop原理与技术MapReduce实验

    1.单词计数实验(wordcount) (1)输入start-all.sh启动hadoop相应进程和相关的端口号 (2)打开网站localhost:8088和localhost:50070,查看MapReduce任务启动情况 (3)写wordcount代码并把代码生成jar包 (4)运行...

    MapReduce_mapReduce_

    MapReduce--1--入门程序WordCountMapReduce界的helloworld程序就是WordCount程序。所谓WordCount,就是单词计数,就是用来统计一篇或者一堆文本文件中的各单词的出现次数。

    phoenix_wordcount.tar.gz_Hadoop Phoenix_mapReduce_phoenix wordc

    mapreduce算法的phoenix架构实现和一个使用实例,用mapreduce算法对一个输入的文本文件中的单词计数。

    编写Java程序,实现统计单词个数功能

    1. 创建目录 2. mkdir wcinput ...5. 调用上传的jar包,实现单词计数功能 hadoop jar wordcount.jar com.only.mapreduce.wordcount.WordcountDriver /user/wcinput /user/wcoutput 6. 查看结果 hadoop fs -

    word源码java-hadoop-test:hadoop、mapreduce的一些练习

    包org.dan.mr.wordcount MapReduce单词计数 包org.dan.mr.flowsum MapReduce流量统计 包org.dan.mr.flowsumsort MapReduce流量统计,按总流量排序 包org.dan.mr.order_pro MapReduce实现订单信息和产品信息的join...

    summingbird:通过缩放和风暴流式传输MapReduce

    虽然纯Scala中的单词计数聚合看起来像这样: def wordCount ( source : Iterable [ String ], store : MutableMap [ String , Long ]) = source.flatMap { sentence =&gt; toWords(sentence).map(_ - &gt; 1L ) }....

    gomr:Golang的MapReduce框架

    有关规范的单词计数mapreduce程序,请参见examples/wordcount/parallel 。 要构建,将cd进入目录并运行go build 。 然后,使用./parallel 运行。 入门 要为GoMR编写作业,我们首先需要创建和满足gomr.go找到的接口的...

    hadoop_letter_counter

    字数使用Hadoop MapReduce的臭名昭著的单词计数MapReduce示例。输入包含要计数的单词的文件输出您提供作为输入的文件中的单词及其对应出现的列表。怎么跑说明需要事先设置Hadoop。 克隆存储库,然后在pom.xml所在的...

    MRWordCountKite

    本示例使用规范的MapReduce单词计数示例来演示如何将单词计数的结果写为Parquet文件。 这是该程序的高级描述: 为了将数据存储为Parquet,我们使用简单的WordCount.avsc Avro模式,并使用Avro工具将其转换为Java类...

    INFSCI2711_Homework3

    INFSCI2711_Homework3 队员: ... 对于使用Hadoop MapReduce的方法,我们设计通过Map获取所有以所有字母开头的单词,然后使用Reduce对以相同字母开头的单词进行计数,得到结果。 Java文件 输入 输出

Global site tag (gtag.js) - Google Analytics