`
lovejuan1314
  • 浏览: 337487 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

Nutch 研究<三> 将Nutch爬取结果放入Hypertable

阅读更多
想把Nutch抓取的web page结果放入到Hypertable中去,目前思路主要有三个:

1. 修改Nutch源代码,让Nutch基于Hypertable工作,可以参考Hbase的实现. 由于该实现缺失Nutch好多特性,而且不易升级,考虑作罢.

2. 将Nutch抓取结果用命令导出为text格式的dump文件,然后用MapReduce解析该文件,把相关信息写入Hypertable.

3. 其实和第一种思路一样,只不过是直接使用别人已经改好的基于Hbase的实现,然后导出一份tsv文件再导入到Hypertable. 不仅包含了第一种思路的缺点,还增加了额外的麻烦. 不考虑.

好,以下代码基于第二种思想实现.

package nutchdump;

import java.io.IOException;
import java.sql.Timestamp;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransportException;
import org.hypertable.thrift.ThriftClient;
import org.hypertable.thriftgen.Cell;
import org.hypertable.thriftgen.ClientException;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;


/**
 * NutchDumpReader
 *
 *Reads the dump entries from nutch dump command output, get each line result to
 *write into hypertable database as special format
 *由于只保存抓取的网页内容,所以只关心Nutch导出的文件中,Content::这一块的相关信息
 *
 * @author(lovejuan1314)
 */

public class NutchDumpReader extends Configured implements Tool{
	
	  // where to put the data in hdfs when we're done
	  private static final String OUTPUT_PATH = "nutch_content_result";

	  // where to read the data from.
	  private static final String INPUT_PATH = "/shared/nutch/segdump";
  
	  static class NutchReaderMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
		  
		public NutchReaderMapper() { }  
	
		public void map(LongWritable key, Text value,
				OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
			String dumpline = value.toString();
			NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(dumpline);
			String version = nutchDumpRecord.getVersion();
			if (version != null){
				output.collect(new Text("version"), new Text(version));
			}
			String base = nutchDumpRecord.getBase();
			if (base != null){
				output.collect(new Text("base"), new Text(base));
			}
			String ContentType = nutchDumpRecord.getContentType();
			if (ContentType != null){
				output.collect(new Text("ContentType"), new Text(ContentType));
			}
			String metadata = nutchDumpRecord.getMetadata();
			if (metadata != null){
				output.collect(new Text("metadata"), new Text(metadata));
			}
			String url = nutchDumpRecord.getUrl();
			if (url != null){
				output.collect(new Text("url"), new Text(url));
			} 
			
			String content = nutchDumpRecord.getContent();
			if (content != null){
				output.collect(new Text("content"), new Text(content));
			}
			
		}
		  
	  }
	  
	  static class NutchReaderReducer extends MapReduceBase
      implements Reducer<Text, Text, Text, NullWritable> {
		  
		public void reduce(Text key, Iterator<Text> values,
				OutputCollector<Text, NullWritable> output, Reporter reporter)
				throws IOException {
			String valKey = key.toString();
			
			while(values.hasNext()){
				Text val = values.next();
				if (val.toString() != null){
					//write into hypertable
					writeIntoTable(valKey,val.toString());
					// output
					output.collect(key, NullWritable.get());
				}
			}
			
		}
		  
	  }
	  
	  /**
	   * 
	   * @param colName
	   * @param colValue
	   */
	  
	  private static void writeIntoTable(String colName,String colValue){
		  
		  try {
			  
			ThriftClient client = ThriftClient.create("192.168.0.40", 38080);
			// mutator examples
		    long mutator = client.open_mutator("webDb", 0, 0);
		      
		    Timestamp ts = new Timestamp(System.currentTimeMillis());
		      
		      try {
		        Cell cell = new Cell();
		        String sysDt = ts.toString();
//设置行关键字 我使用了系统时间+反转URL的格式
		        cell.row_key = sysDt+" "+"com.mytest.www";
//列名
		        cell.column_family = colName;
//列值
		        cell.value = colValue.getBytes();
		        client.set_cell(mutator, cell);
		      }
		      finally {
		        client.close_mutator(mutator, true);
		      }
			
		} catch (TTransportException e) {
			e.printStackTrace();
		} catch (TException e) {
			e.printStackTrace();
		}catch (ClientException ex){
			ex.printStackTrace();
		}
		  
	  }
	  
	  /** Driver for the actual MapReduce process */
	  
	  private void runJob() throws IOException{
		  JobConf conf = new JobConf(getConf(),NutchDumpReader.class);
		  
		  FileInputFormat.addInputPath(conf, new Path(INPUT_PATH));
		  FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));
		  
		  conf.setMapperClass(NutchReaderMapper.class);
		  conf.setReducerClass(NutchReaderReducer.class);
		  
		  conf.setOutputKeyClass(Text.class);
		  conf.setOutputValueClass(NullWritable.class);
		  
		  conf.setMapOutputValueClass(Text.class);
		  
		  JobClient.runJob(conf);
	  }


	public int run(String[] arg0) throws Exception {
		runJob();
		return 0;
	}
	
	 public static void main(String [] args) throws Exception {
		    int ret = ToolRunner.run(new NutchDumpReader(), args);
		    System.exit(ret);
		  }

}





package nutchdump;


public class NutchDumpRecord {

    /** The actual line from the dump file; never {@code null}. */
    private String record;

    // The fields found on the line. Each one stays null when its marker does
    // not occur in the line.
    private String version;
    private String url;
    private String base;
    private String contentType;
    private String metadata;
    private String content;

    /**
     * Creates a record from one dump line and parses it immediately.
     *
     * @param record the dump line; {@code null} is treated as the empty string
     */
    public NutchDumpRecord(final String record) {
        this.record = (record == null) ? "" : record;
        this.parse();
    }

    /**
     * Looks for each known marker in the line. For every marker that is found,
     * the corresponding field is set to the trimmed text starting at the marker
     * (the marker itself included) and running to the end of the line.
     */
    protected void parse() {
        this.version = tailFrom("Version:");
        this.url = tailFrom("url:");
        this.base = tailFrom("base:");
        this.contentType = tailFrom("contentType:");
        this.metadata = tailFrom("metadata");
        this.content = tailFrom("Content:");
    }

    /**
     * Returns the trimmed remainder of the line beginning at the first
     * occurrence of {@code marker}, or {@code null} when the marker is absent.
     */
    private String tailFrom(String marker) {
        int start = this.record.indexOf(marker);
        if (start == -1) {
            return null;
        }
        return this.record.substring(start).trim();
    }

    // getters

    /** Return the record */
    public String getRecord() {
        return this.record;
    }

    /** Return the text from "Version:" onwards, or null if absent. */
    public String getVersion() {
        return this.version;
    }

    /** Return the text from "url:" onwards, or null if absent. */
    public String getUrl() {
        return this.url;
    }

    /** Return the text from "base:" onwards, or null if absent. */
    public String getBase() {
        return this.base;
    }

    /** Return the text from "contentType:" onwards, or null if absent. */
    public String getContentType() {
        return this.contentType;
    }

    /** Return the text from "metadata" onwards, or null if absent. */
    public String getMetadata() {
        return this.metadata;
    }

    /** Return the text from "Content:" onwards, or null if absent. */
    public String getContent() {
        return this.content;
    }
}





//这个类是Hypertable源码中提供的. 

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 * Copyright (C) 2008  Luke Lu (Zvents, Inc.)
 *
 * This file is distributed under the Apache Software License
 * (http://www.apache.org/licenses/)
 */

package nutchdump;

import org.hypertable.thriftgen.*;

import org.apache.thrift.TException;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TTransportException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;

public class ThriftClient extends HqlService.Client {
  /** Transport underneath the protocol; assigned by the factory method. */
  private TFramedTransport transport;

  /** True while the transport has been opened by {@link #open()} and not yet closed. */
  private boolean needsClose = false;

  /** Creates a client on top of an existing protocol. */
  public ThriftClient(TProtocol protocol) {
    super(protocol);
  }

  /**
   * Factory method: builds a framed socket transport with a binary protocol
   * and wraps it in a client.
   *
   * <p>A constructor cannot do this because {@code super(...)} has to be its
   * first statement, so the transport could not be created and kept beforehand.
   *
   * @param host       host to connect to
   * @param port       port to connect to
   * @param timeout_ms timeout passed to the socket, in milliseconds
   * @param do_open    whether to open the connection before returning
   * @return the new client
   */
  public static ThriftClient
  create(String host, int port, int timeout_ms, boolean do_open)
      throws TTransportException, TException {
    TSocket socket = new TSocket(host, port, timeout_ms);
    TFramedTransport framed = new TFramedTransport(socket);
    ThriftClient result = new ThriftClient(new TBinaryProtocol(framed));
    result.transport = framed;

    if (do_open) {
      result.open();
    }

    return result;
  }

  /**
   * Convenience overload: uses a timeout of 30000 ms and opens the connection
   * immediately.
   */
  public static ThriftClient create(String host, int port)
      throws TTransportException, TException {
    return create(host, port, 30000, true);
  }

  /** Opens the transport and remembers that it has to be closed again. */
  public void open() throws TTransportException, TException {
    transport.open();
    needsClose = true;
  }

  /** Closes the transport if it was opened through {@link #open()}; otherwise a no-op. */
  public void close() {
    if (!needsClose) {
      return;
    }
    transport.close();
    needsClose = false;
  }
}




代码完成后直接打成jar包,在hadoop环境下运行就可以了.


Ps:仅供参考,如果大家有什么更好的方法,欢迎讨论. 另外代码里也没有严格控制数据的一致性,若要在产品上运行还得进一步修改.
分享到:
评论
1 楼 diddyrock 2009-09-24  
其实第一条路最快

相关推荐

    AnyFo – Nutch 冰破银针

    1.6 Tomcat中启动搜索站台 1. 将Nutch.war包考到Tomcat的webapps下。...&lt;value&gt;E:\nutch-0.9\crawl&lt;/value&gt; &lt;/property&gt; 2. 启动Tomcat,输入http://127.0.0.1:8080/nutch-0.9 3. 可以进行查询了。

    nutch网页爬取总结

    nutch安装指南,nutch教程,nutch网络爬取

    filter-nutch-plugin

    Nutch 1.x插件,允许对网页的入站和出站进行索引。 默认情况下,此插件会忽略那些主机与被索引网页的主机匹配的出站链接。 通过将以下内容添加到您的nutch-site.xml可以绕过此行为。 &lt; property&gt; &lt; name&gt;outlinks....

    基于Apache Nutch和Solr的AJAX页面内容爬取与处理设计源码

    本项目是基于Apache Nutch和Solr开发的AJAX页面内容爬取与处理设计源码,主要使用Java进行开发。...项目结构清晰,代码注释详尽,适合用于学习和研究Apache Nutch和Solr在AJAX页面内容爬取与处理中的应用。

    Nutch安装配置

    NULL 博文链接:https://jcyanfan.iteye.com/blog/257691

    indexer-links:Nutch 1.x插件,可对网页的入站和出站进行索引

    链接提取器Nutch 1.x插件,允许对网页的入站和出...则可以通过nutch-site.xml配置文件, 只需添加以下内容: &lt; property&gt; &lt; name&gt;inlinks.host.ignore&lt;/ name&gt; &lt; value&gt;false&lt;/ value&gt;&lt;/ property&gt; 如果只对入站和出站

    Apache Nutch网络爬虫-其他

    &lt;/p&gt;&lt;p&gt;Nutch诞生于2002年8月,是Apache旗下的一个用Java实现的开源搜索引擎项目,自Nutch1.2版本之后,Nutch已经从搜索引擎演化为网络爬虫,接着Nutch进一步演化为两大分支版本:1.X和2.X,这两大分支最大的区别...

    LoremIpsumSearch:包含与 lucene 和 solr 一起使用的搜索算法

    LoremIpsum搜索 包含与 lucene 和 solr 一起使用的搜索算法... export CLASSPATH="&lt;lucene&gt;/lucene/replicator/lib/*:&lt;nutch&gt;/build/*:&lt;nutch&gt;/build/lib/*:&lt;lucene&gt;/solr/dist/*:&lt;lucene&gt;/solr/ dist/solrj-lib/*:*:.

    Nutch搜索引擎的页面排序修改方法研究.kdh

    Nutch是一个优秀的开放源代码的Web...分析开源搜索引擎Nutch代码,研究了Nutch的页面排序方法。在Nutch原有的结构基础上提出了3种修改Nutch 排序的方法,对每种方法的实现进行了阐述,最后对这些方法的特点进行了比较

    Apache Nutch-其他

    &lt;/p&gt;&lt;p&gt;Nutch诞生于2002年8月,是Apache旗下的一个用Java实现的开源搜索引擎项目,自Nutch1.2版本之后,Nutch已经从搜索引擎演化为网络爬虫,接着Nutch进一步演化为两大分支版本:1.X和2.X,这两大分支最大的区别...

    nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据

    nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据nutch 爬虫数据

    Missing jar for nutch

    you will encouter problems with some imports in parse-mp3 and parse-rtf plugins in nutch project.&lt;br&gt;please copy the jar files into src/plugin/parse-mp3/lib and src/plugin/parse-rtf/lib respectively.

    nutch介绍信息

    java实现的开源搜索引擎nutch 主要类分析: 一、org.apache.nutch.crawl.Injector: 1,注入url.txt 2,url标准化 3,拦截url,进行正则校验(regex-urlfilter.... 1,从segment中读取&lt;url, CrawlDatum&gt;,将它放入相

    Nutch中文教程nutcher.zip

    Nutch教程——导入Nutch工程,执行完整爬取 Nutch流程控制源码详解(bin/crawl中文注释版) Nutch教程——URLNormalizer源码详解 Nutch参数配置——http.content.limit ...

    Nutch_插件深入研究

    nutch插件,安装nutch插件,mysql与nutch

    nutch 初学文档教材

    1.2研究nutch的原因...1 1.3 nutch的目标..1 1.4 nutch VS lucene.....2 2. nutch的安装与配置.....3 2.1 JDK的安装与配置.3 2.2 nutch的安装与配置........5 2.3 tomcat的安装与配置......5 3. nutch初体验7 3.1 ...

    Nutch搜索引擎·Nutch浅入分析(第5期)

    1.1 Nutch 基本原理 1.1.1 Nutch 基本组成 1.1.2 Nutch 工作流程 1.2 Nutch 流程详解 1.2.1 Nutch 数据流程 1.2.2 Nutch 流程分析

    Nutch相关框架视频教程

    资源名称:Nutch相关框架视频教程资源目录:【】Nutch相关框架视频教程1_杨尚川【】Nutch相关框架视频教程2_杨尚川【】Nutch相关框架视频教程3_杨尚川【】Nutch相关框架视频教程4_杨尚川【】Nutch相关框架视频教程5_...

    Nutch执行单步执行、中间结果文件分析和插件开发基础

    资源中urls.txt是我nutch单步执行过程的种子文件,里面的ppt主要讲解nutch单步执行流程,并获取每次单步执行的结果文件,对文件进行分析,同时ppt还讲解了nutch的插件的基础知识,不是很详细,但是可以作为参考。...

    eclipse配置nutch,eclipse配置nutch

    eclipse配置nutch,eclipse配置nutch

Global site tag (gtag.js) - Google Analytics