`
Java_大猫
  • 浏览: 169831 次
  • 性别: Icon_minigender_1
  • 来自: 大连
社区版块
存档分类
最新评论

httpclient4 网页抓取

    博客分类:
  • J2SE
 
阅读更多
最近在做全文检索，数据来源比较让人纠结，没办法，只好去抓取一些行业新闻，于是用了 HttpClient。

下面贴上代码，分享一下。

TEbInformationModel model = new TEbInformationModel();

HttpClient httpclient = new DefaultHttpClient();

// Client-wide charset hints.
// NOTE(review): HTTP.CONTENT_ENCODING, HTTP.CHARSET_PARAM and HTTP.CONTENT_TYPE look like
// header/parameter-name constants rather than HttpParams keys — confirm they have any effect.
httpclient.getParams().setParameter("http.protocol.content-charset", HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET, HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

HttpPost httppost = new HttpPost(httpurl);

// Ask the server for Chinese content, preferring GB2312 over UTF-8.
httppost.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httppost.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");

// Same charset hints again, this time scoped to the single request.
httppost.getParams().setParameter("http.protocol.content-charset", HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

HttpResponse response = httpclient.execute(httppost);

// The body is decoded as GBK regardless of the UTF-8 hints above.
// NOTE(review): presumably the target site serves GBK pages — confirm.
InputStream is = response.getEntity().getContent();
BufferedReader br = new BufferedReader(new InputStreamReader(is, "GBK"));
StringBuilder sbf = new StringBuilder();
String line = null;
try {
    // Lines are joined without separators, so the page becomes one long line.
    while ((line = br.readLine()) != null) {
        sbf.append(line);
    }
} finally {
    // Release the reader (and the underlying response stream) even if reading fails.
    br.close();
}

// Materialize the page once instead of once per extracted field.
String pageSource = sbf.toString();

String title = getSubTitle(getStringNoBlank(getTitle(pageSource, "title")));
String context = getSubContext(getStringNoBlank(getTitle(pageSource, "content")));
String key = getSubKey(getStringNoBlank(getTitle(pageSource, "key")));

System.out.println("标题:"+title);
System.out.println("内容:"+context);
System.out.println("关键字:"+key);


正则匹配部分的代码如下：
/**
 * Strips every whitespace character (spaces, tabs, carriage returns, line feeds, ...)
 * from the given text.
 *
 * @param str text to clean; may be {@code null} or empty
 * @return {@code str} itself when it is {@code null} or empty, otherwise a copy of it
 *         with all whitespace removed
 */
private String getStringNoBlank(String str) {
    // Nothing to strip: hand the input back untouched (including null).
    if (str == null || "".equals(str)) {
        return str;
    }
    return Pattern.compile("\\s*|\t|\r|\n").matcher(str).replaceAll("");
}
	
	
	
	/**
	 * Extracts the text between the first {@code <h1>} and the last {@code </h1>}
	 * of the given fragment.
	 *
	 * @param str HTML fragment expected to contain an {@code <h1>...</h1>} element;
	 *            may be {@code null}
	 * @return the enclosed text, or an empty string when {@code str} is {@code null}
	 *         or the tags are missing or out of order
	 */
	public String getSubTitle(String str) {
		if (str == null) {
			return "";
		}
		final String openTag = "<h1>";
		final int open = str.indexOf(openTag);
		if (open < 0) {
			// No opening tag: without this check, -1 + 4 would silently pick offset 3.
			return "";
		}
		final int start = open + openTag.length();
		final int end = str.lastIndexOf("</h1>");
		// Covers both a missing closing tag (-1) and one that precedes the content.
		if (end < start) {
			return "";
		}
		return str.substring(start, end);
	}
	/**
	 * Extracts the text between the first {@code <P>} and the last {@code </P>}
	 * of the given fragment. The tag match is case-sensitive (upper-case {@code P}).
	 *
	 * @param str HTML fragment expected to contain one or more {@code <P>...</P>}
	 *            paragraphs; may be {@code null}
	 * @return everything from after the first {@code <P>} up to the last {@code </P>}
	 *         (inner paragraph tags are kept), or an empty string when {@code str} is
	 *         {@code null} or the tags are missing or out of order
	 */
	public String getSubContext(String str) {
		if (str == null) {
			return "";
		}
		final String openTag = "<P>";
		final int open = str.indexOf(openTag);
		if (open < 0) {
			// No opening tag: without this check, -1 + 3 would silently pick offset 2.
			return "";
		}
		final int start = open + openTag.length();
		final int end = str.lastIndexOf("</P>");
		// Covers both a missing closing tag (-1) and one that precedes the content.
		if (end < start) {
			return "";
		}
		return str.substring(start, end);
	}
	/**
	 * Extracts the text between the first {@code </b>} and the last {@code </p>}
	 * of the given fragment, i.e. whatever follows the bold label inside the paragraph.
	 *
	 * @param str HTML fragment expected to look like {@code <p ...><b>label</b>value</p>};
	 *            may be {@code null}
	 * @return the text after the first {@code </b>} and before the last {@code </p>},
	 *         or an empty string when {@code str} is {@code null} or the tags are
	 *         missing or out of order
	 */
	public String getSubKey(String str) {
		if (str == null) {
			return "";
		}
		final String labelEnd = "</b>";
		final int label = str.indexOf(labelEnd);
		if (label < 0) {
			// No label terminator: without this check, -1 + 4 would silently pick offset 3.
			return "";
		}
		final int start = label + labelEnd.length();
		final int end = str.lastIndexOf("</p>");
		// Covers both a missing closing tag (-1) and one that precedes the content.
		if (end < start) {
			return "";
		}
		return str.substring(start, end);
	}
	
	/**
	 * Collects every fragment of the page that matches the pattern selected by
	 * {@code type} and returns the fragments concatenated in the order they occur.
	 *
	 * @param s    page source to search; {@code null} is treated as an empty page
	 * @param type {@code "title"} selects the {@code zz_leftneirong1} block up to
	 *             {@code </h1>}, {@code "content"} selects the {@code zz_leftneirong4}
	 *             content {@code div}; any other value selects the keyword paragraph
	 * @return the concatenated matches (markup included), or an empty string when
	 *         nothing matches
	 */
	private String getTitle(String s, String type) {
		if (s == null) {
			return "";
		}

		final String regex;
		if ("title".equals(type)) {
			regex = "<div class=\"zz_leftneirong1\">.*?</h1>";
		} else if ("content".equals(type)) {
			regex = "<div  class=\"zz_leftneirong4\" id=\"content\" name=\"content\">.*? </div>";
		} else {
			regex = " <p class=\"key\"><b>本文关键词:</b>.*?</p>";
		}

		// DOTALL is not set, so '.' stops at line terminators and a match cannot span lines.
		// NOTE(review): CANON_EQ is kept from the original; confirm it is needed before removing.
		final Matcher matcher = Pattern.compile(regex, Pattern.CANON_EQ).matcher(s);

		// Append matches directly instead of collecting them in a list and
		// re-concatenating strings in a second loop.
		final StringBuilder matches = new StringBuilder();
		while (matcher.find()) {
			matches.append(matcher.group());
		}
		return matches.toString();
	}
	


分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics