`

Java实现之网络爬虫

阅读更多
最近公司闲来无事,看到了Apache nutch项目,记得前段时间做了网上数据的抓取,是别人给的代码,自己改动一下代码,然后实现其功能。当初没有深究,所以现研究了一下。
从网上看了很多的例子,实现网络爬虫大致分三步走:一是下载所要抓取的网站页面;二是从页面中抽取链接;三是抽取页面中匹配的内容。以下是原始的实现方法,代码:
package com.shangkang.pzf.xywy;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.manager.Queue;
import com.shangkang.yjw.util.Constant;

public class GetStartPoint {

public static void main(String[] args) {

String baseUrl = "http://club.xywy.com/";

new GetStartPoint().downloadFile(baseUrl,"xywy");

String filePath  = "d:/crawler-cust/xywy.html";
testParserHtml2NeedLink(filePath);
        //加载所要爬虫的网站
public void downloadFile(String url,String fileName){
String saveFilePath = "d:/crawler-cust/";

HttpClient hc = null;
try {
hc = new DefaultHttpClient();                    hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 5000);
HttpGet httpGet = new HttpGet(url);
HttpResponse response = hc.execute(httpGet);
response.getParams();
HttpEntity entity = response.getEntity();
System.out.println(entity.getContentType());
if(entity != null)
{
InputStream is = entity.getContent();
FileUtils.copyInputStreamToFile(is, new File(saveFilePath + fileName + ".html"));
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}
}
//爬虫链接
public static void testParserHtml2NeedLink(String filePath)
{
try {
Parser parser = new Parser(filePath);
NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node) {
if(node.getText().startsWith("dl class=\"clearfix\""))
{ System.out.println("node.getText()"+node.getText());//class="clearfix"  <dl class="clearfix">
return true;
}else
{
return false;
}
}
});

NodeList nodeListA = new NodeList();
NodeList nodeListDd = new NodeList();
if(nodeList != null)
{
int size = nodeList.size();
for(int i = 0 ; i < size ; i ++)
{
Node dlNode = nodeList.elementAt(i);
nodeListDd = dlNode.getChildren();
nodeListA.add(nodeListDd.extractAllNodesThatMatch(new NodeFilter() {

@Override
public boolean accept(Node node) {
if(node.getText().startsWith("a target=\"_blank\" href="))
{ System.out.println(node.getText());
return true;
}
return false;
}
},true));
}
}
System.out.println("-------------------------------");
int size = nodeListA.size();
for(int i = 0; i< size ; i++)
{
// nodeListA.
Node node = nodeListA.elementAt(i);
if(node instanceof LinkTag)
{
String link = ((LinkTag)node).extractLink();

// System.out.println("link == " + link.replace("file://localhost", base_url_yp900));
link = link.replace("file://localhost", "");
System.out.println(link);
link =  Constant.BASE_URL_XYWY+link; LinkQueue.addUnvisitedUrl(link); LinkQueue.addUnvisitedUrlName(new String(node.toPlainTextString().getBytes("ISO-8859-1"),"GBK"));
}
// System.out.println(node);
}
File file = new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt");
File fileName = new File(Constant.SAVE_FILE_DIR + "xywy_need_links_TypeName.txt");
// Queue<String> ulrNames = LinkQueue.getUnVisitedUrlQueue();
Queue<String> ulrs = LinkQueue.getUnVisitedUrlQueue();
while(!ulrs.isEmpty())
{
String url = ulrs.deQueue();
// String urlName = ulrNames.deQueue();
// FileUtils.writeStringToFile(fileName, urlName+"\r\n", true);
FileUtils.writeStringToFile(file, url+"\r\n", true);
}
} catch (ParserException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
//爬虫二级连接
/**
* COPYRIGHT (C) 2010 LY. ALL RIGHTS RESERVED.
*
* No part of this publication may be reproduced, stored in a retrieval system,
* or transmitted, on any form or by any means, electronic, mechanical, photocopying,
* recording, or otherwise, without the prior written permission of 3KW.
*
* Created By: zzqiang
* Created On: 2013-6-18
*
* Amendment History:
*
* Amended By       Amended On      Amendment Description
* ------------     -----------     ---------------------------------------------
*
**/
package com.shangkang.pzf.xywy;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.util.Constant;

public class GetValuedLink {

public static void main(String[] args) throws IOException
{
List<String> urls = new ArrayList<String>();
//获取
urls =FileUtils.readLines(new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt"));
for (String url : urls)
{
String startPoint = url;
System.out.println(startPoint);
LinkQueue.addUnvisitedUrl(startPoint);
}

while (!LinkQueue.getUnVisitedUrlQueue().isEmpty())
{
String url = LinkQueue.getUnVisitedUrlQueue().deQueue();
System.out.println("---------------------正在处理Url----------------===" + url);

if(!LinkQueue.getVisitedUrl().contains(url))
{
downloadFileAndParserLink(url);

LinkQueue.addVisitedUrl(url);
}
}

String filePath = Constant.SAVE_FILE_DIR  + "valued_link_" + Constant.WWWXYWYCOM + ".txt";
LinkQueue.flushContent2File(LinkQueue.getValuedUrls(), filePath);

}
public static void downloadFileAndParserLink(String startPoint)
{
String accessUrl = startPoint;


//http://www.yp900.com/ZY-HXXT/index_2.htm
//http://www.yp900.com/ZY-HXXT/

String urlEnd  = startPoint.substring(startPoint.lastIndexOf("/")+1);

int lastPoint = startPoint.lastIndexOf("/");
int lastLastPoint = startPoint.substring(0, lastPoint).lastIndexOf("/");
String sonDir = startPoint.substring(lastLastPoint+1, lastPoint);

startPoint = startPoint.replace(urlEnd, "");

String fileName = urlEnd.equals("") ? sonDir : urlEnd.substring(0, urlEnd.lastIndexOf("."));

HttpClient hc = null;
String filePath = null;
try {
hc = new DefaultHttpClient();

hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);
HttpGet httpGet = new HttpGet(accessUrl);

HttpResponse response = hc.execute(httpGet);

response.getParams();

StatusLine statusLine = response.getStatusLine();

if(statusLine.getStatusCode() ==  200)
{
HttpEntity entity = response.getEntity();

// System.out.println(entity.getContentType());

if(entity != null)
{
InputStream is = entity.getContent();
filePath = Constant.SAVE_FILE_DIR+ Constant.WWWXYWYCOM +"/" + sonDir +"/"+fileName + ".htm";
System.out.println("save file Path = " + filePath);
FileUtils.copyInputStreamToFile(is, new File(filePath));
System.out.println("file down load succuss: source url =" + startPoint);

}
}else if(statusLine.getStatusCode() == 404)
{
System.err.println("http 404 :::"  + startPoint);;
}
else
{
System.err.println("http connect error");
}
if(null != filePath)
{
parserValuedLinkAndNextLink(filePath,startPoint);

System.out.println("-- 删除下载的文件 --" + filePath);
new File(filePath).delete();
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}


}

public static void parserValuedLinkAndNextLink(String filePath,String startPoint)
{
// div class="r_btn f_r"

try
{
Parser parser = new Parser(filePath);
NodeList nodeListDiv = parser.extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node)
{
// System.out.println(node);

if (node.getText().startsWith(
"td class=\"pl20 w340\""))
{
//class="clearfix"  <dl class="clearfix">
return true;
} else
{
return false;
}
}
});

NodeList nodeListA = new NodeList();

NodeList nodeListDd = new NodeList();

if(nodeListDiv != null)
{
int size = nodeListDiv.size();

for(int i = 0 ; i < size ; i ++)
{
Node divNode = nodeListDiv.elementAt(i);

NodeList nodes = divNode.getChildren();

nodeListA.add(nodes.extractAllNodesThatMatch(new NodeFilter() {

@Override
public boolean accept(Node node)
{
if(node instanceof LinkTag)
{
return true;
}
else
{
return false;
}
}
}, true));

}
}

System.out.println("-------抽取有价值的连接---start----");
int size = nodeListA.size();
for(int i = 0; i< size ; i++)
{
Node node = nodeListA.elementAt(i);
if(node instanceof LinkTag)
{
String link = ((LinkTag)node).extractLink();
// link = link.replace("file://localhost", "");
// System.out.println(link);

if(link.indexOf("static") != -1)
{
// link = Constant.BASE_URL_XYWY + link;

// link = link.replace("file://localhost", "");


System.out.println("valued link =" + link);

LinkQueue.addValuedUrl(link,Constant.WWWXYWYCOM);
}
}
}
System.out.println("-------抽取有价值的连接---end---");


System.out.println("-------抽取Next下载的连接- start------");

NodeList nextNodeList = new NodeList();


parser = new Parser(filePath);

NodeList pageNumNodeList = parser.extractAllNodesThatMatch(new NodeFilter(){

@Override
public boolean accept(Node node)
{
if(node.getText().startsWith("div class=\"clearfix pageStyle tc mt20 pb20 f12 pagelink\""))
{
return true;
}else
{
return false;
}
}

});
int divSize = pageNumNodeList.size();

String nextLink = null;

for(int i = 0; i< divSize; i++)
{
Node divNode = pageNumNodeList.elementAt(i);
nextNodeList = divNode.getChildren().extractAllNodesThatMatch(new NodeFilter() {
@Override
public boolean accept(Node node)
{
if(node.getText().startsWith("a href=") && node instanceof LinkTag)
{
LinkTag linkTag = (LinkTag)node;
String link = linkTag.extractLink();
String linkText = linkTag.getLinkText();
// System.out.println("linkText =" + linkText);
if(linkText.contains("下一页") && link != null && !link.equals(""))
{
return true;
}
}

return false;
}
}, true);

}
if(null != nextNodeList && nextNodeList.size() > 0)
{
Node node = nextNodeList.elementAt(0);

if(node instanceof LinkTag)
{
LinkTag linkTag = (LinkTag)node;
nextLink = linkTag.extractLink();
System.out.println("nextLink ==" + nextLink);

nextLink = Constant.BASE_URL_XYWY + nextLink;

System.out.println("找到新的下载链接:" + nextLink);

String fileName = nextLink.substring(nextLink.lastIndexOf("/"));

System.out.println("fileName ====" + fileName);

LinkQueue.addUnvisitedUrl(nextLink);
}
}
System.out.println("-------抽取Next下载的连接---end----");

} catch (Exception e)
{
e.printStackTrace();
}
}
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics