Things have been quiet at the office lately, and I came across the Apache Nutch project. I remembered doing some web scraping a while back with code someone else gave me: I only tweaked it enough to make it work and never dug into how it actually worked, so I have now taken the time to study it properly.
Judging from the many examples online, implementing a web crawler boils down to roughly three steps: first, download the site to be crawled; second, extract its links; third, extract the content that matches what you want. Below is the original implementation.
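To see the shape of the pipeline before reading the full listings, here is a minimal driver sketch that simply chains the two classes shown below (GetStartPoint and GetValuedLink); it adds nothing beyond the calls the listings themselves make:

// Minimal sketch of the three-step flow, driving the classes defined below.
public class CrawlerOutline {
    public static void main(String[] args) throws Exception {
        // Step 1: download the seed page of the target site to disk.
        new GetStartPoint().downloadFile("http://club.xywy.com/", "xywy");
        // Step 2: parse the saved page and persist the links worth following.
        GetStartPoint.testParserHtml2NeedLink("d:/crawler-cust/xywy.html");
        // Step 3: visit each queued link, collecting valued links page by page.
        GetValuedLink.main(args);
    }
}

The first class, GetStartPoint, covers steps 1 and 2: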
package com.shangkang.pzf.xywy;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.manager.Queue;
import com.shangkang.yjw.util.Constant;

public class GetStartPoint {

    public static void main(String[] args) {
        String baseUrl = "http://club.xywy.com/";
        // Step 1: download the seed page to disk
        new GetStartPoint().downloadFile(baseUrl, "xywy");
        String filePath = "d:/crawler-cust/xywy.html";
        // Step 2: parse the saved page and queue the links we need
        testParserHtml2NeedLink(filePath);
    }

    // Step 1: download the site to be crawled
    public void downloadFile(String url, String fileName) {
        String saveFilePath = "d:/crawler-cust/";
        HttpClient hc = null;
        try {
            hc = new DefaultHttpClient();
            // Socket read timeout: 5 seconds
            hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 5000);
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = hc.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                System.out.println(entity.getContentType());
                InputStream is = entity.getContent();
                FileUtils.copyInputStreamToFile(is, new File(saveFilePath + fileName + ".html"));
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (hc != null) {
                hc.getConnectionManager().shutdown();
            }
        }
    }

    // Step 2: extract the links to crawl from the saved page
    public static void testParserHtml2NeedLink(String filePath) {
        try {
            Parser parser = new Parser(filePath);
            // Match every <dl class="clearfix"> block
            NodeList nodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    return node.getText().startsWith("dl class=\"clearfix\"");
                }
            });
            NodeList nodeListA = new NodeList();
            if (nodeList != null) {
                int size = nodeList.size();
                for (int i = 0; i < size; i++) {
                    Node dlNode = nodeList.elementAt(i);
                    NodeList nodeListDd = dlNode.getChildren();
                    // Inside each <dl>, keep only <a target="_blank" href=...> tags
                    nodeListA.add(nodeListDd.extractAllNodesThatMatch(new NodeFilter() {
                        @Override
                        public boolean accept(Node node) {
                            return node.getText().startsWith("a target=\"_blank\" href=");
                        }
                    }, true));
                }
            }
            int size = nodeListA.size();
            for (int i = 0; i < size; i++) {
                Node node = nodeListA.elementAt(i);
                if (node instanceof LinkTag) {
                    String link = ((LinkTag) node).extractLink();
                    // The parser resolves relative links against the local file,
                    // so strip the file://localhost prefix before re-basing them
                    link = link.replace("file://localhost", "");
                    System.out.println(link);
                    link = Constant.BASE_URL_XYWY + link;
                    LinkQueue.addUnvisitedUrl(link);
                    // Re-decode the link text: the GBK page was read as ISO-8859-1
                    LinkQueue.addUnvisitedUrlName(new String(
                            node.toPlainTextString().getBytes("ISO-8859-1"), "GBK"));
                }
            }
            // Flush the queued links to a text file, one per line
            File file = new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt");
            Queue<String> urls = LinkQueue.getUnVisitedUrlQueue();
            while (!urls.isEmpty()) {
                String url = urls.deQueue();
                FileUtils.writeStringToFile(file, url + "\r\n", true);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
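The listings depend on three helper classes — Queue, LinkQueue and Constant — that this post never shows. What follows is a hypothetical reconstruction inferred purely from how the crawler calls them; the original project's versions may well differ, and the constant values are placeholders rather than the real ones.

// com/shangkang/yjw/manager/Queue.java -- reconstructed sketch
package com.shangkang.yjw.manager;

import java.util.LinkedList;

public class Queue<T> {
    private LinkedList<T> list = new LinkedList<T>();

    public void enQueue(T t) { list.addLast(t); }
    public T deQueue() { return list.removeFirst(); }
    public boolean isEmpty() { return list.isEmpty(); }
}

// com/shangkang/yjw/manager/LinkQueue.java -- reconstructed sketch
package com.shangkang.yjw.manager;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.FileUtils;

public class LinkQueue {
    private static Set<String> visitedUrls = new HashSet<String>();
    private static Queue<String> unVisitedUrls = new Queue<String>();
    private static Queue<String> unVisitedUrlNames = new Queue<String>();
    private static Set<String> valuedUrls = new HashSet<String>();

    public static void addUnvisitedUrl(String url) { unVisitedUrls.enQueue(url); }
    public static void addUnvisitedUrlName(String name) { unVisitedUrlNames.enQueue(name); }
    public static Queue<String> getUnVisitedUrlQueue() { return unVisitedUrls; }
    public static void addVisitedUrl(String url) { visitedUrls.add(url); }
    public static Set<String> getVisitedUrl() { return visitedUrls; }
    public static void addValuedUrl(String url, String site) { valuedUrls.add(url); }
    public static Set<String> getValuedUrls() { return valuedUrls; }

    // Append every collected URL to the given file, one per line.
    public static void flushContent2File(Set<String> urls, String filePath) {
        try {
            for (String url : urls) {
                FileUtils.writeStringToFile(new File(filePath), url + "\r\n", true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

// com/shangkang/yjw/util/Constant.java -- reconstructed sketch; the values
// are guesses inferred from the listings, not the project's originals.
package com.shangkang.yjw.util;

public class Constant {
    public static final String BASE_URL_XYWY = "http://club.xywy.com";
    public static final String SAVE_FILE_DIR = "d:/crawler-cust/";
    public static final String WWWXYWYCOM = "www.xywy.com";
}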
Step 3, crawling the second-level links, lives in a second file:
/**
* COPYRIGHT (C) 2010 LY. ALL RIGHTS RESERVED.
*
* No part of this publication may be reproduced, stored in a retrieval system,
* or transmitted, on any form or by any means, electronic, mechanical, photocopying,
* recording, or otherwise, without the prior written permission of 3KW.
*
* Created By: zzqiang
* Created On: 2013-6-18
*
* Amendment History:
*
* Amended By Amended On Amendment Description
* ------------ ----------- ---------------------------------------------
*
**/
package com.shangkang.pzf.xywy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

import com.shangkang.yjw.manager.LinkQueue;
import com.shangkang.yjw.util.Constant;

public class GetValuedLink {

    public static void main(String[] args) throws IOException {
        // Load the links collected by GetStartPoint
        List<String> urls = new ArrayList<String>();
        urls = FileUtils.readLines(new File(Constant.SAVE_FILE_DIR + "xywy_need_links.txt"));
        for (String url : urls) {
            System.out.println(url);
            LinkQueue.addUnvisitedUrl(url);
        }
        // Keep processing until the unvisited queue is drained; parsing a page
        // may enqueue its "next page" link, so the queue grows as we go
        while (!LinkQueue.getUnVisitedUrlQueue().isEmpty()) {
            String url = LinkQueue.getUnVisitedUrlQueue().deQueue();
            System.out.println("--------------------- processing url ===" + url);
            if (!LinkQueue.getVisitedUrl().contains(url)) {
                downloadFileAndParserLink(url);
                LinkQueue.addVisitedUrl(url);
            }
        }
        String filePath = Constant.SAVE_FILE_DIR + "valued_link_" + Constant.WWWXYWYCOM + ".txt";
        LinkQueue.flushContent2File(LinkQueue.getValuedUrls(), filePath);
    }

    public static void downloadFileAndParserLink(String startPoint) {
        String accessUrl = startPoint;
        // e.g. http://www.yp900.com/ZY-HXXT/index_2.htm
        //      http://www.yp900.com/ZY-HXXT/
        String urlEnd = startPoint.substring(startPoint.lastIndexOf("/") + 1);
        int lastPoint = startPoint.lastIndexOf("/");
        int lastLastPoint = startPoint.substring(0, lastPoint).lastIndexOf("/");
        String sonDir = startPoint.substring(lastLastPoint + 1, lastPoint);
        startPoint = startPoint.replace(urlEnd, "");
        // Derive a local file name from the last URL segment
        String fileName = urlEnd.equals("") ? sonDir : urlEnd.substring(0, urlEnd.lastIndexOf("."));
        HttpClient hc = null;
        String filePath = null;
        try {
            hc = new DefaultHttpClient();
            // Socket read timeout: 8 seconds
            hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);
            HttpGet httpGet = new HttpGet(accessUrl);
            HttpResponse response = hc.execute(httpGet);
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    InputStream is = entity.getContent();
                    filePath = Constant.SAVE_FILE_DIR + Constant.WWWXYWYCOM + "/"
                            + sonDir + "/" + fileName + ".htm";
                    System.out.println("save file path = " + filePath);
                    FileUtils.copyInputStreamToFile(is, new File(filePath));
                    System.out.println("file download success: source url = " + startPoint);
                }
            } else if (statusLine.getStatusCode() == 404) {
                System.err.println("http 404 :::" + startPoint);
            } else {
                System.err.println("http connect error");
            }
            if (null != filePath) {
                parserValuedLinkAndNextLink(filePath, startPoint);
                // Delete the downloaded file once it has been parsed
                System.out.println("-- deleting downloaded file --" + filePath);
                new File(filePath).delete();
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IllegalStateException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (hc != null) {
                hc.getConnectionManager().shutdown();
            }
        }
    }

    public static void parserValuedLinkAndNextLink(String filePath, String startPoint) {
        try {
            Parser parser = new Parser(filePath);
            // Match every <td class="pl20 w340"> cell
            NodeList nodeListTd = parser.extractAllNodesThatMatch(new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    return node.getText().startsWith("td class=\"pl20 w340\"");
                }
            });
            NodeList nodeListA = new NodeList();
            if (nodeListTd != null) {
                int size = nodeListTd.size();
                for (int i = 0; i < size; i++) {
                    Node tdNode = nodeListTd.elementAt(i);
                    NodeList nodes = tdNode.getChildren();
                    // Keep every <a> tag inside the matched cells
                    nodeListA.add(nodes.extractAllNodesThatMatch(new NodeFilter() {
                        @Override
                        public boolean accept(Node node) {
                            return node instanceof LinkTag;
                        }
                    }, true));
                }
            }
            System.out.println("------- extracting valued links --- start ----");
            int size = nodeListA.size();
            for (int i = 0; i < size; i++) {
                Node node = nodeListA.elementAt(i);
                if (node instanceof LinkTag) {
                    String link = ((LinkTag) node).extractLink();
                    // Keep only links whose URL contains "static"
                    if (link.indexOf("static") != -1) {
                        System.out.println("valued link = " + link);
                        LinkQueue.addValuedUrl(link, Constant.WWWXYWYCOM);
                    }
                }
            }
            System.out.println("------- extracting valued links --- end ---");
            System.out.println("------- extracting next-page link --- start ------");
            NodeList nextNodeList = new NodeList();
            parser = new Parser(filePath);
            // Match the pagination bar
            NodeList pageNumNodeList = parser.extractAllNodesThatMatch(new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    return node.getText().startsWith(
                            "div class=\"clearfix pageStyle tc mt20 pb20 f12 pagelink\"");
                }
            });
            int divSize = pageNumNodeList.size();
            String nextLink = null;
            for (int i = 0; i < divSize; i++) {
                Node divNode = pageNumNodeList.elementAt(i);
                nextNodeList = divNode.getChildren().extractAllNodesThatMatch(new NodeFilter() {
                    @Override
                    public boolean accept(Node node) {
                        if (node.getText().startsWith("a href=") && node instanceof LinkTag) {
                            LinkTag linkTag = (LinkTag) node;
                            String link = linkTag.extractLink();
                            String linkText = linkTag.getLinkText();
                            // "下一页" is the site's "next page" label; the literal
                            // must stay in Chinese to match the page text
                            if (linkText.contains("下一页") && link != null && !link.equals("")) {
                                return true;
                            }
                        }
                        return false;
                    }
                }, true);
            }
            if (null != nextNodeList && nextNodeList.size() > 0) {
                Node node = nextNodeList.elementAt(0);
                if (node instanceof LinkTag) {
                    LinkTag linkTag = (LinkTag) node;
                    nextLink = Constant.BASE_URL_XYWY + linkTag.extractLink();
                    System.out.println("found new download link: " + nextLink);
                    // Queue the next page so the crawl continues
                    LinkQueue.addUnvisitedUrl(nextLink);
                }
            }
            System.out.println("------- extracting next-page link --- end ----");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
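One note for anyone adapting this code: DefaultHttpClient and the CoreConnectionPNames parameters used above were deprecated in HttpClient 4.3. Below is a sketch of how the download step could be written against the 4.3+ API, with try-with-resources guaranteeing the connection is released even on error. This is a replacement technique I am suggesting, not part of the original project; DownloadSketch and its parameters are illustrative names.

import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class DownloadSketch {
    // Same download step as above, restated for HttpClient 4.3+.
    public static void download(String url, String savePath) throws IOException {
        // Replaces the CoreConnectionPNames.SO_TIMEOUT parameter
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(5000)
                .setConnectTimeout(5000)
                .build();
        try (CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(config).build()) {
            HttpGet get = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(get)) {
                if (response.getStatusLine().getStatusCode() == 200
                        && response.getEntity() != null) {
                    FileUtils.copyInputStreamToFile(
                            response.getEntity().getContent(), new File(savePath));
                }
            }
        }
    }
}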