`
wangshu3000
  • 浏览: 131384 次
  • 性别: Icon_minigender_1
  • 来自: 大连
社区版块
存档分类
最新评论

[小代码]蜘蛛爬虫,抓取某网站所有图片文章中的图片~

阅读更多
为朋友的网站写了个小代码,把所有图片下载到本地,有点不道德了,哈哈。。。
package com.ai.picpicker;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.htmlparser.util.ParserException;

/**
 * Entry point of the picture crawler.
 *
 * <p>Starts one {@link PickerThread} per list page in the range
 * {@code startPage..pageNum} (1-based, inclusive), waits for all of them and
 * prints the total number of files they report via {@code getCount()}.
 */
public class PicPicker {
	// Not referenced by the code in this class; kept because they are public constants.
	public static final String siteUrlPrefix = "http://www.****.com/a********";
	public static final String siteUrlSuffix = ".html";
	public static final int pageNum = 4;// Sum 20 pages.
	/** 1-based number of the first list page to crawl (previously a literal index 3 in both loops). */
	public static final int startPage = 4;
	public static final int startCategory = 1;

	/**
	 * Launches the worker threads and prints the accumulated download count.
	 *
	 * @param args ignored
	 * @throws InterruptedException if the main thread is interrupted while waiting for a worker
	 */
	public static void main(String args[]) throws ParserException, HttpException, IOException, InterruptedException {
		MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
		HttpClient httpClient = new HttpClient(connectionManager);

		int firstIndex = startPage - 1;
		PickerThread[] ppt = new PickerThread[pageNum];

		// Start every worker before waiting on any of them; joining inside this
		// loop would force the pages to be processed strictly one after another.
		for (int p = firstIndex; p < pageNum; p++) {
			ppt[p] = new PickerThread(httpClient, p + 1, startCategory);
			ppt[p].start();
			System.out.println("Thread " + (p + 1) + " Started~~");
		}

		// join() before getCount() so each counter is read only after its thread has finished.
		int picCount = 0;
		for (int p = firstIndex; p < pageNum; p++) {
			ppt[p].join();
			picCount = picCount + ppt[p].getCount();
		}
		System.out.println("All downloaded file num:" + picCount);
	}
}


package com.ai.picpicker;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * Worker thread that crawls one list page.
 *
 * <p>The list page is parsed for {@code h4} headings; the first child of each
 * heading is taken as a link to a category page. Each category page is
 * fetched, the text between its first {@code start} and first {@code end}
 * occurrence is scanned for {@code "file"} / {@code "pic"} entries, and every
 * picture found is saved under a directory named {@code P<page>_C<category>/}.
 * Progress is written to a log file named {@code log4thread<page>}.
 */
public class PickerThread extends Thread {

	private HttpClient httpClient = null;
	private int pageNum = 0;
	private int picCountT = 0;
	private int startCategory = 0;
	// Picture path currently being processed; reported by run() when an exception aborts the crawl.
	private String picStr = null;

	/**
	 * @param httpClient    client used for all category-page and picture requests
	 * @param pageNum       number of the list page this thread handles (used in the URL, log name and directory names)
	 * @param startCategory 1-based number of the first category to process; stored as a 0-based index
	 */
	public PickerThread(HttpClient httpClient, int pageNum, int startCategory) {
		this.httpClient = httpClient;
		this.pageNum = pageNum;
		this.startCategory = startCategory - 1;
	}

	/**
	 * @return number of pictures this thread has saved so far
	 */
	public int getCount() {
		return picCountT;
	}

	/**
	 * Crawls the list page and downloads the pictures of every category from
	 * {@code startCategory} on. Any exception ends the crawl for this thread:
	 * it is printed together with the picture path being processed and is not
	 * rethrown.
	 */
	@Override
	public void run() {
		System.out.println("Thread " + pageNum + " Running~~");
		File forLogFile = new File("log4thread" + pageNum);
		FileWriter fwl = null;
		try {
			Parser parser = new Parser("http://www.******.com/a*******" + pageNum + ".html");
			fwl = new FileWriter(forLogFile);
			NodeList nodelist = parser.parse(null);
			NodeFilter categoryFilter = new TagNameFilter("h4");
			NodeList categoryList = nodelist.extractAllNodesThatMatch(categoryFilter, true);
			for (int i = startCategory; i < categoryList.size(); i++) {
				HeadingTag ht = (HeadingTag) categoryList.elementAt(i);
				LinkTag lt = (LinkTag) ht.getChild(0);
				String oneUrl = lt.getLink();
				fwl.write("[INFO]" + "Category Num" + i + " Downloading! Url:" + oneUrl + "\n");
				String picListHtml = fetchCategoryPage(fwl, oneUrl);
				if (picListHtml != null) {
					downloadPictures(fwl, i, picListHtml);
					// Terminate the entry so it does not run into the next log line.
					fwl.write("[DEBUG]" + oneUrl + "\n");
					fwl.flush();
				}
				System.out.println();
				fwl.flush();
			}
			fwl.write("[INFO]" + "Thread " + pageNum + " run over " + picCountT + "pic!!!\n");
			fwl.flush();
		} catch (Exception e) {
			System.out.println("Thread " + pageNum + " Exception!!! PicUrl:" + picStr);
			e.printStackTrace();
		} finally {
			try {
				if (fwl != null)
					fwl.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}

	}

	/**
	 * Fetches one category page and logs the outcome.
	 *
	 * @param fwl         log writer
	 * @param categoryUrl URL of the category page
	 * @return the trimmed page body decoded as UTF-8, or {@code null} when the response status is not 200
	 * @throws IOException if the request or the log write fails
	 */
	private String fetchCategoryPage(FileWriter fwl, String categoryUrl) throws IOException {
		GetMethod getPageMethod = new GetMethod(categoryUrl);
		try {
			int statusCode = httpClient.executeMethod(getPageMethod);
			if (statusCode != HttpStatus.SC_OK) {
				fwl.write("[ERROR]" + "Method failed: " + getPageMethod.getStatusLine() + "\n");
				return null;
			}
			fwl.write("[INFO]" + "Page" + pageNum + "_" + getPageMethod.getStatusLine() + "\n");
			fwl.flush();
			byte[] pageBody = getPageMethod.getResponseBody();
			return new String(pageBody, "UTF-8").trim();
		} finally {
			// Release on every path (non-OK status and exceptions too); otherwise
			// the connection is never handed back to the connection manager.
			getPageMethod.releaseConnection();
		}
	}

	/**
	 * Walks the {@code "file"} / {@code "pic"} entries of a category page and
	 * saves every picture that does not exist locally yet.
	 *
	 * <p>A page without the {@code start} / {@code end} markers, or an entry
	 * whose {@code "pic"} key does not follow its {@code "file"} key, makes
	 * {@code substring} throw; that exception propagates to {@link #run()}.
	 *
	 * @param fwl           log writer
	 * @param categoryIndex 0-based category index (the directory name uses index + 1)
	 * @param picListHtml   body of the category page
	 * @throws IOException if a download, a file write or a log write fails
	 */
	private void downloadPictures(FileWriter fwl, int categoryIndex, String picListHtml) throws IOException {
		String picSubStr = picListHtml.substring(picListHtml.indexOf("start"), picListHtml.indexOf("end"));
		int fileIdx;
		while ((fileIdx = picSubStr.indexOf("\"file\"")) != -1) {
			int picIdx = picSubStr.indexOf("\"pic\"");
			// The value sits between the "file" key (+8) and the "pic" key (-2); escaping backslashes are dropped.
			picStr = picSubStr.substring(fileIdx + 8, picIdx - 2).replace("\\", "");

			StringBuilder sb = new StringBuilder();
			sb.append("P").append(pageNum).append("_C").append(categoryIndex + 1).append("/");
			File dir = new File(sb.toString());
			if (!dir.exists()) {
				dir.mkdir();
			}
			// Local file name: everything after the first '/' found at or beyond index 16 of the picture path.
			// NOTE(review): this name comes from the remote page and is used unsanitized — confirm that is acceptable.
			sb.append(picStr.substring(picStr.indexOf("/", 16) + 1, picStr.length()));
			File picFile = new File(sb.toString());
			if (picFile.exists()) {
				fwl.write("[ERROR]" + "Duplication picture! FileName:" + sb.toString() + "\n");
			} else {
				savePicture(fwl, picFile);
			}

			// Skip past this entry. Because the substring above succeeded, picIdx is
			// at least fileIdx + 10, so it is the same index a search from offset 7 finds.
			picSubStr = picSubStr.substring(picIdx + 7, picSubStr.length());
		}
	}

	/**
	 * Downloads the picture named by {@code picStr} into {@code picFile} and
	 * increments the picture counter on success. A non-200 response is logged
	 * and skipped.
	 *
	 * @param fwl     log writer
	 * @param picFile destination file
	 * @throws IOException if the request, the file write or the log write fails
	 */
	private void savePicture(FileWriter fwl, File picFile) throws IOException {
		GetMethod getPicMethod = new GetMethod("http://www.******.com/" + picStr);
		byte[] picBody;
		try {
			int statusCode = httpClient.executeMethod(getPicMethod);
			if (statusCode != HttpStatus.SC_OK) {
				fwl.write("[ERROR]" + "Method failed: " + " URL:" + "http://www.********.com/" + picStr
						+ getPicMethod.getStatusLine() + "\n");
				return;
			}
			picBody = getPicMethod.getResponseBody();
		} finally {
			// Always hand the connection back, including on failure paths.
			getPicMethod.releaseConnection();
		}

		FileOutputStream picOutPut = new FileOutputStream(picFile);
		try {
			picOutPut.write(picBody);
		} finally {
			// Close even when the write fails so the file handle is not leaked.
			picOutPut.close();
		}
		fwl.write("[INFO]" + "Pic" + picCountT++ + " URL:" + "http://www.*********.com/" + picStr + "\n");
		fwl.flush();
		System.out.print('.');
	}
}

2
6
分享到:
评论
2 楼 wangshu3000 2012-01-03  
greatghoul 写道
很不错,我也经常抓取些图片,当然都是**网站的,如果只是抓取图片,建议使用正则,会快很多。
还有这个run方法大了点儿,可以再分离下。。。

不好意思,完全没考虑设计, 就是实现功能,一次性的。呵呵。所以也不顾什么设计模式,什么代码结构了。实现功能就OK了。。
1 楼 greatghoul 2012-01-03  
很不错,我也经常抓取些图片,当然都是**网站的,如果只是抓取图片,建议使用正则,会快很多。
还有这个run方法大了点儿,可以再分离下。。。

相关推荐

Global site tag (gtag.js) - Google Analytics