`
rensanning
  • 浏览: 3514149 次
  • 性别: Icon_minigender_1
  • 来自: 大连
博客专栏
Efef1dba-f7dd-3931-8a61-8e1c76c3e39f
使用Titanium Mo...
浏览量:37479
Bbab2146-6e1d-3c50-acd6-c8bae29e307d
Cordova 3.x入门...
浏览量:604323
C08766e7-8a33-3f9b-9155-654af05c3484
常用Java开源Libra...
浏览量:678071
77063fb3-0ee7-3bfa-9c72-2a0234ebf83e
搭建 CentOS 6 服...
浏览量:87257
E40e5e76-1f3b-398e-b6a6-dc9cfbb38156
Spring Boot 入...
浏览量:399816
Abe39461-b089-344f-99fa-cdfbddea0e18
基于Spring Secu...
浏览量:69067
66a41a70-fdf0-3dc9-aa31-19b7e8b24672
MQTT入门
浏览量:90474
社区版块
存档分类
最新评论

抓取 开发者头条 分享的所有文章

 
阅读更多
使用 HttpClient 和 jsoup 抓取开发者头条中分享的所有文章(截至目前15000多条)。

数据:点击下载
代码:点击下载




/**
 * Crawls the per-day archive pages under {@code http://toutiao.io/prev/} and appends one CSV
 * row per link found on each page to a local file.
 *
 * <p>Each row has the columns: date, 1-based position of the link on the page, link text,
 * redirect target of the link (tracking parameters stripped), and the link's own href.
 */
public class ToutiaoArticles {

	public static void main(String[] args) {
		new ToutiaoArticles().fetch();
	}

	/**
	 * Fetches every daily page from 2014-09-27 up to and including today and appends the
	 * extracted rows to {@code D://data.csv} (GBK encoded, CRLF line endings).
	 *
	 * <p>A failure on one day is reported and skipped so the remaining days are still crawled.
	 */
	public void fetch() {
		LocalDate startDate = new LocalDate(2014, 9, 27);
		LocalDate endDate = LocalDate.now();
		File outputFile = new File("D://data.csv");
		String baseUrl = "http://toutiao.io/prev/";

		PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
		mgr.setMaxTotal(5);
		mgr.setDefaultMaxPerRoute(5);
		HttpClient httpClient = HttpClientBuilder.create().setConnectionManager(mgr).build();

		try {
			for (LocalDate day = startDate; !day.isAfter(endDate); day = day.plusDays(1)) {
				String date = day.toString("yyyy-MM-dd");
				String url = baseUrl + date;
				System.out.println("[URL]-----" + url);

				HttpGet httpGet = new HttpGet(url);
				try {
					List<Link> linkInfos = httpClient.execute(httpGet, new PageResponseHandler());
					if (linkInfos != null) {
						StringBuilder articleInfos = new StringBuilder();
						for (int i = 0; i < linkInfos.size(); i++) {
							Link k = linkInfos.get(i);
							String data = date + "," + (i + 1)
									+ "," + csvField(k.getTitle())
									+ "," + csvField(k.getOriginLink())
									+ "," + csvField(k.getLink());
							System.out.println(data);
							articleInfos.append(data).append("\r\n");
						}
						FileUtils.writeStringToFile(outputFile, articleInfos.toString(), "GBK", true);
					}
				} catch (Exception e) {
					// Best effort: report the failing day and continue with the next one.
					System.err.println("[ERROR]-----" + url);
					e.printStackTrace();
				} finally {
					httpGet.releaseConnection();
				}
			}
		} finally {
			// Release the pooled connections once the crawl is over.
			mgr.shutdown();
		}
	}

	/**
	 * Renders a value as a single CSV field.
	 *
	 * <p>Values containing a comma, double quote, CR or LF are wrapped in double quotes with
	 * embedded quotes doubled, so they cannot shift the columns of a row. A {@code null}
	 * value becomes an empty field instead of the literal text {@code null}.
	 *
	 * @param value the raw field value, may be {@code null}
	 * @return the value ready to be joined with commas
	 */
	private static String csvField(String value) {
		if (value == null) {
			return "";
		}
		boolean needsQuoting = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
				|| value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
		if (!needsQuoting) {
			return value;
		}
		return "\"" + value.replace("\"", "\"\"") + "\"";
	}

	/**
	 * Turns the HTML of one daily page into a list of {@link Link}s, one per element that
	 * carries {@code target="_blank"}, and looks up the redirect target of each link.
	 */
	class PageResponseHandler implements ResponseHandler<List<Link>> {

		/**
		 * @param response the response for a daily page
		 * @return the links found on the page, or {@code null} when the status code is 300 or
		 *         above or the response has no body
		 */
		@Override
		public List<Link> handleResponse(HttpResponse response) throws ClientProtocolException, IOException {

			HttpEntity entity = response.getEntity();

			if (response.getStatusLine().getStatusCode() >= 300) {
				EntityUtils.consume(entity);
				return null;
			}

			if (entity == null) {
				return null;
			}

			// Fall back to UTF-8 when the response declares no charset; the single-argument
			// overload would fall back to ISO-8859-1 instead.
			String html = EntityUtils.toString(entity, "UTF-8");
			Document document = Jsoup.parse(html);
			Elements links = document.getElementsByAttributeValue("target", "_blank");

			// Separate client with redirects disabled so the 302 Location header stays visible.
			RequestConfig requestConfig = RequestConfig.custom().setRedirectsEnabled(false).build();
			PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
			mgr.setMaxTotal(5);
			mgr.setDefaultMaxPerRoute(5);
			HttpClient httpClient = HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).setConnectionManager(mgr).build();

			List<Link> linkInfos = new ArrayList<Link>(links.size());
			try {
				for (int i = 0; i < links.size(); i++) {
					Link lk = new Link();
					lk.setLink(links.get(i).attr("href"));
					lk.setTitle(links.get(i).text());

					String origin = resolveOriginLink(httpClient, lk.getLink());
					if (origin != null) {
						lk.setOriginLink(origin);
					}

					linkInfos.add(lk);
				}
			} finally {
				// This pool is created per page, so it must be released per page as well.
				mgr.shutdown();
			}

			return linkInfos;
		}

		/**
		 * Requests {@code link} without following redirects and returns the {@code Location}
		 * of a 302 answer with the toutiao.io tracking parameters removed.
		 *
		 * @param httpClient client configured not to follow redirects
		 * @param link the href to probe
		 * @return the cleaned redirect target, or {@code null} when the answer is not a 302,
		 *         has no {@code Location} header, or the request fails
		 */
		private String resolveOriginLink(HttpClient httpClient, String link) {
			HttpGet httpGet = null;
			try {
				// Constructed inside the try so a malformed href only skips this lookup.
				httpGet = new HttpGet(link);
				HttpResponse httpResponse = httpClient.execute(httpGet);
				if (httpResponse.getStatusLine().getStatusCode() != 302
						|| !httpResponse.containsHeader("Location")) {
					return null;
				}
				String loc = httpResponse.getLastHeader("Location").getValue();
				if (loc == null) {
					return null;
				}
				// Literal replacement: the dots must not act as regex wildcards.
				loc = loc.replace("hmsr=toutiao.io", "");
				loc = loc.replace("&utm_medium=toutiao.io", "");
				loc = loc.replace("&utm_source=toutiao.io", "");
				return loc;
			} catch (Exception e) {
				// Best effort: the row is still written, just without an origin link.
				System.err.println("[ERROR]-----" + link);
				e.printStackTrace();
				return null;
			} finally {
				if (httpGet != null) {
					httpGet.releaseConnection();
				}
			}
		}

	}

	/** One link taken from a daily page. */
	class Link {
		/** Text of the link element. */
		private String title;
		/** The {@code href} of the link element as found on the page. */
		private String link;
		/** Redirect target of {@link #link}; {@code null} when none was resolved. */
		private String originLink;

		public String getTitle() {
			return title;
		}
		public void setTitle(String title) {
			this.title = title;
		}
		public String getLink() {
			return link;
		}
		public void setLink(String link) {
			this.link = link;
		}
		public String getOriginLink() {
			return originLink;
		}
		public void setOriginLink(String originLink) {
			this.originLink = originLink;
		}
	}

}
  • 大小: 47.8 KB
  • 大小: 26.5 KB
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics