抓取网站全站信息，并导出数据为EXCEL

xiangjinqi

浏览: 164623 次
性别:
来自: 武汉

最近访客更多访客>>

为了ta

jxjyzzc

无缘高富帅

tobylovewho

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java算法

现在以https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1为例

抓取这个站点的汽车信息

1、设置用于保存抓取信息的汽车对象类

package com.xiang;

import java.util.List;

/**
 * One scraped vehicle record, kept as an ordered list of text values instead of
 * named fields.
 *
 * <p>NOTE(review): an earlier draft of this class declared named fields
 * (manufacturer, series, model, engine code, kilowatt, horsepower, make time);
 * presumably the list positions follow that order - confirm against the code
 * that fills the list.
 */
public class CarInfo {

	/** Ordered values of this record; stays {@code null} until {@link #setCar(List)} is called. */
	List<String> car;

	/**
	 * Replaces the values of this record.
	 *
	 * @param car the new ordered values; the list is stored by reference, not copied
	 */
	public void setCar(final List<String> car) {
		this.car = car;
	}

	/**
	 * Returns the values of this record.
	 *
	 * @return the list most recently passed to {@link #setCar(List)}, or {@code null}
	 *         if none has been set
	 */
	public List<String> getCar() {
		return this.car;
	}

}

2、设置目录的类（包括子目录与父目录的关系）

package com.xiang;

import java.util.List;

/**
 * A catalogue category: an id, a display name and an optional list of child
 * categories, which lets parent and child categories be linked into a tree.
 *
 * <p>All three properties start out as {@code null}.
 */
public class CategoryAnther {

	/** Identifier of this category. */
	private String id;

	/** Display name of this category. */
	private String name;

	/** Child categories of this category; {@code null} until assigned. */
	private List<CategoryAnther> categoryAnther;

	/** @return the identifier, or {@code null} if none was set */
	public String getId() {
		return this.id;
	}

	/** @param id the identifier to store */
	public void setId(final String id) {
		this.id = id;
	}

	/** @return the display name, or {@code null} if none was set */
	public String getName() {
		return this.name;
	}

	/** @param name the display name to store */
	public void setName(final String name) {
		this.name = name;
	}

	/** @return the child categories, or {@code null} if none were set */
	public List<CategoryAnther> getCategoryAnther() {
		return this.categoryAnther;
	}

	/** @param categoryAnther the child categories; stored by reference, not copied */
	public void setCategoryAnther(final List<CategoryAnther> categoryAnther) {
		this.categoryAnther = categoryAnther;
	}

}

3、主程序抓取

package com.xiang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Crawls an online vehicle catalogue and appends the collected records, column by
 * column, to the text file {@code liufen.txt}.
 *
 * <p>The crawl has three levels: the first-level select box ({@code ktlg_01_mrksl}),
 * the second-level select box ({@code ktlg_01_mdrsl}) shown once a first-level value
 * is chosen, and the result table (element with {@code id="rahmen"}) shown once both
 * are chosen.
 *
 * <p>All row, cell and child offsets used while walking the parsed page are tied to
 * the exact markup of that site; they also depend on {@link #getHtmlByUrl(String)}
 * joining the page's lines without separators.
 */
public class ExportInfo {

	/** Connect and read timeout for page downloads, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 20000;

	/** Charset used to decode downloaded pages and handed to the HTML parser. */
	private static final String PAGE_CHARSET = "gb2312";

	/** Number of values per record written to the output file (list indices 0..6). */
	private static final int COLUMN_COUNT = 7;

	/**
	 * Crawls both catalogue entry pages and appends the result to {@code liufen.txt}.
	 *
	 * <p>The output is transposed: for each of the seven columns a header line
	 * {@code 开始写入N------} is written, followed by that column's value for every
	 * record, one per line. In column 2 the entity {@code &#160;} is replaced by a space.
	 * A record that lacks a value for a column contributes an empty line, so one short
	 * row no longer aborts the whole export.
	 *
	 * @param args unused
	 **/
	public static void main(String[] args) {

		System.out.println("main start-----------"+new Date());
		String url1 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1";
		String url2 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=2";

		// Each entry page is crawled with its OWN base URL. Previously the detail URLs
		// of categories discovered under url2 were built from url1.
		List<CarInfo> carInfo = new ArrayList<CarInfo>();
		carInfo.addAll(collectCars(url1));
		carInfo.addAll(collectCars(url2));

		FileWriter fw = null;
		try {
			// Append mode; FileWriter creates the file when it does not exist yet.
			fw = new FileWriter(new File("liufen.txt"), true);
			for (int col = 0; col < COLUMN_COUNT; col++) {
				fw.write("开始写入" + (col + 1) + "------\r\n");
				for (CarInfo record : carInfo) {
					String value = valueAt(record, col);
					if (col == 1) {
						value = value.replace("&#160;", " ");
					}
					fw.write(value + "\r\n");
				}
			}
			fw.flush();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close on every path so a failed write does not leak the file handle.
			if (fw != null) {
				try {
					fw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		System.out.println("main end-----------"+new Date());
	}

	/**
	 * Crawls every first-level/second-level category pair reachable from one entry page.
	 *
	 * @param baseUrl entry page URL; also the prefix of every detail URL built here
	 * @return all records found, in category order; never {@code null}
	 */
	private static List<CarInfo> collectCars(String baseUrl) {
		List<CarInfo> cars = new ArrayList<CarInfo>();
		for (CategoryAnther parent : addChildrenToList(baseUrl)) {
			List<CategoryAnther> children = parent.getCategoryAnther();
			if (children == null) {
				continue;
			}
			for (CategoryAnther child : children) {
				String _url = baseUrl+"&ktlg_01_mrksl="+parent.getId()+"&ktlg_01_mdrsl="+child.getId();
				cars.addAll(getDataByUrl(parent.getName(), child.getName(), _url));
			}
		}
		return cars;
	}

	/**
	 * Returns one column of a record, or an empty string when the record has no
	 * (non-null) value at that position.
	 */
	private static String valueAt(CarInfo record, int col) {
		List<String> values = record.getCar();
		if (values == null || col >= values.size() || values.get(col) == null) {
			return "";
		}
		return values.get(col);
	}

	/**
	 * Downloads a page and returns its text decoded as gb2312.
	 *
	 * <p>Lines are concatenated WITHOUT any separator. The parsing code in this class
	 * addresses nodes by child index, so adding line breaks here would shift those
	 * indices.
	 *
	 * @param url address of the page to download
	 * @return the page text; on any failure the stack trace is printed and whatever was
	 *         read so far (possibly the empty string) is returned - never {@code null}
	 */
	public static String getHtmlByUrl(String url){
		StringBuilder html = new StringBuilder();
		InputStream inputStream = null;
		try {
			URLConnection urlConnection = new URL(url).openConnection();
			// Bound the connect phase as well; otherwise an unreachable host can block forever.
			urlConnection.setConnectTimeout(TIMEOUT_MILLIS);
			urlConnection.setReadTimeout(TIMEOUT_MILLIS);
			inputStream = urlConnection.getInputStream();
			BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, PAGE_CHARSET));
			String line;
			while ((line = in.readLine()) != null) {
				html.append(line);
			}
		} catch (Exception e) {
			// Deliberately broad: callers rely on this method never throwing.
			e.printStackTrace();
		} finally {
			if (inputStream != null) {
				try {
					inputStream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return html.toString();
	}

	/**
	 * Downloads one result page and turns its table rows into records.
	 *
	 * <p>Each record starts with {@code firstName} and {@code secondName}, followed by
	 * the link text found in every third cell of the row (starting at child index 2).
	 * Rows are taken from child index 6 onwards in steps of two; these offsets are
	 * specific to the page layout (presumably they skip header rows and the text nodes
	 * between tags - not verifiable from this code alone).
	 *
	 * <p>A cell without the expected link contributes an empty string so the remaining
	 * columns stay aligned. Children that are not table rows are skipped.
	 *
	 * @param firstName  first-level category name, stored at position 0 of every record
	 * @param secondName second-level category name, stored at position 1 of every record
	 * @param url        address of the result page
	 * @return the records found; empty when the page could not be loaded or parsed,
	 *         never {@code null}
	 */
	public static List<CarInfo> getDataByUrl(String firstName,String secondName,String url){
		System.out.println("getDataByUrl start-----------"+new Date());
		List<CarInfo> carInfoList = new ArrayList<CarInfo>();
		NodeList trlist = findResultRows(getHtmlByUrl(url));
		if (trlist == null) {
			System.err.println("getDataByUrl: no result table found at " + url);
		} else {
			for (int i = 6; i < trlist.size(); i += 2) {
				if (!(trlist.elementAt(i) instanceof TableRow)) {
					continue;
				}
				NodeList tdlist = trlist.elementAt(i).getChildren();
				if (tdlist == null) {
					continue;
				}
				List<String> trInfo = new ArrayList<String>();
				trInfo.add(firstName);
				trInfo.add(secondName);
				for (int j = 2; j < tdlist.size(); j += 3) {
					trInfo.add(linkTextAt(tdlist, j));
				}
				CarInfo carInfo = new CarInfo();
				carInfo.setCar(trInfo);
				System.out.println(trInfo.get(0));
				carInfoList.add(carInfo);
			}
		}
		System.out.println("getDataByUrl end-----------"+new Date());
		return carInfoList;
	}

	/**
	 * Locates the row nodes of the result table: the children of the second child of
	 * the first element whose {@code id} is {@code rahmen}.
	 *
	 * @return the row nodes, or {@code null} when any step of that path is missing
	 */
	private static NodeList findResultRows(String html) {
		NodeList matches = extractByAttribute(html, "id", "rahmen");
		if (matches == null || matches.size() == 0) {
			return null;
		}
		NodeList tablelist = matches.elementAt(0).getChildren();
		if (tablelist == null || tablelist.size() < 2) {
			return null;
		}
		return tablelist.elementAt(1).getChildren();
	}

	/**
	 * Returns the link text of one table cell, or an empty string when the cell or
	 * its link is not where the page layout normally puts it.
	 *
	 * <p>The link is child 1 of the cell at index 2 and child 2 of every other cell.
	 */
	private static String linkTextAt(NodeList tdlist, int j) {
		if (!(tdlist.elementAt(j) instanceof TableColumn)) {
			return "";
		}
		NodeList alist = tdlist.elementAt(j).getChildren();
		int linkIndex = (j == 2) ? 1 : 2;
		if (alist == null || linkIndex >= alist.size()
				|| !(alist.elementAt(linkIndex) instanceof LinkTag)) {
			return "";
		}
		return ((LinkTag) alist.elementAt(linkIndex)).getLinkText();
	}

	/**
	 * Parses {@code html} and returns every node carrying the given attribute value.
	 *
	 * @return the matching nodes, or {@code null} when parsing failed (the stack trace
	 *         is printed)
	 */
	private static NodeList extractByAttribute(String html, String attribute, String value) {
		Parser parser = Parser.createParser(html, PAGE_CHARSET);
		NodeFilter filter = new HasAttributeFilter(attribute, value);
		try {
			return parser.extractAllNodesThatMatch(filter);
		} catch (ParserException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Reads the first-level categories of an entry page and attaches to each of them
	 * its second-level categories.
	 *
	 * @param url entry page URL
	 * @return the first-level categories with their children set; never {@code null}
	 */
	public static List<CategoryAnther> addChildrenToList(String url){
		System.out.println("addChildrenToList start-----------"+new Date());
		List<CategoryAnther> firstCategrory = getFirstPageCategoryIds(url,"ktlg_01_mrksl");
		for (CategoryAnther parent : firstCategrory) {
			String _url = url + "&ktlg_01_mrksl=" + parent.getId();
			// Parse the second-level categories shown for this first-level choice.
			parent.setCategoryAnther(getFirstPageCategoryIds(_url,"ktlg_01_mdrsl"));
		}
		System.out.println("addChildrenToList end-----------"+new Date());
		return firstCategrory;
	}

	/**
	 * Downloads a page and reads the options of the first element whose {@code name}
	 * attribute equals {@code nameValue}.
	 *
	 * <p>The child at index 0 is always skipped (presumably a placeholder entry -
	 * confirm against the page). Children that are not option tags are skipped as well.
	 *
	 * @param url       address of the page holding the select box
	 * @param nameValue value of the {@code name} attribute to look for
	 * @return one category per option (id = {@code value} attribute, name = inner HTML);
	 *         empty when the page could not be loaded or has no such element, never
	 *         {@code null}
	 */
	public static List<CategoryAnther> getFirstPageCategoryIds(String url,String nameValue) {
		System.out.println("getFirstPageCategoryIds start-----------"+new Date());
		List<CategoryAnther> categorys = new ArrayList<CategoryAnther>();
		NodeList matches = extractByAttribute(getHtmlByUrl(url), "name", nameValue);
		NodeList optionList = null;
		if (matches != null && matches.size() > 0) {
			optionList = matches.elementAt(0).getChildren();
		}
		if (optionList == null) {
			System.err.println("getFirstPageCategoryIds: no element named " + nameValue + " at " + url);
		} else {
			for (int i = 1; i < optionList.size(); i++) {
				if (!(optionList.elementAt(i) instanceof OptionTag)) {
					continue;
				}
				OptionTag option = (OptionTag) optionList.elementAt(i);
				CategoryAnther categoryAnther = new CategoryAnther();
				categoryAnther.setId(option.getAttribute("value"));
				categoryAnther.setName(option.getChildrenHTML());
				categorys.add(categoryAnther);
			}
		}
		System.out.println("getFirstPageCategoryIds end-----------"+new Date());
		return categorys;
	}

	/**
	 * Copies a text file line by line into {@code fw}, terminating every line with
	 * {@code \r\n}.
	 *
	 * <p>Blank lines and lines containing {@code -} are copied unchanged; every other
	 * line gets {@code →} appended. I/O errors are printed and end the copy.
	 * The writer is neither flushed nor closed here.
	 *
	 * @param fileName path of the file to read (decoded with the platform charset)
	 * @param fw       destination writer
	 */
	public static void readFileByLines(String fileName,FileWriter fw) {
		BufferedReader reader = null;
		try {
			System.out.println("以行为单位读取文件内容，一次读一整行：");
			reader = new BufferedReader(new FileReader(new File(fileName)));
			String tempString;
			// readLine() returns null at end of file.
			while ((tempString = reader.readLine()) != null) {
				boolean copyUnchanged = tempString.trim().equals("") || tempString.indexOf("-") > -1;
				if (copyUnchanged) {
					fw.write(tempString+"\r\n");
				} else {
					fw.write(tempString+"→"+"\r\n");
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e1) {
					e1.printStackTrace();
				}
			}
		}
	}
}

如有不懂之处，请致电 13886053422 或联系 QQ 526151410

exportinfo.rar (664.7 KB)
下载次数: 4

分享到：

导出EXCEL常用工具类 | File的读取和写入操作 java

2011-11-18 13:21
浏览 955
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

抓取网站全站信息，并导出数据为EXCEL

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

抓取网站全站信息，并导出数据为EXCEL

评论

发表评论

相关推荐

机器学习----逻辑回归

通用广告推荐，团购推荐，商户推荐，商品推荐数据算法框架

算法程序-通过log重现计算过程

机器学习算法模型（一）

商品名匹配算法

求点到曲线的最短距离 垂直逼近算法

牛顿迭代--高斯方程求解

价格弹性指数 价格与销量模型

float 导致的计算精确度问题

价格与销量的关系 JAVA实现该算法 最小二乘法

对URL非法字符进行转义

JAVA 读取CSV文件

POI 导出EXCEL（JAVA 应用程序非WEB导出）

JSP 无模板导出功能实现

lucene 分词解析器 将商品名全部切成各种词方便匹配

导出EXCEL常用工具类

File的读取和写入操作 java

排序算法，从大到小

正则表达式提取特定字符串内的特定内容

用递归算法查找父节点下的所有叶子节点

最近访客更多访客>>

求点到曲线的最短距离垂直逼近算法

价格弹性指数价格与销量模型

价格与销量的关系 JAVA实现该算法最小二乘法

lucene 分词解析器将商品名全部切成各种词方便匹配