`
残缺的完美
  • 浏览: 42240 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

使用htmlparser抓取阿里巴巴上宁波企业的详细资料

阅读更多

     最近无聊,写了个小程序抓取阿里巴巴上企业的详细信息,页面用 htmlparser 解析。

不多说了,直接上代码;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;



public class TestYehoo {
 /**
  * Entry point: crawls a fixed range of Alibaba company-search result pages.
  *
  * @param args command-line arguments (not used)
  * @throws Exception propagated unchanged from {@code getAlibabaCompanyInfo}
  */
 public static void main(String[] args) throws Exception
    {
  // First and last result page to fetch; the callee treats both as inclusive page numbers.
  final int firstPage = 488;
  final int lastPage = 500;

  // Search URL prefix; the callee appends the page number after "begin_page=".
  final String searchUrlPrefix = "http://search.china.alibaba.com/search/company_search.htm?province=%D5%E3%BD%AD&city=%C4%FE%B2%A8&filt=y&begin_page=";

  getAlibabaCompanyInfo(searchUrlPrefix, firstPage, lastPage);
  System.out.println("====================================");
    }

    /**
     * Crawls Alibaba company-search result pages and stores the companies found on each page.
     *
     * <p>For every page number in {@code from..to} (both inclusive) the page at
     * {@code content + pageNumber} is parsed, every {@code A} tag with {@code class="l"} is
     * treated as one company, its profile and contact pages are scraped into a
     * {@code QyxxDomain}, and the companies of that page are passed to {@code doInsertDB}.
     *
     * <p>A company is logged and skipped (it does not stop the crawl) when its link cannot be
     * parsed or when scraping its profile or contact page throws.
     *
     * @param content search URL prefix; the page number is appended to it
     * @param from    first page number to fetch
     * @param to      last page number to fetch (inclusive)
     * @throws Exception if a result page itself cannot be parsed or the database insert fails
     */
 public static void getAlibabaCompanyInfo(String content,int from,int to) throws Exception
    {
     final String hrefMarker = "href=\"";
     final String schemePrefix = "http://";
     final String hostSuffix = ".cn.alibaba.com/";

     for(int j=from-1;j<to;j++)
     {
       final int pageNo = j+1;
       System.out.println("=========开始取第"+pageNo+"页数据=============");
       List<QyxxDomain> list = new ArrayList<QyxxDomain>();
       Parser myParser = new Parser(content+pageNo);

       // Company links: <a class="l" href="http://<shop>.cn.alibaba.com/">name</a>
       AndFilter companyNameFilter = new AndFilter();
       companyNameFilter.setPredicates(new NodeFilter[]{
         new TagNameFilter("A"), new HasAttributeFilter("class","l")});
       Node[] nodes = myParser.parse(companyNameFilter).toNodeArray();

       for(int i=0;i<nodes.length;i++){
         Node anode = nodes[i];
         // Running record id; assumes 30 results per search page.
         int num = (j*30+i+1);
         String name = anode.toPlainTextString();
         System.out.println(num);
         System.out.println("公司名:"+name);

         // Pull the href value out of the raw tag text. The closing '/"' is searched
         // only after 'href="' so an earlier attribute cannot be mistaken for it.
         String text = anode.getText();
         int hrefAt = text.indexOf(hrefMarker);
         int hrefStart = hrefAt + hrefMarker.length();
         int hrefEnd = (hrefAt < 0) ? -1 : text.indexOf("/\"", hrefStart);
         if(hrefEnd < 0){
           System.out.println("无法解析"+name.trim()+"的网址,已跳过!");
           continue;
         }
         String website = text.substring(hrefStart, hrefEnd+1);
         System.out.println("网址:"+website);

         // The shop id is the sub-domain between "http://" and ".cn.alibaba.com/".
         int schemeAt = website.indexOf(schemePrefix);
         int hostEnd = website.indexOf(hostSuffix);
         if(schemeAt < 0 || hostEnd < schemeAt + schemePrefix.length()){
           System.out.println("无法解析"+name.trim()+"的网址,已跳过!");
           continue;
         }
         String shopId = website.substring(schemeAt + schemePrefix.length(), hostEnd);
         String url = website+"athena/companyprofile/"+shopId+".html";
         String url1 = website+"athena/contact/"+shopId+".html";

         QyxxDomain domain = new QyxxDomain();
         domain.setQymc(name.trim());
         domain.setId(String.valueOf(num));
         try{
           getCompanyDetail(url,domain);
         }catch (Exception e) {
           System.out.println("取"+domain.getQymc()+"详细信息时发生错误!");
           e.printStackTrace();
           continue;
         }
         try{
           getCompanyContact(url1,domain);
         }catch (Exception e) {
           System.out.println("取"+domain.getQymc()+"联系信息时发生错误!");
           e.printStackTrace();
           continue;
         }

         list.add(domain);
       }
       System.out.println("=========结束取第"+pageNo+"页数据=============");
       System.out.println("=========开始插入第"+pageNo+"页数据=============");
       doInsertDB(list);
       System.out.println("=========结束插入第"+pageNo+"页数据=============");
     }

    }

    /**
     * Counts the rows of table {@code qyxx} whose {@code id} equals the given value.
     *
     * <p>Opens its own MySQL connection for the query and closes the result set,
     * statement and connection before returning, also when the query fails.
     *
     * @param id record id to look up
     * @return the count as returned by the database, as a string; {@code ""} if the
     *         query yields no row
     * @throws Exception if the driver cannot be loaded or a database error occurs
     */
    private static String countRecord(int id)throws Exception{
  // JDBC driver class name
  String driverName="com.mysql.jdbc.Driver";
  // database user
  String userName="root";
  // database password
  String userPasswd="root";
  // database (schema) name
  String dbName="qiyetong";
  // connection string
  String url="jdbc:mysql://localhost/"+dbName+"?user="+userName+"&password="+userPasswd;

  Class.forName(driverName).newInstance();

  String sql = "select count(*) as count from qyxx where id="+id;
  String count="";

  // Nested try/finally so every JDBC resource is released in reverse order of creation.
  Connection connection=DriverManager.getConnection(url);
  try{
   Statement statement = connection.createStatement();
   try{
    ResultSet rs = statement.executeQuery(sql);
    try{
     while(rs.next()){
      count=rs.getString("count");
     }
    }finally{
     rs.close();
    }
   }finally{
    statement.close();
   }
  }finally{
   connection.close();
  }

  return count;

    }

 /**
  * Inserts every scraped company in {@code list} into table {@code qyxx}, one
  * INSERT statement per element.
  *
  * <p>A row that fails is reported (with its stack trace) and skipped; the remaining
  * rows are still attempted. The statement and connection are closed before returning,
  * also when an error occurs.
  *
  * @param list {@code QyxxDomain} objects to store
  * @throws InstantiationException if the JDBC driver class cannot be instantiated
  * @throws IllegalAccessException if the JDBC driver class is not accessible
  * @throws ClassNotFoundException if the JDBC driver class is not on the classpath
  * @throws SQLException           if the connection or statement cannot be opened or closed
  */
 @SuppressWarnings("unchecked")
 private static void doInsertDB(List list) throws InstantiationException,
   IllegalAccessException, ClassNotFoundException, SQLException {
  // JDBC driver class name
  String driverName="com.mysql.jdbc.Driver";
  // database user
  String userName="root";
  // database password
  String userPasswd="root";
  // database (schema) name
  String dbName="qiyetong";
  // connection string
  String url="jdbc:mysql://localhost/"+dbName+"?user="+userName+"&password="+userPasswd;

  Class.forName(driverName).newInstance();

  int succNum = 0;
  Connection connection=DriverManager.getConnection(url);
  try{
   Statement statement = connection.createStatement();
   try{
    for(int k=0;k<list.size();k++)
    {
     try{
      QyxxDomain domain = (QyxxDomain)list.get(k);
      String sql = buildInsertSql(domain);
      System.out.println(sql);
      statement.execute(sql);
      succNum++;
     }catch (Exception e) {
      // Best effort per row: report the cause and carry on with the next one.
      System.out.println("插入第"+k+"条信息时发生错误!");
      e.printStackTrace();
     }
    }
   }finally{
    statement.close();
   }
  }finally{
   connection.close();
  }
  System.out.println("=========成功插入"+succNum+"条数据=============");
 }

 /**
  * Builds the INSERT statement for one company: the id is written as-is, every other
  * column is passed through {@code SQLFilter.filtrateSQL(value, 1)} and single-quoted.
  *
  * NOTE(review): the SQL is assembled by string concatenation and relies entirely on
  * SQLFilter.filtrateSQL (not visible here) for escaping scraped text; a
  * PreparedStatement would be the safer choice - confirm before changing stored data.
  */
 private static String buildInsertSql(QyxxDomain domain) {
  // Values in the same order as the column list below (after id).
  Object[] values = {
   SQLFilter.filtrateSQL(domain.getQymc(), 1),
   SQLFilter.filtrateSQL(domain.getGsgk(), 1),
   SQLFilter.filtrateSQL(domain.getZycphfw(), 1),
   SQLFilter.filtrateSQL(domain.getZyhy(), 1),
   SQLFilter.filtrateSQL(domain.getQylx(), 1),
   SQLFilter.filtrateSQL(domain.getJyms(), 1),
   SQLFilter.filtrateSQL(domain.getZczb(), 1),
   SQLFilter.filtrateSQL(domain.getGszcd(), 1),
   SQLFilter.filtrateSQL(domain.getYgrs(), 1),
   SQLFilter.filtrateSQL(domain.getGsclsj(), 1),
   SQLFilter.filtrateSQL(domain.getFddbr(), 1),
   SQLFilter.filtrateSQL(domain.getZykh(), 1),
   SQLFilter.filtrateSQL(domain.getNyye(), 1),
   SQLFilter.filtrateSQL(domain.getZyyydd(), 1),
   SQLFilter.filtrateSQL(domain.getZysc(), 1),
   SQLFilter.filtrateSQL(domain.getKhyh(), 1),
   SQLFilter.filtrateSQL(domain.getYhzh(), 1),
   SQLFilter.filtrateSQL(domain.getOem(), 1),
   SQLFilter.filtrateSQL(domain.getZlkz(), 1),
   SQLFilter.filtrateSQL(domain.getNcke(), 1),
   SQLFilter.filtrateSQL(domain.getYfbmrs(), 1),
   SQLFilter.filtrateSQL(domain.getCfmj(), 1),
   SQLFilter.filtrateSQL(domain.getYcl(), 1),
   SQLFilter.filtrateSQL(domain.getGszy(), 1),
   SQLFilter.filtrateSQL(domain.getLxr(), 1),
   SQLFilter.filtrateSQL(domain.getDh(), 1),
   SQLFilter.filtrateSQL(domain.getYddh(), 1),
   SQLFilter.filtrateSQL(domain.getCz(), 1),
   SQLFilter.filtrateSQL(domain.getDz(), 1),
   SQLFilter.filtrateSQL(domain.getYb(), 1)
  };

  StringBuilder sql = new StringBuilder(
   "insert into qyxx(id,qymc,gsgk,zycphfw,zyhy,qylx,jyms,zczb,gszcd,ygrs,gsclsj,fddbr,zykh,nyye,zyyydd,zysc,khyh,yhzh,oem,zlkz,ncke,yfbmrs,cfmj,ycl,gszy,lxr,dh,yddh,cz,dz,yb)" +
   "VALUES(");
  sql.append(domain.getId());
  for(int v=0;v<values.length;v++){
   sql.append(",'").append(values[v]).append("'");
  }
  sql.append(");");
  return sql.toString();
 }

    /**
     * Scrapes a company's profile page into {@code domain}.
     *
     * <p>Two things are extracted: the company overview, taken from child node 6 of each
     * {@code DIV class="companyinfo mainTextColor"} (empty string when that child is
     * missing), and the detail table, read from the {@code TD class="S lh15"} cells as
     * label/value pairs: a cell whose text contains a known label stores the cleaned
     * text of the cell after it in the matching field. A label in the very last cell
     * has no value cell and is ignored.
     *
     * @param content URL of the company profile page
     * @param domain  record that receives the scraped fields
     * @throws Exception if the page cannot be opened or parsed
     */
    public static void getCompanyDetail(String content,QyxxDomain domain) throws Exception
    {
      // A separate Parser is used for each filtered pass over the page.
      Parser myParser = new Parser(content);
      Parser myParser1 = new Parser(content);

      // Company overview
      AndFilter overviewFilter = new AndFilter();
      overviewFilter.setPredicates(new NodeFilter[]{
        new TagNameFilter("DIV"), new HasAttributeFilter("class","companyinfo mainTextColor")});
      Node[] overviewNodes = myParser1.parse(overviewFilter).toNodeArray();
      for(int i=0;i<overviewNodes.length;i++){
        Node[] children = overviewNodes[i].getChildren() == null
          ? new Node[0]
          : overviewNodes[i].getChildren().toNodeArray();
        // The overview text is expected in the 7th child; fall back to "" when absent.
        String gsgk = children.length > 6
          ? splitAndFilterString(children[6].toPlainTextString())
          : "";
        domain.setGsgk(gsgk);
        System.out.println(domain.getGsgk());
      }

      // Company detail table
      AndFilter detailFilter = new AndFilter();
      detailFilter.setPredicates(new NodeFilter[]{
        new TagNameFilter("TD"), new HasAttributeFilter("class","S lh15")});
      Node[] nodes = myParser.parse(detailFilter).toNodeArray();
      // Stop one short of the end: a label needs a following value cell.
      for(int i=0;i+1<nodes.length;i++){
        String label = nodes[i].toPlainTextString();
        String value = splitAndFilterString(nodes[i+1].toPlainTextString());

        // Independent checks (not else-if): one cell may match more than one label.
        if(label.indexOf("主营产品或服务")!=-1){
          domain.setZycphfw(value);
        }
        if(label.indexOf("主营行业")!=-1){
          domain.setZyhy(value);
        }
        if(label.indexOf("企业类型")!=-1){
          domain.setQylx(value);
        }
        if(label.indexOf("经营模式")!=-1){
          domain.setJyms(value);
        }
        if(label.indexOf("注册资本")!=-1){
          domain.setZczb(value);
        }
        if(label.indexOf("公司注册地")!=-1){
          domain.setGszcd(value);
        }
        if(label.indexOf("员工人数")!=-1){
          domain.setYgrs(value);
        }
        if(label.indexOf("公司成立时间")!=-1){
          domain.setGsclsj(value);
        }
        if(label.indexOf("法定代表人/负责人")!=-1){
          domain.setFddbr(value);
        }
        if(label.indexOf("主要客户")!=-1){
          domain.setZykh(value);
        }
        if(label.indexOf("年营业额")!=-1){
          domain.setNyye(value);
        }
        if(label.indexOf("主要经营地点")!=-1){
          domain.setZyyydd(value);
        }
        if(label.indexOf("主要市场")!=-1){
          domain.setZysc(value);
        }
        if(label.indexOf("开户银行")!=-1){
          domain.setKhyh(value);
        }
        if(label.indexOf("银行帐号")!=-1){
          domain.setYhzh(value);
        }
        if(label.indexOf("是否提供OEM代加工")!=-1){
          domain.setOem(value);
        }
        if(label.indexOf("质量控制")!=-1){
          domain.setZlkz(value);
        }
        if(label.indexOf("年出口额")!=-1){
          domain.setNcke(value);
        }
        if(label.indexOf("研发部门人数")!=-1){
          domain.setYfbmrs(value);
        }
        if(label.indexOf("厂房面积")!=-1){
          domain.setCfmj(value);
        }
        if(label.indexOf("月产量")!=-1){
          domain.setYcl(value);
        }
        if(label.indexOf("公司主页")!=-1){
          domain.setGszy(value);
        }
      }
    }

    /**
     * Scrapes a company's contact page into {@code domain}.
     *
     * <p>The contact person is built from child nodes 1 and 2 of each
     * {@code DIV class="title ml15 b mb20 mt20 mainTextColor"}; it is left untouched when
     * those children are missing. Phone, mobile, fax, address and postcode are read from
     * the {@code LI} items whose text contains the matching label; the label (and the
     * separator after it) is cut off the cleaned text, and an item holding nothing but
     * the label yields an empty string instead of an error.
     *
     * @param content URL of the company contact page
     * @param domain  record that receives the scraped fields
     * @throws Exception if the page cannot be opened or parsed
     */
    public static void getCompanyContact(String content,QyxxDomain domain) throws Exception
    {
      // A separate Parser is used for each filtered pass over the page.
      Parser myParser = new Parser(content);
      Parser myParser1 = new Parser(content);

      // Contact person
      AndFilter contactNameFilter = new AndFilter();
      contactNameFilter.setPredicates(new NodeFilter[]{
        new TagNameFilter("DIV"), new HasAttributeFilter("class","title ml15 b mb20 mt20 mainTextColor")});
      Node[] nameBlocks = myParser1.parse(contactNameFilter).toNodeArray();
      for(int i=0;i<nameBlocks.length;i++){
        Node[] children = nameBlocks[i].getChildren() == null
          ? new Node[0]
          : nameBlocks[i].getChildren().toNodeArray();
        if(children.length > 2){
          domain.setLxr(splitAndFilterString(children[1].toPlainTextString())
            +splitAndFilterString(children[2].toPlainTextString()));
        }
      }

      // Contact details
      Node[] items = myParser.parse(new TagNameFilter("LI")).toNodeArray();
      for(int i=0;i<items.length;i++){
        String raw = items[i].toPlainTextString();
        // Cleaning removes the spaces inside the label, e.g. a 2-character label plus
        // separator is 3 characters long afterwards; the mobile label plus separator is 5.
        String cleaned = splitAndFilterString(raw);
        if(raw.indexOf("电    话")!=-1){
          domain.setDh(textAfterLabel(cleaned, 3));
        }
        if(raw.indexOf("移动电话")!=-1){
          domain.setYddh(textAfterLabel(cleaned, 5));
        }
        if(raw.indexOf("传    真")!=-1){
          domain.setCz(textAfterLabel(cleaned, 3));
        }
        if(raw.indexOf("地    址")!=-1){
          domain.setDz(textAfterLabel(cleaned, 3));
        }
        if(raw.indexOf("邮    编")!=-1){
          domain.setYb(textAfterLabel(cleaned, 3));
        }
      }
    }

    /**
     * Returns {@code cleaned} without its first {@code labelLength} characters, or
     * {@code ""} when the text is not longer than the label.
     */
    private static String textAfterLabel(String cleaned, int labelLength)
    {
      return cleaned.length() > labelLength ? cleaned.substring(labelLength) : "";
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>Every line is terminated with {@code "\r\n"} in the result, whatever line
     * separator the file uses. Errors are not propagated: the stack trace is printed and
     * whatever was read so far (possibly the empty string) is returned. The file handle
     * is always released, including when the charset is unknown or reading fails.
     *
     * @param sFileName path of the file to read
     * @param sEncode   name of the charset used to decode the file
     * @return the file content, or the part read before an error occurred
     */
    public static String readTextFile(String sFileName, String sEncode)
    {
        StringBuilder content = new StringBuilder();
        FileInputStream in = null;

        try
        {
            in = new FileInputStream(new File(sFileName));
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, sEncode));

            String line;
            while ((line = reader.readLine()) != null)
            {
                content.append(line).append("\r\n");
            }
        }
        catch (Exception e)
        {
            // Deliberate best effort: report the problem and return what we have.
            e.printStackTrace();
        }
        finally
        {
            // Closing the underlying stream releases the handle even if the reader
            // was never created (e.g. unsupported charset).
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (java.io.IOException ignored)
                {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }

        return content.toString();
    }

    /**
     * Tells whether a string is empty once leading and trailing whitespace is removed.
     *
     * @param astr string to test; may be {@code null}
     * @return {@code true} for {@code null}, the empty string, or a string that is
     *         empty after {@code trim()}; {@code false} otherwise
     */
    public static boolean isTrimEmpty(String astr)
    {
        boolean missing = (astr == null) || (astr.length() == 0);
        return missing || isBlank(astr.trim());
    }

    /**
     * Tells whether a string is {@code null} or has length zero.
     * Whitespace is not trimmed, so {@code " "} is not blank.
     *
     * @param astr string to test; may be {@code null}
     * @return {@code true} if {@code astr} is {@code null} or empty
     */
    public static boolean isBlank(String astr)
    {
        return (astr == null) || (astr.length() == 0);
    }

    /**
     * Strips HTML formatting and whitespace from a piece of text.
     *
     * <p>Removed, in this order: named character entities such as {@code &nbsp;},
     * complete tags, stray {@code ( / > ) <} characters, spaces, carriage returns,
     * line feeds and tabs.
     *
     * @param input text to clean; may be {@code null}
     * @return the cleaned text, or {@code ""} when {@code input} is {@code null} or
     *         contains only whitespace
     */
 public static String splitAndFilterString(String input) {
  if (input == null || input.trim().length() == 0) {
   return "";
  }

  // Patterns are applied one after another; each match is deleted.
  final String[] unwanted = {
   "\\&[a-zA-Z]{1,10};", // named character entities
   "<[^>]*>",            // complete tags
   "[(/>)<]",            // leftover markup characters
   " ",
   "\\r",
   "\\n",
   "\\t"
  };

  String cleaned = input;
  for (int p = 0; p < unwanted.length; p++) {
   cleaned = cleaned.replaceAll(unwanted[p], "");
  }
  return cleaned;
 }

 

最近找了个网络爬虫 Web-Harvest,目前只是跑了个例子,打算摸索一下用它抓取阿里巴巴的企业信息,看看效果怎么样,到时候发出来和大家分享!

2
0
分享到:
评论
4 楼 wzpwork 2008-12-09  
谢谢楼主,我也写了一个这样的爬虫,但被阿里给终止了,直接返回找不到页面的信息给我.

3 楼 leeqianjun 2008-11-09  
网络爬虫web Harvest 比较不错,扩展性非常好,只不过相应的处理的时间比较长一点,没有通过HTMLPARSER 速度快。
2 楼 sdh5724 2008-11-08  
最受不了的是写爬虫。 不控制速度的程序员。
每天系统要自动屏蔽很多这样的爬虫。
有些爬虫直接就是放几千连接过来抓数据, 结果导致系统压力升高, 不得不终止他的行为
1 楼 yushan 2008-11-08  
楼主很强悍啊 

相关推荐

Global site tag (gtag.js) - Google Analytics