`
zkl_1987
  • 浏览: 243337 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Html parser 代码集锦 2

阅读更多
htmlparser使用例子(全)

import java.net.URL;   
  
import junit.framework.TestCase;   
  
import org.apache.log4j.Logger;   
import org.htmlparser.Node;   
import org.htmlparser.NodeFilter;   
import org.htmlparser.Parser;   
import org.htmlparser.Tag;   
import org.htmlparser.beans.LinkBean;   
import org.htmlparser.filters.NodeClassFilter;   
import org.htmlparser.filters.OrFilter;   
import org.htmlparser.filters.TagNameFilter;   
import org.htmlparser.tags.HeadTag;   
import org.htmlparser.tags.ImageTag;   
import org.htmlparser.tags.InputTag;   
import org.htmlparser.tags.LinkTag;   
import org.htmlparser.tags.OptionTag;   
import org.htmlparser.tags.SelectTag;   
import org.htmlparser.tags.TableColumn;   
import org.htmlparser.tags.TableRow;   
import org.htmlparser.tags.TableTag;   
import org.htmlparser.tags.TitleTag;   
import org.htmlparser.util.NodeIterator;   
import org.htmlparser.util.NodeList;   
import org.htmlparser.util.ParserException;   
import org.htmlparser.visitors.HtmlPage;   
import org.htmlparser.visitors.NodeVisitor;   
import org.htmlparser.visitors.ObjectFindingVisitor;   
  
public class T extends TestCase {   
  
  private static final Logger logger = Logger.getLogger(T.class);   
  
  /**
   * Creates a test case with the given name by delegating to the
   * {@link TestCase} constructor.
   *
   * @param name the test case name passed through to the superclass
   */
  public T(String name) {
    super(name);
  }
  
  /**
   * Demonstrates {@link ObjectFindingVisitor}: walks the page at
   * http://www.google.com with a visitor configured for {@link ImageTag} and
   * logs the image URL, the extracted image location and the SRC attribute of
   * every tag the visitor collected.
   *
   * Any exception is printed to stderr; the test itself never fails.
   */
  public void testImageVisitor() {
    try {
      Parser pageParser = new Parser();
      pageParser.setURL("http://www.google.com");
      pageParser.setEncoding(pageParser.getEncoding());

      ObjectFindingVisitor imageCollector = new ObjectFindingVisitor(ImageTag.class);
      pageParser.visitAllNodesWith(imageCollector);

      Node[] found = imageCollector.getTags();
      int index = 0;
      while (index < found.length) {
        ImageTag image = (ImageTag) found[index];
        logger.fatal("testImageVisitor() ImageURL = " + image.getImageURL());
        logger.fatal("testImageVisitor() ImageLocation = " + image.extractImageLocn());
        logger.fatal("testImageVisitor() SRC = " + image.getAttribute("SRC"));
        index++;
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
  
  /**
   * Demonstrates {@link TagNameFilter}: extracts every node matching the tag
   * name "IMG" from http://www.google.com and logs each one as HTML.
   *
   * Any exception is printed to stderr; the test itself never fails.
   */
  public void testNodeFilter() {
    try {
      Parser pageParser = new Parser();
      pageParser.setURL("http://www.google.com");
      pageParser.setEncoding(pageParser.getEncoding());

      NodeList images = pageParser.extractAllNodesThatMatch(new TagNameFilter("IMG"));
      int position = 0;
      while (position < images.size()) {
        Node image = images.elementAt(position);
        logger.fatal("testNodeFilter() " + image.toHtml());
        position++;
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
  
  /**
   * Demonstrates {@link NodeClassFilter}: extracts every {@link LinkTag} from
   * http://www.google.com and logs the link each one yields.
   *
   * Any exception is printed to stderr; the test itself never fails.
   */
  public void testLinkTag() {
    try {
      Parser pageParser = new Parser();
      pageParser.setURL("http://www.google.com");
      pageParser.setEncoding(pageParser.getEncoding());

      NodeFilter linksOnly = new NodeClassFilter(LinkTag.class);
      NodeList links = pageParser.extractAllNodesThatMatch(linksOnly);
      int position = 0;
      while (position < links.size()) {
        LinkTag link = (LinkTag) links.elementAt(position);
        logger.fatal("testLinkTag() Link is :" + link.extractLink());
        position++;
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
  
  /**
   * Parses an inline HTML snippet whose head contains two stylesheet
   * {@code <link href='...' text='text/css' rel='stylesheet' />} elements and
   * logs the text and class of every node returned by the parser's iterator.
   *
   * Any exception is printed to stderr; the test itself never fails.
   */
  public void testLinkCSS() {
    try {
      Parser parser = new Parser();
      // All attribute values are delimited with plain ASCII apostrophes. The
      // first href previously opened with a typographic quote, which left the
      // attribute value unterminated.
      parser.setInputHTML("<head><title>Link Test</title>"
          + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>"
          + "<body>");
      parser.setEncoding(parser.getEncoding());

      for (NodeIterator it = parser.elements(); it.hasMoreNodes();) {
        Node node = it.nextNode();
        logger.fatal("testLinkCSS()" + node.getText() + node.getClass());
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  
  /**
   * Demonstrates {@link OrFilter}: parses an inline HTML form and keeps the
   * nodes accepted by either a {@link SelectTag} class filter or an
   * {@link InputTag} class filter.
   *
   * For each input tag the tag name and its "value" attribute are logged; for
   * each select tag the text of every option child is logged.
   *
   * A {@link ParserException} is printed to stderr; the test itself never fails.
   */
  public void testOrFilter() {
    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
    NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);

    try {
      Parser parser = new Parser();
      // Attribute values are closed with ASCII apostrophes; the prime
      // character that used to close them is not an HTML quote.
      parser
          .setInputHTML("<head><title>OrFilter Test</title>"
              + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
              + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />"
              + "</head>"
              + "<body>"
              + "<input type='text' value='text1' name='text1'/>"
              + "<input type='text' value='text2' name='text2'/>"
              + "<select><option id='1'>1</option><option id='2'>2</option><option id='3'></option></select>"
              + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>");

      parser.setEncoding(parser.getEncoding());
      OrFilter lastFilter = new OrFilter();
      lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter });
      NodeList nodeList = parser.parse(lastFilter);

      // Strictly less than size(): index size() is past the last element.
      for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (node instanceof InputTag) {
          InputTag tag = (InputTag) node;
          logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:"
              + tag.getAttribute("value"));
        } else if (node instanceof SelectTag) {
          logOptions((SelectTag) node);
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
  }

  /** Logs the option text of every {@link OptionTag} child of the given select tag. */
  private void logOptions(SelectTag select) {
    NodeList children = select.getChildren();
    if (children == null) {
      // NOTE(review): guards against a select without children -- confirm
      // whether getChildren() can actually return null here.
      return;
    }
    for (int j = 0; j < children.size(); j++) {
      Node child = children.elementAt(j);
      // Skip anything that is not an option instead of failing on the cast.
      if (child instanceof OptionTag) {
        logger.fatal("OrFilter Option" + ((OptionTag) child).getOptionText());
      }
    }
  }
  
  /**
   * Demonstrates parsing of {@code <table><tr><td></td></tr></table>} markup:
   * parses two inline 3x3 tables, selects the {@link TableTag} nodes and logs
   * the plain text of every column of every row.
   *
   * A {@link ParserException} is printed to stderr; the test itself never fails.
   */
  public void testTable() {
    // The id attributes are closed with ASCII apostrophes; the prime character
    // that used to close them is not an HTML quote.
    Parser myParser = Parser.createParser("<body> " + "<table id='table1' >"
        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2' >"
        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK");
    // A single class filter is enough; wrapping one predicate in an OrFilter
    // accepts exactly the same nodes.
    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
    try {
      NodeList nodeList = myParser.parse(tableFilter);
      // Strictly less than size(): index size() is past the last element.
      for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (!(node instanceof TableTag)) {
          continue;
        }
        TableRow[] rows = ((TableTag) node).getRows();
        for (int j = 0; j < rows.length; j++) {
          TableColumn[] td = rows[j].getColumns();
          for (int k = 0; k < td.length; k++) {
            logger.fatal("<td>" + td[k].toPlainTextString());
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
  }
  
  /**
   * Demonstrates a custom {@link NodeVisitor}: visits the nodes of
   * http://www.google.com and logs the name and class of each tag handed to
   * {@code visitTag}.
   *
   * A {@link ParserException} is printed to stderr; the test itself never fails.
   */
  public void testVisitorAll() {
    NodeVisitor tagLogger = new NodeVisitor() {
      public void visitTag(Tag current) {
        String tagName = current.getTagName();
        String tagClass = String.valueOf(current.getClass());
        logger.fatal("testVisitorAll()  Tag name is :" + tagName + " \n Class is :" + tagClass);
      }
    };

    try {
      Parser pageParser = new Parser();
      pageParser.setURL("http://www.google.com");
      pageParser.setEncoding(pageParser.getEncoding());
      pageParser.visitAllNodesWith(tagLogger);
    } catch (ParserException ex) {
      ex.printStackTrace();
    }
  }
  
  /**
   * Demonstrates a {@link NodeVisitor} that reacts to specific tag classes:
   * visits an inline HTML snippet and logs every tag, labelling
   * {@link HeadTag}, {@link TitleTag} and {@link LinkTag} instances and
   * additionally logging the "href" attribute of link tags.
   *
   * Any exception is printed to stderr; the test itself never fails.
   */
  public void testTagVisitor() {
    try {
      String html = "<head><title>dddd</title>"
          + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />"
          + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>"
          + "<body>" + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>";
      Parser snippetParser = new Parser(html);

      NodeVisitor labellingVisitor = new NodeVisitor() {
        public void visitTag(Tag current) {
          // The label sits between "visitTag() " and ": Tag name is :"; it is
          // empty for tags that get no special treatment.
          String label;
          boolean isLink = false;
          if (current instanceof HeadTag) {
            label = "HeadTag ";
          } else if (current instanceof TitleTag) {
            label = "TitleTag ";
          } else if (current instanceof LinkTag) {
            label = "LinkTag ";
            isLink = true;
          } else {
            label = "";
          }

          String message = "visitTag() " + label + ": Tag name is :" + current.getTagName()
              + " \n Class is :" + current.getClass() + "\n Text is :" + current.getText();
          if (isLink) {
            message = message + " \n getAttribute is :" + current.getAttribute("href");
          }
          logger.fatal(message);
        }
      };

      snippetParser.visitAllNodesWith(labellingVisitor);
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
  
  /**
   * Demonstrates {@link HtmlPage}: visits an inline HTML document with an
   * {@code HtmlPage} visitor, then logs the page title and the HTML of every
   * node in the list returned by {@code getBody()}.
   *
   * A {@link ParserException} is printed to stderr; the test itself never fails.
   */
  public void testHtmlPage() {
    // The id attributes are closed with ASCII apostrophes; the prime character
    // that used to close them is not an HTML quote.
    String inputHTML = "<html>" + "<head>"
        + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>"
        + "Welcome to HTMLParser" + "<table id='table1' >"
        + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
        + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
        + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2' >"
        + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
        + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
        + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>";
    Parser parser = new Parser();
    try {
      parser.setInputHTML(inputHTML);
      // Pass the parser's encoding, as every other test here does. The URL
      // was passed before, which is not a charset name.
      parser.setEncoding(parser.getEncoding());
      HtmlPage page = new HtmlPage(parser);
      parser.visitAllNodesWith(page);
      logger.fatal("testHtmlPage -title is :" + page.getTitle());
      NodeList list = page.getBody();

      for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) {
        Node node = iterator.nextNode();
        logger.fatal("testHtmlPage -node  is :" + node.toHtml());
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
  }
  
  /**
   * Demonstrates {@link LinkBean}: points the bean at http://www.google.com
   * and logs every URL returned by {@code getLinks()}.
   */
  public void testLinkBean() {
    // No Parser is created here: the bean is driven only through setURL and
    // getLinks, so the previously constructed Parser instance was dead code.
    LinkBean linkBean = new LinkBean();
    linkBean.setURL("http://www.google.com");
    URL[] urls = linkBean.getLinks();

    for (int i = 0; i < urls.length; i++) {
      logger.fatal("testLinkBean() -url  is :" + urls[i]);
    }
  }
  
}  



nekohtml经典小例子一个

/**
 * Minimal NekoHTML example: parses the local file {@code input.htm} into a DOM
 * and prints the text found under its first BODY element, skipping the
 * contents of STYLE and SCRIPT elements.
 */
public class Demo {

    /**
     * Recursively collects the text below a DOM node.
     *
     * @param root the node to start from; {@code null} yields an empty string
     * @return the trimmed value of a text node; for an element, the
     *         concatenated text of all its children (empty for STYLE and
     *         SCRIPT elements); an empty string for any other node type
     */
    public static String TextExtractor(Node root) {
        // A missing node (for example a document without BODY) has no text.
        if (root == null) {
            return "";
        }
        // A text node is returned directly.
        if (root.getNodeType() == Node.TEXT_NODE) {
            return root.getNodeValue().trim();
        }
        if (root.getNodeType() == Node.ELEMENT_NODE) {
            Element elmt = (Element) root;
            // Discard style sheets and scripts, whatever case the parser
            // reports the element name in.
            String tagName = elmt.getTagName();
            if ("STYLE".equalsIgnoreCase(tagName) || "SCRIPT".equalsIgnoreCase(tagName)) {
                return "";
            }

            NodeList children = elmt.getChildNodes();
            StringBuilder text = new StringBuilder();
            for (int i = 0; i < children.getLength(); i++) {
                text.append(TextExtractor(children.item(i)));
            }
            return text.toString();
        }
        // Every other node type contributes no text.
        return "";
    }

    /**
     * Parses {@code input.htm} with a default encoding of gb18030 and prints
     * the extracted body text to standard output.
     *
     * @param args unused
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        // Create the HTML parser.
        DOMParser parser = new DOMParser();
        // Default encoding used for the page.
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "gb18030");
        // Give the parser the raw bytes. When an InputSource carries a
        // character stream the parser reads it directly and ignores encoding
        // settings, so a FileReader would decode with the platform charset.
        java.io.InputStream in = new java.io.FileInputStream("input.htm");
        try {
            parser.parse(new InputSource(in));
        } finally {
            in.close();
        }
        Document doc = parser.getDocument();
        // Use the first BODY element as the root for text extraction.
        Node body = doc.getElementsByTagName("BODY").item(0);
        System.out.println(TextExtractor(body));
    }
}



nekohtml的2个小例子

// 1. Extract the "keywords" and "description" META values from a web page
    /**
     * Parses the local file {@code d:/163.html} with a default encoding of
     * gb2312 and prints the content of its "keywords" and "description" META
     * elements.
     *
     * @param argv unused
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] argv) throws Exception {
        DOMParser parser = new DOMParser();
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "gb2312");

        // Give the parser the raw bytes. When an InputSource carries a
        // character stream the parser reads it directly and ignores encoding
        // settings, so a FileReader would decode with the platform charset.
        java.io.InputStream in = new java.io.FileInputStream("d:/163.html");
        try {
            parser.parse(new InputSource(in));
        } finally {
            in.close();
        }

        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("META");
        for (int i = 0, n = list.getLength(); i < n; i++) {
            Element e = (Element) list.item(i);
            String name = e.getAttribute("name");
            if (name.equalsIgnoreCase("keywords")) {
                String keywords = e.getAttribute("content");
                System.out.println("keywords: " + keywords);
            }
            if (name.equalsIgnoreCase("description")) {
                String description = e.getAttribute("content");
                System.out.println(description);
            }
        }
    }
  
==========================================================================   
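// 2. Use DOMFragmentParser to extract all text from a page. No tags are filtered out here, so some irrelevant text is printed as well; filter the unwanted tags beforehand if that matters.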
  
    /**
     * Parses a fixed sina.com.cn article URL into a document fragment and
     * prints the fragment's text nodes via {@code print}.
     *
     * @param argv unused
     * @throws Exception if fetching or parsing the page fails
     */
    public static void main(String[] argv) throws Exception {
        final String pageUrl = "http://sports.sina.com.cn/f1/2009-09-21/20104599271.shtml";
        DOMFragmentParser fragmentParser = new DOMFragmentParser();
        HTMLDocument owner = new HTMLDocumentImpl();
        DocumentFragment root = owner.createDocumentFragment();
        fragmentParser.parse(pageUrl, root);
        print(root, "");
    }
  
    /**
     * Prints the value of every text node in the subtree rooted at
     * {@code node}, depth first, prefixing each line with an indent that
     * grows by one space per tree level.
     *
     * @param node   root of the subtree to print
     * @param indent prefix written before the text of nodes at this level
     */
    public static void print(Node node, String indent) {
        boolean isText = node.getNodeType() == Node.TEXT_NODE;
        if (isText) {
            System.out.println(indent + node.getNodeValue());
        }

        String childIndent = indent + " ";
        for (Node current = node.getFirstChild(); current != null; current = current.getNextSibling()) {
            print(current, childIndent);
        }
    }
  
}   



htmlparser提取正文

使用htmlparser提取正文;提取新浪等门户网站首页时,结果不太干净。

import org.htmlparser.Parser;    
import org.htmlparser.beans.StringBean;    
import org.htmlparser.filters.NodeClassFilter;    
import org.htmlparser.parserapplications.StringExtractor;    
import org.htmlparser.tags.BodyTag;    
import org.htmlparser.util.NodeList;    
import org.htmlparser.util.ParserException;    
    
public class GetContent {    
       
    public void getContentUsingStringBean(String url) {    
        StringBean sb = new StringBean();    
        sb.setLinks(true);              
        sb.setCollapse(true);     
        sb.setReplaceNonBreakingSpaces(true);// If true regular space    
        sb.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");    
        System.out.println("The Content is :\n" + sb.getStrings());    
    
    }    
       
    public void getContentUsingStringExtractor(String url, boolean link) {    
        StringExtractor se = new StringExtractor(url);    
        String text = null;    
        try {    
            text = se.extractStrings(link);    
            System.out.println("The content is :\n" + text);    
        } catch (ParserException e) {    
            e.printStackTrace();    
        }    
    }    
    
    public void getContentUsingParser(String url) {    
        NodeList nl;    
        try {    
            Parser p = new Parser(url);    
            nl = p.parse(new NodeClassFilter(BodyTag.class));    
            BodyTag bt = (BodyTag) nl.elementAt(0);    
            System.out.println(bt.toPlainTextString());    
            } catch (ParserException e) {    
            e.printStackTrace();    
        }    
    }    
       
    public static void main(String[] args) {   
        GetContent g = new GetContent();   
//      g.getContentUsingStringBean("");   
//      g.getContentUsingParser("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");   
        g.getContentUsingStringExtractor("http://www.sina.com.cn/", false);   
    }  



httpclient htmlparser来查询手机号相关信息


httpclient  htmlparser来查询手机号相关信息
http://htmlparser.com.cn/post/20090816119.html

/**
 * Looks up information about a mobile phone number by posting it to
 * ip138.com with Commons HttpClient and extracting the result cells from the
 * response with HTMLParser.
 *
 * @author rrong_m 51jsp.cn
 */
public class GetMobile {

    /** Charset configured on the HTTP client and used when parsing the response. */
    private static final String CHARSET = "gb2312";

    /**
     * Posts the lookup form and returns the response body.
     *
     * @param mobile the phone number (or number prefix) to look up
     * @return the response body as a string
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException   if the request cannot be sent or the response read
     */
    public static String getPostString(String mobile) throws HttpException, IOException {
        HttpClient hc = new HttpClient();
        PostMethod pm = new PostMethod("http://www.ip138.com:8080/search.asp");
        hc.getParams().setContentCharset(CHARSET);
        pm.addParameter("mobile", mobile);
        pm.addParameter("action", "mobile");
        try {
            hc.executeMethod(pm);
            // Read the body before the connection is released.
            return pm.getResponseBodyAsString();
        } finally {
            // Always release the connection, including when the request fails.
            pm.releaseConnection();
        }
    }

    /**
     * Prints the plain text of every node whose {@code class} attribute is
     * {@code tdc2}, with the literal {@code &nbsp;} entity removed.
     *
     * @param poststring the HTML returned by {@link #getPostString(String)}
     * @throws ParserException if the HTML cannot be parsed
     */
    public static void getMobileInfor(String poststring) throws ParserException {
        // createParser always treats its argument as HTML text, whereas the
        // Parser(String) constructor has to decide whether the string is markup
        // or a resource location.
        Parser parser = Parser.createParser(poststring, CHARSET);
        NodeFilter filter = new HasAttributeFilter("class", "tdc2");
        NodeList nodelist = parser.extractAllNodesThatMatch(filter);
        for (int i = 0; i < nodelist.size(); i++) {
            System.out.println(nodelist.elementAt(i).toPlainTextString().replace("&nbsp;", ""));
        }
    }

    /**
     * Looks up the sample number prefix {@code 1380001}; replace it with the
     * number to query.
     *
     * @param args unused
     * @throws HttpException   if an HTTP protocol error occurs
     * @throws IOException     if the request fails
     * @throws ParserException if the response cannot be parsed
     */
    public static void main(String[] args) throws HttpException, IOException, ParserException {
        getMobileInfor(getPostString("1380001"));
    }

}
分享到:
评论
1 楼 jkbjxy 2012-07-26  
这么好的资源没人顶,我来顶一个!!!

相关推荐

Global site tag (gtag.js) - Google Analytics