- 浏览: 243337 次
- 性别:
- 来自: 杭州
文章分类
最新评论
-
你的样子越来越美:
楼主是来炫耀的。
java图片处理解决方案 -
di1984HIT:
关注一下。
hypertable安装 -
di1984HIT:
katta好久不更新了。
Katta源码分析(1)--显示节点 -
tracyitbird:
谢谢博主,很好说的很明白。不过一般bin的环境变量已经配置好了 ...
解决Hadoop报Name node is in safe mode 错误 -
凌寒_微雨:
总结的很好
android 笔记——listView应用
htmlparser使用例子(全)
nekohtml经典小例子一个
nekohtml的2个小例子
htmlparser提取正文
htmlparser提取正文,提取新浪等门户首页时不太干净
httpclient htmlparser来查询手机号相关信息
httpclient htmlparser来查询手机号相关信息
http://htmlparser.com.cn/post/20090816119.html
import java.net.URL; import junit.framework.TestCase; import org.apache.log4j.Logger; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.Tag; import org.htmlparser.beans.LinkBean; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.OrFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.HeadTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.InputTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.OptionTag; import org.htmlparser.tags.SelectTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.TableRow; import org.htmlparser.tags.TableTag; import org.htmlparser.tags.TitleTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.HtmlPage; import org.htmlparser.visitors.NodeVisitor; import org.htmlparser.visitors.ObjectFindingVisitor; public class T extends TestCase { private static final Logger logger = Logger.getLogger(T.class); public T(String name) { super(name); } /* * 测试ObjectFindVisitor的用法 */ public void testImageVisitor() { try { ImageTag imgLink; ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class); Parser parser = new Parser(); parser.setURL("http://www.google.com"); parser.setEncoding(parser.getEncoding()); parser.visitAllNodesWith(visitor); Node[] nodes = visitor.getTags(); for (int i = 0; i < nodes.length; i++) { imgLink = (ImageTag) nodes[i]; logger.fatal("testImageVisitor() ImageURL = " + imgLink.getImageURL()); logger.fatal("testImageVisitor() ImageLocation = " + imgLink.extractImageLocn()); logger.fatal("testImageVisitor() SRC = " + imgLink.getAttribute("SRC")); } } catch (Exception e) { e.printStackTrace(); } } /* * 测试TagNameFilter用法 */ public void testNodeFilter() { try { NodeFilter filter = new TagNameFilter("IMG"); Parser parser = new Parser(); 
parser.setURL("http://www.google.com"); parser.setEncoding(parser.getEncoding()); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml()); } } catch (Exception e) { e.printStackTrace(); } } /* * 测试NodeClassFilter用法 */ public void testLinkTag() { try { NodeFilter filter = new NodeClassFilter(LinkTag.class); Parser parser = new Parser(); parser.setURL("http://www.google.com"); parser.setEncoding(parser.getEncoding()); NodeList list = parser.extractAllNodesThatMatch(filter); for (int i = 0; i < list.size(); i++) { LinkTag node = (LinkTag) list.elementAt(i); logger.fatal("testLinkTag() Link is :" + node.extractLink()); } } catch (Exception e) { e.printStackTrace(); } } /* * 测试<link href=" text=’text/css’ rel=’stylesheet’ />用法 */ public void testLinkCSS() { try { Parser parser = new Parser(); parser.setInputHTML("<head><title>Link Test</title>" + "<link href=’/test01/css.css' text='text/css' rel='stylesheet' />" + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>" + "<body>"); parser.setEncoding(parser.getEncoding()); for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { Node node = e.nextNode(); logger.fatal("testLinkCSS()" + node.getText() + node.getClass()); } } catch (Exception e) { e.printStackTrace(); } } /* * 测试OrFilter的用法 */ public void testOrFilter() { NodeFilter inputFilter = new NodeClassFilter(InputTag.class); NodeFilter selectFilter = new NodeClassFilter(SelectTag.class); NodeList nodeList = null; try { Parser parser = new Parser(); parser .setInputHTML("<head><title>OrFilter Test</title>" + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />" + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>" + "<body>" + "<input type='text' value='text1′ name='text1′/>" + "<input type='text' value='text2′ name='text2′/>" + "<select><option id='1′>1</option><option id='2′>2</option><option 
id='3′></option></select>" + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>"); parser.setEncoding(parser.getEncoding()); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter }); nodeList = parser.parse(lastFilter); for (int i = 0; i <= nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof InputTag) { InputTag tag = (InputTag) nodeList.elementAt(i); logger.fatal("OrFilter tag name is :" + tag.getTagName() + " ,tag value is:" + tag.getAttribute("value")); } if (nodeList.elementAt(i) instanceof SelectTag) { SelectTag tag = (SelectTag) nodeList.elementAt(i); NodeList list = tag.getChildren(); for (int j = 0; j < list.size(); j++) { OptionTag option = (OptionTag) list.elementAt(j); logger.fatal("OrFilter Option" + option.getOptionText()); } } } } catch (ParserException e) { e.printStackTrace(); } } /* * 测试对<table><tr><td></td></tr></table>的解析 */ public void testTable() { Parser myParser; NodeList nodeList = null; myParser = Parser.createParser("<body> " + "<table id='table1′ >" + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>" + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>" + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >" + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>" + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>" + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>", "GBK"); NodeFilter tableFilter = new NodeClassFilter(TableTag.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { tableFilter }); try { nodeList = myParser.parse(lastFilter); for (int i = 0; i <= nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow tr = (TableRow) rows[j]; TableColumn[] td = tr.getColumns(); for (int k = 0; k < td.length; k++) { logger.fatal("<td>" + td[k].toPlainTextString()); } } } } 
} catch (ParserException e) { e.printStackTrace(); } } /* * 测试NodeVisitor的用法,遍历所有节点 */ public void testVisitorAll() { try { Parser parser = new Parser(); parser.setURL("http://www.google.com"); parser.setEncoding(parser.getEncoding()); NodeVisitor visitor = new NodeVisitor() { public void visitTag(Tag tag) { logger.fatal("testVisitorAll() Tag name is :" + tag.getTagName() + " \n Class is :" + tag.getClass()); } }; parser.visitAllNodesWith(visitor); } catch (ParserException e) { e.printStackTrace(); } } /* * 测试对指定Tag的NodeVisitor的用法 */ public void testTagVisitor() { try { Parser parser = new Parser("<head><title>dddd</title>" + "<link href='/test01/css.css' text='text/css' rel='stylesheet' />" + "<link href='/test02/css.css' text='text/css' rel='stylesheet' />" + "</head>" + "<body>" + "<a href='http://www.yeeach.com'>yeeach.com</a>" + "</body>"); NodeVisitor visitor = new NodeVisitor() { public void visitTag(Tag tag) { if (tag instanceof HeadTag) { logger.fatal("visitTag() HeadTag : Tag name is :" + tag.getTagName() + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()); } else if (tag instanceof TitleTag) { logger.fatal("visitTag() TitleTag : Tag name is :" + tag.getTagName() + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()); } else if (tag instanceof LinkTag) { logger.fatal("visitTag() LinkTag : Tag name is :" + tag.getTagName() + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText() + " \n getAttribute is :" + tag.getAttribute("href")); } else { logger.fatal("visitTag() : Tag name is :" + tag.getTagName() + " \n Class is :" + tag.getClass() + "\n Text is :" + tag.getText()); } } }; parser.visitAllNodesWith(visitor); } catch (Exception e) { e.printStackTrace(); } } /* * 测试HtmlPage的用法 */ public void testHtmlPage() { String inputHTML = "<html>" + "<head>" + "<title>Welcome to the HTMLParser website</title>" + "</head>" + "<body>" + "Welcome to HTMLParser" + "<table id='table1′ >" + 
"<tr><td>1-11</td><td>1-12</td><td>1-13</td>" + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>" + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" + "<table id='table2′ >" + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>" + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>" + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" + "</body>" + "</html>"; Parser parser = new Parser(); try { parser.setInputHTML(inputHTML); parser.setEncoding(parser.getURL()); HtmlPage page = new HtmlPage(parser); parser.visitAllNodesWith(page); logger.fatal("testHtmlPage -title is :" + page.getTitle()); NodeList list = page.getBody(); for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) { Node node = iterator.nextNode(); logger.fatal("testHtmlPage -node is :" + node.toHtml()); } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /* * 测试LinkBean的用法 */ public void testLinkBean() { Parser parser = new Parser(); LinkBean linkBean = new LinkBean(); linkBean.setURL("http://www.google.com"); URL[] urls = linkBean.getLinks(); for (int i = 0; i < urls.length; i++) { URL url = urls[i]; logger.fatal("testLinkBean() -url is :" + url); } } }
nekohtml经典小例子一个
public class Demo { public static String TextExtractor(Node root){ //若是文本节点的话,直接返回 if (root.getNodeType() == Node.TEXT_NODE) { return root.getNodeValue().trim(); } if(root.getNodeType() == Node.ELEMENT_NODE) { Element elmt = (Element) root; //抛弃脚本 if (elmt.getTagName().equals("STYLE") || elmt.getTagName().equals("SCRIPT")) return ""; NodeList children = elmt.getChildNodes(); StringBuilder text = new StringBuilder(); for (int i = 0; i < children.getLength(); i++) { text.append(TextExtractor(children.item(i))); } return text.toString(); } //对其它类型的节点,返回空值 return ""; } public static void main(String[] args) throws Exception{ //生成html parser DOMParser parser = new DOMParser(); //设置网页的默认编码 parser.setProperty( "http://cyberneko.org/html/properties/default-encoding", "gb18030"); //input file BufferedReader in = new BufferedReader(new FileReader("input.htm")); parser.parse(new InputSource(in)); Document doc = parser.getDocument(); //获得body节点,以此为根,计算其文本内容 Node body = doc.getElementsByTagName("BODY").item(0); System.out.println(TextExtractor(body)); } }
nekohtml的2个小例子
//获取网页里面的keywords和description public static void main(String[] argv) throws Exception { BufferedReader in = new BufferedReader(new FileReader("d:/163.html")); DOMParser parser = new DOMParser(); parser.setProperty( "http://cyberneko.org/html/properties/default-encoding", "gb2312"); parser.parse(new InputSource(in)); Document doc = parser.getDocument(); NodeList list = doc.getElementsByTagName("META"); for(int i = 0, n = list.getLength(); i < n ; i++){ Element e = (Element) list.item(i); if(e.getAttribute("name").equalsIgnoreCase("keywords")){ String keywords = e.getAttribute("content"); System.out.println("keywords: " + keywords); } if(e.getAttribute("name").equalsIgnoreCase("description")){ String description = e.getAttribute("content"); System.out.println(description); } } } ========================================================================== //2、test使用DOMFragmentParser,提取所有正文,由于没有过滤一些没用的标签,所以会有没用的信息打印,这个可以再事先过滤一下。 public static void main(String[] argv) throws Exception { DOMFragmentParser parser = new DOMFragmentParser(); HTMLDocument document = new HTMLDocumentImpl(); DocumentFragment fragment = document.createDocumentFragment(); parser.parse("http://sports.sina.com.cn/f1/2009-09-21/20104599271.shtml", fragment); print(fragment, ""); } /** Prints a node's class name. */ public static void print(Node node, String indent) { // System.out.println(indent + node.getClass().getName()); // System.out.println(node.getNodeType()); if (node.getNodeType() == Node.TEXT_NODE) { System.out.println(indent + node.getNodeValue()); } Node child = node.getFirstChild(); while (child != null) { print(child, indent + " "); child = child.getNextSibling(); } } }
htmlparser提取正文
htmlparser提取正文,提取新浪等门户首页时不太干净
import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.parserapplications.StringExtractor; import org.htmlparser.tags.BodyTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class GetContent { public void getContentUsingStringBean(String url) { StringBean sb = new StringBean(); sb.setLinks(true); sb.setCollapse(true); sb.setReplaceNonBreakingSpaces(true);// If true regular space sb.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html"); System.out.println("The Content is :\n" + sb.getStrings()); } public void getContentUsingStringExtractor(String url, boolean link) { StringExtractor se = new StringExtractor(url); String text = null; try { text = se.extractStrings(link); System.out.println("The content is :\n" + text); } catch (ParserException e) { e.printStackTrace(); } } public void getContentUsingParser(String url) { NodeList nl; try { Parser p = new Parser(url); nl = p.parse(new NodeClassFilter(BodyTag.class)); BodyTag bt = (BodyTag) nl.elementAt(0); System.out.println(bt.toPlainTextString()); } catch (ParserException e) { e.printStackTrace(); } } public static void main(String[] args) { GetContent g = new GetContent(); // g.getContentUsingStringBean(""); // g.getContentUsingParser("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html"); g.getContentUsingStringExtractor("http://www.sina.com.cn/", false); }
httpclient htmlparser来查询手机号相关信息
httpclient htmlparser来查询手机号相关信息
http://htmlparser.com.cn/post/20090816119.html
public class GetMobile { /** * @author rrong_m 51jsp.cn * @throws IOException * @throws HttpException * @throws IOException * @throws HttpException * @throws ParserException */ public static String getPostString(String mobile) throws HttpException, IOException//发送数据51jsp.cn { HttpClient hc=new HttpClient(); PostMethod pm=new PostMethod("http://www.ip138.com:8080/search.asp"); hc.getParams().setContentCharset("gb2312"); pm.addParameter("mobile",mobile); pm.addParameter("action","mobile"); hc.executeMethod(pm); return pm.getResponseBodyAsString(); } public static void getMobileInfor(String poststring) throws ParserException { Parser parser=new Parser(poststring); NodeList nodelist=null; NodeFilter filter=new HasAttributeFilter("class","tdc2"); nodelist=parser.extractAllNodesThatMatch(filter); for(int i=0;i<nodelist.size();i++) { System.out.println(nodelist.elementAt(i).toPlainTextString().replace(" ","")); } } public static void main(String[] args) throws HttpException, IOException, ParserException { getMobileInfor(getPostString("1380001"));//直接填写想要查询的手机号码 } }
相关推荐
C++、MFC源代码parser_src
C++、MFC源代码parser_demo
HTML parser with asm inline
LPSV2.D2 是 Log Parser 2.2 的 GUI
html parser delphi7 delphixe2 ....
Html parser, parse html page, and get all of the tags
对html进行转换,适用爬虫/小程序富文本显示等;对html进行转换,适用爬虫/小程序富文本显示等对html进行转换,适用爬虫/小程序富文本显示等对html进行转换,适用爬虫/小程序富文本显示等
enju parser enju parser
基于JavaParser的代码调用链分析,可以用于分析Java代码的方法调用链
本资源配笔主写的博文食用最佳哦! 当初学习之用写到的demo 涨到了10分下载 现在又修改为0分下载,不知道是否可行
simple_html_dom中文解析手册
html5-parser一个用于Python基于C的快速HTML5解析
前端开源库-html-react-parserHTML React Parser,一个HTML to React Parser。
Apk parser lib, for decoding binary xml file, getting apk meta info. Table of Contents Features Get apk-parser Usage 1. Apk info 2. Get binary xml and manifest xml file 3. Get dex classes 4. Get Apk ...
MSXML 4.0 SP2 Parser and SDK
python html parser
刚开始解码H264视频时,查阅文档说有Nalu头,参考雷神代码,将文件指针传入去掉头可以解码,但是需要每次传入定量buffer解码,不是传入文件指针,经过修改可以用有bug
Html_Parser Third party
JavaParser This project contains a set of libraries implementing a Java 1.0 - Java 12 Parser with advanced analysis functionalities. Our main site is at JavaParser.org Setup The project binaries ...