Parsing HTML Files with HtmlParser


    It has been four months since I first used htmlparser. I want to organize my notes here for future reference.

package epson;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;


public class HtmlAnalysis {
    private String metaDataString;
    private String title;
    private String charset;
    private String contentType;
    private String content;
    private String link;
    
    
    private String localPath ;
    private Parser parser = null;
    private String htmlsource=null;
    
    public static final String META_KEYWORDS="keywords";
    public static final String META_AUTHOR="author";
    public static final String META_DESCRIPTION="description";
    public static final String META_HTTP_EQUIV="http-equiv";
    
    public HtmlAnalysis(String htmlsource){
    	this.htmlsource = htmlsource; 
    }
    
    public HtmlAnalysis(File htmlsource){
    	try{
    		this.htmlsource = this.getContentByLocalFile(htmlsource);
    	}catch(Exception e){
    		e.printStackTrace();
    	}
    }
    
    public void init() throws Exception{
    	// Build the parser over the HTML source collected by the constructor.
    	parser = new Parser(this.htmlsource);
    }
    
    
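    // Returns the content of the <meta name="keywords"> tag, or "" if there is none.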
    public String getMetaKeywords(){
    	String metaKeywords = "";
        	
    	try {
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
				MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
				String cont  = mt.getAttribute("name") ;
				
				if (cont!=null && cont.equalsIgnoreCase("Keywords")) {
					metaKeywords = mt.getAttribute("content");
					break;
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaKeywords;
    }

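    // Returns the text of the <title> tag, or "" if there is none.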
    public String getTitle() {
    	String title="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(TitleTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
                TitleTag titlenode = (TitleTag) nodeList.elementAt(i) ;
                title = titlenode.getTitle();
                break;
			}    
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return title;
    }

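    // Returns the inner HTML of the <body> tag, or "" if there is none.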
    public String getBody() {
    	String body="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(BodyTag.class);
			NodeList nodeList = parser.parse(nt);
			for (int i = 0; i < nodeList.size(); i++) {
				BodyTag bodynode = (BodyTag) nodeList.elementAt(i);
				body = bodynode.getChildrenHTML();
				break;
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return body;
    }

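    // Returns the value of the onload attribute of <body>, if any.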
    public String getBodyOnload() {
    	String bodyonload=""; 	
    	try {
			NodeFilter nt = new NodeClassFilter(BodyTag.class);
			NodeList nodeList = parser.parse(nt);
			for (int i = 0; i < nodeList.size(); i++) {
				BodyTag bodynode = (BodyTag) nodeList.elementAt(i);
				bodyonload = bodynode.getAttribute("onload");
				break;
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return bodyonload;
    }    
    
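    // Returns the inner HTML of <head> with its <meta> and <title> children removed.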
    public String getHeadInfo() {
    	String head="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(HeadTag.class) ;
			NodeList nodeList = parser.parse(nt);
            
			HeadTag headnode = null;
			for (int i = 0 ; i< nodeList.size(); i++) {
				headnode = (HeadTag) nodeList.elementAt(i) ;
				break;
			}  
			
			
			if (headnode != null) {
				// Drop the <meta> and <title> children; iterate backwards so
				// removals do not shift the indices still to be visited.
				NodeList children = headnode.getChildren();
				for (int i = children.size() - 1; i >= 0; i--) {
					Node node = children.elementAt(i);
					if (node instanceof MetaTag || node instanceof TitleTag) {
						headnode.removeChild(i);
					}
				}
				head = headnode.getChildrenHTML();
			}
            
            
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return head;
    } 
    
    
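    // Looks up <meta> content by key: "keywords"/"author"/"description" match by name,
    // "http-equiv" matches against the http-equiv attribute, and any other key concatenates
    // the remaining named metas as <name>content</name> pairs.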
    public String getMetaInfo(String keytype){
    	String metaInfo = "";
        	
    	try {
    		
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			
    		if(META_KEYWORDS.equalsIgnoreCase(keytype)
    			||
    			META_AUTHOR.equalsIgnoreCase(keytype)
    			||
    			META_DESCRIPTION.equalsIgnoreCase(keytype))
    		{

				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null && cont.equalsIgnoreCase(keytype)) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else if(META_HTTP_EQUIV.equals(keytype)){
				for (int i = 0; i < nodeList.size(); i++) {
					MetaTag mt = (MetaTag) nodeList.elementAt(i);
					// take the first <meta> that declares an http-equiv attribute
					String cont = mt.getAttribute("http-equiv");
					if (cont != null) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else{
				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null) {
						
						if(META_KEYWORDS.equalsIgnoreCase(cont)
				    			||
				    			META_AUTHOR.equalsIgnoreCase(cont)
				    			||
				    			META_DESCRIPTION.equalsIgnoreCase(cont)){
							
							//
						}else{
							String tempmetaInfo = mt.getAttribute("content");
							metaInfo +="<"+cont+">"+tempmetaInfo+"</"+cont+">";
						}
							
						
					}
				}
    			
    		}
    		
    		
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaInfo;
    }
    
    
    public String getContentByLocalFile(File path) throws IOException {
    	// Read the whole file into a string, preserving line breaks.
    	StringBuilder sbStr = new StringBuilder();
    	BufferedReader reader = new BufferedReader(new FileReader(path));
    	try {
    		String temp;
    		while ((temp = reader.readLine()) != null) {
    			sbStr.append(temp);
    			sbStr.append("\r\n");
    		}
    	} finally {
    		reader.close();
    	}
    	return sbStr.toString();
    }

    
    public String getContentByUrl(String url){
    	// Not implemented yet.
    	return null;
    }
    
    public void getmetaDataByVistor() {
    	// Not implemented yet.
    }

    public String getURLContent(String url) {
        // Extracts the plain text of the page at the given URL.
        try {
            Parser urlParser = new Parser(url);
            TextExtractingVisitor visitor = new TextExtractingVisitor();
            urlParser.visitAllNodesWith(visitor);
            content = visitor.getExtractedText();
        } catch (ParserException e1) {
            e1.printStackTrace();
        }
        return content;
    }
    public NodeList getDiv(){
    	  // Collects every <div id="Cont_13"> node from the document.
    	  NodeList nodelist = null;
    	  NodeFilter[] nodeFilter = new NodeFilter[2];
    	  try{
    	   parser.setEncoding("GB2312"); // set the encoding
    	   TagNameFilter divFilter = new TagNameFilter("div"); // match <div> tags
    	   HasAttributeFilter divAttribute = new HasAttributeFilter("id","Cont_13"); // with id="Cont_13"
    	   nodeFilter[0] = divFilter;
    	   nodeFilter[1] = divAttribute;
    	   AndFilter andFilter = new AndFilter(nodeFilter); // both conditions must hold
    	   nodelist = parser.extractAllNodesThatMatch(andFilter); // collect every matching node
    	  }catch(Exception e){
    	   e.printStackTrace();
    	  }
    	  return nodelist;
    }
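    // Parses the tables inside the div returned by getDiv(), prints every cell with the first
    // column wrapped in <title> and the rest in <id>, and returns the matched table nodes.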
    public NodeList getTable() throws ParserException{
    	NodeList nodelist=null;
    	String dd = getDiv().toHtml();
    	Parser parser2 = new Parser(dd);
    	TagNameFilter tableFilter=new TagNameFilter("table");
    	nodelist = parser2.extractAllNodesThatMatch(tableFilter);
    	String htmlresult ="";
    	for (int i = 0; i < nodelist.size(); i++) {
            if (nodelist.elementAt(i) instanceof TableTag) {
                TableTag tag = (TableTag) nodelist.elementAt(i);
                TableRow[] rows = tag.getRows();

                for (int j = 0; j < rows.length; j++) {
                    TableRow tr = (TableRow) rows[j];
                    TableColumn[] td = tr.getColumns();
                    for (int k = 0; k < td.length; k++) {
                    	String result = td[k].toPlainTextString().trim().replace("\t", "");
                    	if(k==0){
                    		htmlresult += "<title>"+result+"</title>";
                    	}
                    	else
                    		htmlresult += "<id>"+result+"</id>";
                    }
                }
            }
        }
    	System.out.println(htmlresult);
		return nodelist;
    }
    public void testTable() {
//        Parser myParser;
        NodeList nodeList = null;
//        myParser = Parser.createParser("<body> " + "<table id='table1' >"
//                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
//                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
//                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
//                + "<table id='table2' >"
//                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
//                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
//                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
//                + "</body>", "GBK");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });
        try {
            nodeList = parser.parse(lastFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                if (nodeList.elementAt(i) instanceof TableTag) {
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();

                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = (TableRow) rows[j];
                        TableColumn[] td = tr.getColumns();
                        for (int k = 0; k < td.length; k++) {
                            System.out.println("<td>" + td[k].toPlainTextString());
                        }

                    }

                }
            }

        } catch (ParserException e) {
            e.printStackTrace();
        }
    }
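    // Collects the <img> tags and prints each image URL; the path-rewriting logic that
    // resolved relative URLs against the local file is left commented out below.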
    public String getImg() {
    	String img="";
    	ImageTag imgnode=null;
    	File file = new File("e:\\test\\jsp\\jsp\\test1.htm");
    	String imgRealPath="";
    	// Recreate the output file so it starts out empty.
    	if(file.exists())
    	{  
    		file.delete();  
    	}
    	try 
    	{
    		file.createNewFile();
    	} catch (IOException e) 
    	{
    		e.printStackTrace();
    	}   
    	try {
			NodeFilter nt = new NodeClassFilter(ImageTag.class) ;
			
			//BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
			NodeList nodeList = parser.parse(nt);
			
			for (int i = 0 ; i< nodeList.size(); i++){
				int num=0;
				imgnode = (ImageTag)nodeList.elementAt(i);
                img = imgnode.getImageURL();
                System.out.println(img);
               /* String[] filePath = file.getParent().split("\\\\");
                String[] imgPath = img.split("/");
                System.out.println(img+"  "+file.getParent());
                for(int j=0;j<imgPath.length;j++)
                {
                	if(imgPath[j].equals(".."))
                	{	
                		num++;
                	}
                }
                System.out.println(img.indexOf(":")+"img.indexOf(:)"+img);
                if(img.indexOf(":")!=-1)
                {
                	imgRealPath=img;
                }
                else if(num>1)
                {
                	System.out.println("img before replace"+img);
                	img = img.replace("../","");
                	System.out.println("img num>1"+img+num);
                	imgRealPath = filePath[filePath.length-1-num]+"/"+img;
                	while((filePath.length-1-num)>0)
                	{
                		num++;
                		imgRealPath = filePath[filePath.length-1-num]+imgRealPath;
                	}
                	System.out.println("imgRealPath"+imgRealPath+(filePath.length-1-num));
                }
                else if(imgPath[0].equals("."))
                {
                	System.out.println(file.getParent()+"imgPath[0].equals(.)");
                	img = img.replace("./","");
                	imgRealPath=file.getParent()+"\\"+img;
                }
                else
                {
	                for(int j=0;j<imgPath.length;j++)
	                {
	                	if(imgPath[j].equals(".."))
	                	{
	                		imgPath[j] = (String)( imgPath[j].replace("..",filePath[j+1]));
	                		System.out.println(imgPath[j]);
	                	}
	                	if(!imgPath[j].equals(""))
	                		imgRealPath += "/"+imgPath[j];
	                }
	                imgRealPath=filePath[0]+imgRealPath;
                }
                imgRealPath = imgRealPath.replaceAll("\\\\","/");
                imgnode.setImageURL(imgRealPath);
                imgRealPath="";
                writer.write(imgnode.toHtml()); */
			}  
			//writer.flush();
           // writer.close ();  
		} catch (Exception e) {
			e.printStackTrace();
		}
        return imgRealPath;
    }

    public static void main(String[] args) {
    	HtmlAnalysis htmlAnalysis= new HtmlAnalysis(new File("f:\\test.html")); 
    	try{
    		htmlAnalysis.init();
//    		System.out.println(htmlAnalysis.getMetaInfo("keywords"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("author"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("description"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("other"));
//    		htmlAnalysis.parser.reset();
    		//System.out.println(htmlAnalysis.getTitle());
    		//htmlAnalysis.parser.reset();
    		//System.out.println(htmlAnalysis.getHeadInfo());
    		htmlAnalysis.getTable();
//    		htmlAnalysis.testTable();
    	}catch(Exception e){
    		e.printStackTrace();
    	}

    }
    
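    // Prints the tag name and class of every tag that carries a class attribute.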
    public static void visitTag(Tag tag) {
        if (tag.getAttribute("class") != null) {
            System.out.println(" " + tag.getTagName() +
                tag.getAttribute("class"));
        }
    }
    
    

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public String getMetaDataString() {
        return metaDataString;
    }

    public void setMetaDataString(String metaDataString) {
        this.metaDataString = metaDataString;
    }



    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
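
One note on reusing the parser: the commented-out lines in main() call htmlAnalysis.parser.reset() between extractions, which suggests that one parse pass consumes the input and the parser has to be rewound before the next getXxx() call. A minimal sketch of a main() that does two extractions, using the same placeholder path as above:

    public static void main(String[] args) throws Exception {
        HtmlAnalysis htmlAnalysis = new HtmlAnalysis(new File("f:\\test.html"));
        htmlAnalysis.init();
        System.out.println(htmlAnalysis.getTitle());         // first pass over the document
        htmlAnalysis.parser.reset();                          // rewind before parsing again
        System.out.println(htmlAnalysis.getMetaKeywords());   // second pass now sees the input
    }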

 

Comments
#1  menglinxi  2009-09-29
org.htmlparser does not seem to be able to fetch links whose domain contains special characters, such as http://www.oj-sz.com; it fails with
org.htmlparser.util.ParserException: Exception getting input stream from http://www.oj-sz.com/ (Not in GZIP format).;
java.io.IOException: Not in GZIP format.
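
A possible workaround for the "Not in GZIP format" error above is to download the page yourself without asking for gzip and hand the raw HTML string to the parser via Parser.createParser(...). This is only a sketch: the URL is the one from the comment and the charset is an assumption, so adjust both for the real page.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Parser;
import org.htmlparser.visitors.TextExtractingVisitor;

public class FetchWithoutGzip {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://www.oj-sz.com/");              // URL taken from the comment above
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty("Accept-Encoding", "identity");  // ask the server not to gzip the body
        BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8")); // charset is an assumption
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null) {
            html.append(line).append("\n");
        }
        in.close();

        // Feed the downloaded HTML to htmlparser instead of letting it open the URL itself.
        Parser parser = Parser.createParser(html.toString(), "UTF-8");
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        System.out.println(visitor.getExtractedText());
    }
}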
