Parsing HTML Files with HtmlParser


    It has been four months since I first used htmlparser. I want to organize my notes here for future reference.

package epson;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;


public class HtmlAnalysis {
    private String metaDataString;
    private String title;
    private String charset;
    private String contentType;
    private String content;
    private String link;
    
    
    private String localPath ;
    private Parser parser = null;
    private String htmlsource=null;
    
    public static final String META_KEYWORDS="keywords";
    public static final String META_AUTHOR="author";
    public static final String META_DESCRIPTION="description";
    public static final String META_HTTP_EQUIV="http-equiv";
    
    public HtmlAnalysis(String htmlsource){
    	this.htmlsource = htmlsource; 
    }
    
    public HtmlAnalysis(File htmlsource){
    	try{
    		this.htmlsource = this.getContentByLocalFile(htmlsource);
    	}catch(Exception e){
    		e.printStackTrace();
    	}
    }
    
    public void init() throws Exception{
    	// Build the parser over the HTML source collected by the constructor.
    	parser = new Parser(this.htmlsource);
    }
    
    
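    // Returns the content of the <meta name="keywords"> tag, or "" if there is none.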
    public String getMetaKeywords(){
    	String metaKeywords = "";
        	
    	try {
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
				MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
				String cont  = mt.getAttribute("name") ;
				
				if (cont!=null && cont.equalsIgnoreCase("Keywords")) {
					metaKeywords = mt.getAttribute("content");
					break;
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaKeywords;
    }

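    // Returns the text of the <title> tag, or "" if there is none.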
    public String getTitle() {
    	String title="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(TitleTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
                TitleTag titlenode = (TitleTag) nodeList.elementAt(i) ;
                title = titlenode.getTitle();
                break;
			}    
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return title;
    }

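    // Returns the inner HTML of the <body> tag, or "" if there is none.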
    public String getBody() {
    	String body="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(BodyTag.class);
			NodeList nodeList = parser.parse(nt);
			for (int i = 0; i < nodeList.size(); i++) {
				BodyTag bodynode = (BodyTag) nodeList.elementAt(i);
				body = bodynode.getChildrenHTML();
				break;
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return body;
    }

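    // Returns the value of the onload attribute of <body>, if any.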
    public String getBodyOnload() {
    	String bodyonload=""; 	
    	try {
			NodeFilter nt = new NodeClassFilter(BodyTag.class);
			NodeList nodeList = parser.parse(nt);
			for (int i = 0; i < nodeList.size(); i++) {
				BodyTag bodynode = (BodyTag) nodeList.elementAt(i);
				bodyonload = bodynode.getAttribute("onload");
				break;
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return bodyonload;
    }    
    
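    // Returns the inner HTML of <head> with its <meta> and <title> children removed.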
    public String getHeadInfo() {
    	String head="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(HeadTag.class) ;
			NodeList nodeList = parser.parse(nt);
            
			HeadTag headnode = null;
			for (int i = 0 ; i< nodeList.size(); i++) {
				headnode = (HeadTag) nodeList.elementAt(i) ;
				break;
			}  
			
			
			if (headnode != null) {
				// Drop the <meta> and <title> children; iterate backwards so
				// removals do not shift the indices still to be visited.
				NodeList children = headnode.getChildren();
				for (int i = children.size() - 1; i >= 0; i--) {
					Node node = children.elementAt(i);
					if (node instanceof MetaTag || node instanceof TitleTag) {
						headnode.removeChild(i);
					}
				}
				head = headnode.getChildrenHTML();
			}
            
            
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return head;
    } 
    
    
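    // Looks up <meta> content by key: "keywords"/"author"/"description" match by name,
    // "http-equiv" matches against the http-equiv attribute, and any other key concatenates
    // the remaining named metas as <name>content</name> pairs.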
    public String getMetaInfo(String keytype){
    	String metaInfo = "";
        	
    	try {
    		
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			
    		if(META_KEYWORDS.equalsIgnoreCase(keytype)
    			||
    			META_AUTHOR.equalsIgnoreCase(keytype)
    			||
    			META_DESCRIPTION.equalsIgnoreCase(keytype))
    		{

				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null && cont.equalsIgnoreCase(keytype)) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else if(META_HTTP_EQUIV.equals(keytype)){
				for (int i = 0; i < nodeList.size(); i++) {
					MetaTag mt = (MetaTag) nodeList.elementAt(i);
					// take the first <meta> that declares an http-equiv attribute
					String cont = mt.getAttribute("http-equiv");
					if (cont != null) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else{
				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null) {
						
						if(META_KEYWORDS.equalsIgnoreCase(cont)
				    			||
				    			META_AUTHOR.equalsIgnoreCase(cont)
				    			||
				    			META_DESCRIPTION.equalsIgnoreCase(cont)){
							
							//
						}else{
							String tempmetaInfo = mt.getAttribute("content");
							metaInfo +="<"+cont+">"+tempmetaInfo+"</"+cont+">";
						}
							
						
					}
				}
    			
    		}
    		
    		
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaInfo;
    }
    
    
    public String getContentByLocalFile(File path) throws IOException {
    	// Read the whole file into a string, preserving line breaks.
    	StringBuilder sbStr = new StringBuilder();
    	BufferedReader reader = new BufferedReader(new FileReader(path));
    	try {
    		String temp;
    		while ((temp = reader.readLine()) != null) {
    			sbStr.append(temp);
    			sbStr.append("\r\n");
    		}
    	} finally {
    		reader.close();
    	}
    	return sbStr.toString();
    }

    
    public String getContentByUrl(String url){
    	// Not implemented yet.
    	return null;
    }
    
    public void getmetaDataByVistor() {
    	// Not implemented yet.
    }

    public String getURLContent(String url) {
        // Extracts the plain text of the page at the given URL.
        try {
            Parser urlParser = new Parser(url);
            TextExtractingVisitor visitor = new TextExtractingVisitor();
            urlParser.visitAllNodesWith(visitor);
            content = visitor.getExtractedText();
        } catch (ParserException e1) {
            e1.printStackTrace();
        }
        return content;
    }
    public NodeList getDiv(){
    	  // Collects every <div id="Cont_13"> node from the document.
    	  NodeList nodelist = null;
    	  NodeFilter[] nodeFilter = new NodeFilter[2];
    	  try{
    	   parser.setEncoding("GB2312"); // set the encoding
    	   TagNameFilter divFilter = new TagNameFilter("div"); // match <div> tags
    	   HasAttributeFilter divAttribute = new HasAttributeFilter("id","Cont_13"); // with id="Cont_13"
    	   nodeFilter[0] = divFilter;
    	   nodeFilter[1] = divAttribute;
    	   AndFilter andFilter = new AndFilter(nodeFilter); // both conditions must hold
    	   nodelist = parser.extractAllNodesThatMatch(andFilter); // collect every matching node
    	  }catch(Exception e){
    	   e.printStackTrace();
    	  }
    	  return nodelist;
    }
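    // Parses the tables inside the div returned by getDiv(), prints every cell with the first
    // column wrapped in <title> and the rest in <id>, and returns the matched table nodes.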
    public NodeList getTable() throws ParserException{
    	NodeList nodelist=null;
    	String dd = getDiv().toHtml();
    	Parser parser2 = new Parser(dd);
    	TagNameFilter tableFilter=new TagNameFilter("table");
    	nodelist = parser2.extractAllNodesThatMatch(tableFilter);
    	String htmlresult ="";
    	for (int i = 0; i < nodelist.size(); i++) {
            if (nodelist.elementAt(i) instanceof TableTag) {
                TableTag tag = (TableTag) nodelist.elementAt(i);
                TableRow[] rows = tag.getRows();

                for (int j = 0; j < rows.length; j++) {
                    TableRow tr = (TableRow) rows[j];
                    TableColumn[] td = tr.getColumns();
                    for (int k = 0; k < td.length; k++) {
                    	String result = td[k].toPlainTextString().trim().replace("\t", "");
                    	if(k==0){
                    		htmlresult += "<title>"+result+"</title>";
                    	}
                    	else
                    		htmlresult += "<id>"+result+"</id>";
                    }
                }
            }
        }
    	System.out.println(htmlresult);
		return nodelist;
    }
    public void testTable() {
//        Parser myParser;
        NodeList nodeList = null;
//        myParser = Parser.createParser("<body> " + "<table id='table1' >"
//                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
//                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
//                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
//                + "<table id='table2' >"
//                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
//                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
//                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
//                + "</body>", "GBK");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });
        try {
            nodeList = parser.parse(lastFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                if (nodeList.elementAt(i) instanceof TableTag) {
                    TableTag tag = (TableTag) nodeList.elementAt(i);
                    TableRow[] rows = tag.getRows();

                    for (int j = 0; j < rows.length; j++) {
                        TableRow tr = (TableRow) rows[j];
                        TableColumn[] td = tr.getColumns();
                        for (int k = 0; k < td.length; k++) {
                            System.out.println("<td>" + td[k].toPlainTextString());
                        }

                    }

                }
            }

        } catch (ParserException e) {
            e.printStackTrace();
        }
    }
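    // Collects the <img> tags and prints each image URL; the path-rewriting logic that
    // resolved relative URLs against the local file is left commented out below.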
    public String getImg() {
    	String img="";
    	ImageTag imgnode=null;
    	File file = new File("e:\\test\\jsp\\jsp\\test1.htm");
    	String imgRealPath="";
    	// Recreate the output file so it starts out empty.
    	if(file.exists())
    	{  
    		file.delete();  
    	}
    	try 
    	{
    		file.createNewFile();
    	} catch (IOException e) 
    	{
    		e.printStackTrace();
    	}   
    	try {
			NodeFilter nt = new NodeClassFilter(ImageTag.class) ;
			
			//BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
			NodeList nodeList = parser.parse(nt);
			
			for (int i = 0 ; i< nodeList.size(); i++){
				int num=0;
				imgnode = (ImageTag)nodeList.elementAt(i);
                img = imgnode.getImageURL();
                System.out.println(img);
               /* String[] filePath = file.getParent().split("\\\\");
                String[] imgPath = img.split("/");
                System.out.println(img+"  "+file.getParent());
                for(int j=0;j<imgPath.length;j++)
                {
                	if(imgPath[j].equals(".."))
                	{	
                		num++;
                	}
                }
                System.out.println(img.indexOf(":")+"img.indexOf(:)"+img);
                if(img.indexOf(":")!=-1)
                {
                	imgRealPath=img;
                }
                else if(num>1)
                {
                	System.out.println("img before replace"+img);
                	img = img.replace("../","");
                	System.out.println("img num>1"+img+num);
                	imgRealPath = filePath[filePath.length-1-num]+"/"+img;
                	while((filePath.length-1-num)>0)
                	{
                		num++;
                		imgRealPath = filePath[filePath.length-1-num]+imgRealPath;
                	}
                	System.out.println("imgRealPath"+imgRealPath+(filePath.length-1-num));
                }
                else if(imgPath[0].equals("."))
                {
                	System.out.println(file.getParent()+"imgPath[0].equals(.)");
                	img = img.replace("./","");
                	imgRealPath=file.getParent()+"\\"+img;
                }
                else
                {
	                for(int j=0;j<imgPath.length;j++)
	                {
	                	if(imgPath[j].equals(".."))
	                	{
	                		imgPath[j] = (String)( imgPath[j].replace("..",filePath[j+1]));
	                		System.out.println(imgPath[j]);
	                	}
	                	if(!imgPath[j].equals(""))
	                		imgRealPath += "/"+imgPath[j];
	                }
	                imgRealPath=filePath[0]+imgRealPath;
                }
                imgRealPath = imgRealPath.replaceAll("\\\\","/");
                imgnode.setImageURL(imgRealPath);
                imgRealPath="";
                writer.write(imgnode.toHtml()); */
			}  
			//writer.flush();
           // writer.close ();  
		} catch (Exception e) {
			e.printStackTrace();
		}
        return imgRealPath;
    }

    public static void main(String[] args) {
    	HtmlAnalysis htmlAnalysis= new HtmlAnalysis(new File("f:\\test.html")); 
    	try{
    		htmlAnalysis.init();
//    		System.out.println(htmlAnalysis.getMetaInfo("keywords"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("author"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("description"));
//    		htmlAnalysis.parser.reset();
//    		System.out.println(htmlAnalysis.getMetaInfo("other"));
//    		htmlAnalysis.parser.reset();
    		//System.out.println(htmlAnalysis.getTitle());
    		//htmlAnalysis.parser.reset();
    		//System.out.println(htmlAnalysis.getHeadInfo());
    		htmlAnalysis.getTable();
//    		htmlAnalysis.testTable();
    	}catch(Exception e){
    		e.printStackTrace();
    	}

    }
    
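    // Prints the tag name and class of every tag that carries a class attribute.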
    public static void visitTag(Tag tag) {
        if (tag.getAttribute("class") != null) {
            System.out.println(" " + tag.getTagName() +
                tag.getAttribute("class"));
        }
    }
    
    

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public String getMetaDataString() {
        return metaDataString;
    }

    public void setMetaDataString(String metaDataString) {
        this.metaDataString = metaDataString;
    }



    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}
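
One note on reusing the parser: the commented-out lines in main() call htmlAnalysis.parser.reset() between extractions, which suggests that one parse pass consumes the input and the parser has to be rewound before the next getXxx() call. A minimal sketch of a main() that does two extractions, using the same placeholder path as above:

    public static void main(String[] args) throws Exception {
        HtmlAnalysis htmlAnalysis = new HtmlAnalysis(new File("f:\\test.html"));
        htmlAnalysis.init();
        System.out.println(htmlAnalysis.getTitle());         // first pass over the document
        htmlAnalysis.parser.reset();                          // rewind before parsing again
        System.out.println(htmlAnalysis.getMetaKeywords());   // second pass now sees the input
    }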

 

Comments
#1  menglinxi  2009-09-29
org.htmlparser does not seem to be able to fetch links whose domain contains special characters, such as http://www.oj-sz.com; it fails with
org.htmlparser.util.ParserException: Exception getting input stream from http://www.oj-sz.com/ (Not in GZIP format).;
java.io.IOException: Not in GZIP format.
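
A possible workaround for the "Not in GZIP format" error above is to download the page yourself without asking for gzip and hand the raw HTML string to the parser via Parser.createParser(...). This is only a sketch: the URL is the one from the comment and the charset is an assumption, so adjust both for the real page.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Parser;
import org.htmlparser.visitors.TextExtractingVisitor;

public class FetchWithoutGzip {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://www.oj-sz.com/");              // URL taken from the comment above
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty("Accept-Encoding", "identity");  // ask the server not to gzip the body
        BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8")); // charset is an assumption
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null) {
            html.append(line).append("\n");
        }
        in.close();

        // Feed the downloaded HTML to htmlparser instead of letting it open the URL itself.
        Parser parser = Parser.createParser(html.toString(), "UTF-8");
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        System.out.println(visitor.getExtractedText());
    }
}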
