`
tiantian911
  • 浏览: 216288 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

Html解析生成纯文本-使用SAX以及htmlcleaner

阅读更多
package testlucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

public class SAXxhtml extends DefaultHandler {
	/**
	 * Logger for this class
	 */
	private static final Logger logger = Logger.getLogger(SAXxhtml.class);

	public StringBuffer sb = new StringBuffer();
	public boolean usable = true;
	private String sPath = "";

	public SAXxhtml() {
		super();
		// TODO Auto-generated constructor stub
		// PropertyConfigurator.configure("log4j.properties");
		BasicConfigurator.configure();
	}

	public void startElement(String namespaceURI, String localName,
			String rawName, Attributes atts) throws SAXException {
		if (rawName.equals("style") || rawName.equals("script")) {
			usable = false;
		}

	}

	// 解析完成后的统计工作
	public void endDocument() throws SAXException {
		try {
			PrintWriter pw = new PrintWriter(new FileOutputStream(sPath));
			pw.print(sb.toString());
			pw.flush();
		} catch (FileNotFoundException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}

	public void characters(char[] ch, int start, int length) {
		String charEncontered = new String(ch, start, length);
		/*
		 * if (!charEncontered.startsWith("<!")||!charEncontered.startsWith("<
		 * ")) { sb.append("\n"); sb.append(charEncontered); }
		 */
		if (usable) {
			sb.append(charEncontered);
			sb.append("\n");
		}

		usable = true;

	}

	
	
	/**
	 * @param args
	 */

	public static void main(String[] args) {
		// TODO Auto-generated method stub
		/*
		 * SAXParserFactory spf = SAXParserFactory.newInstance(); XMLReader
		 * xmlReader = null; SAXParser saxParser = null; try { //
		 * 创建一个解析器SAXParser对象 saxParser = spf.newSAXParser(); //
		 * 得到SAXParser中封装的SAX XMLReader xmlReader = saxParser.getXMLReader();
		 * saxParser.parse(new File("d:/sina.xml"), new SAXxhtml()); } catch
		 * (Exception ex) { logger.error("main(String[]) - " + ex, ex);
		 * System.exit(1); }
		 */

	}

	@Override
	public void endElement(String arg0, String arg1, String arg2)
			throws SAXException {
		// TODO Auto-generated method stub
		super.endElement(arg0, arg1, arg2);

	}

	public void parse(String sPath, String Scontent) {
		this.sPath = sPath;
		try {
			// System.out.println(Scontent);
			HtmlCleaner hc = new HtmlCleaner(Scontent);
			hc.clean();
			PrintWriter pw = new PrintWriter(new FileOutputStream("e:/tmpfile/tmp.txt"));
			pw.print(sb.toString());
			pw.flush();
			pw.close();
			FileInputStream fis = new FileInputStream(new File("e:/tmpfile/tmp.txt"));
			String mid = hc.getBrowserCompactXmlAsString();
			StringReader sr = new StringReader(mid);
			InputSource iSrc = new InputSource(sr);
			System.out.println(iSrc.toString());
			SAXParserFactory spf = SAXParserFactory.newInstance();
			XMLReader xmlReader = null;
			SAXParser saxParser = null;
			// 创建一个解析器SAXParser对象
			saxParser = spf.newSAXParser();
			// 得到SAXParser中封装的SAX XMLReader
			xmlReader = saxParser.getXMLReader();
			saxParser.parse(fis, new SAXxhtml());

		} catch (UnsupportedEncodingException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (ParserConfigurationException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (SAXException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}


	}
}

 

 

具体思路是Html->xml,然后就可以用sax对xml解析,但是程序总调不通,有人能帮助解决一下么?

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics