`

不同类型文件读取工具类

阅读更多

一个可读取 Excel、HTML、PDF、TXT、Word 文件的工具类:

  

package com.topsoft.info.services;

/**
 * Created by IntelliJ IDEA.
 * User:
 * Date: 2009-10-9
 * Time: 10:40:12
 * To change this template use File | Settings | File Templates.
 */

import java.io.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;


/**
 * Extracts plain text from several file formats: Excel workbooks (via
 * {@code HSSFWorkbook}), HTML, PDF, plain text and Word documents (via
 * {@code HWPFDocument}).
 *
 * <p>Every reader returns an empty string when the path is {@code null} or
 * does not exist. Any failure while reading is logged at debug level and
 * rethrown as a {@link RuntimeException} whose cause is the original
 * exception. Streams opened by a reader are always closed before it returns.
 */
public class FileParseDomainImpl implements FileParseDomain {
    private static final Log log = LogFactory
            .getLog(FileParseDomainImpl.class);

    /**
     * Reads the text of an Excel workbook.
     *
     * @param filePath path of the workbook
     * @return the extracted text, trimmed; {@code ""} if the file is missing
     * @throws RuntimeException if the workbook cannot be read
     */
    public String readExcel(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        String content = "";
        InputStream in = null;
        try {
            in = new FileInputStream(filePath);
            ExcelExtractor extractor = new ExcelExtractor(new HSSFWorkbook(in));
            // Ask for formula results rather than formula text, and leave
            // the sheet names out of the output.
            extractor.setFormulasNotResults(false);
            extractor.setIncludeSheetNames(false);
            content = extractor.getText();
        } catch (Exception ex) {
            log.debug("读取excel文件出错" + ex.getMessage(), ex);
            throw new RuntimeException("读取excel文件出错" + ex.getMessage(), ex);
        } finally {
            closeQuietly(in);
        }
        return content.trim();
    }

    /**
     * Reads an HTML file as raw markup, decoding it as GBK.
     *
     * <p>The charset is fixed; a file whose declared encoding differs will
     * come back garbled. Each line read is terminated with {@code '\n'} in
     * the result.
     *
     * @param filePath path of the HTML file
     * @return the file content; {@code ""} if the file is missing
     * @throws RuntimeException if the file cannot be read
     */
    public String readHtml(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        InputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(filePath);
            reader = new BufferedReader(new InputStreamReader(in, "gbk"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (Exception ex) {
            log.debug("读取html或htm文件出错" + ex.getMessage(), ex);
            throw new RuntimeException("读取html或htm文件出错" + ex.getMessage(), ex);
        } finally {
            // The stream is closed separately in case wrapping it in a
            // reader failed; closing it a second time is harmless.
            closeQuietly(reader);
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Reads the text of a PDF file.
     *
     * @param filePath path of the PDF file
     * @return the extracted text; {@code ""} if the file is missing
     * @throws RuntimeException if the file cannot be parsed
     */
    public String readPDF(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        InputStream in = null;
        try {
            in = new FileInputStream(filePath);
            PDFParser parser = new PDFParser(in);
            parser.parse();
            try {
                content.append(new PDFTextStripper().getText(parser.getPDDocument()));
            } finally {
                closeParsedDocument(parser);
            }
        } catch (Exception ex) {
            log.debug("读取pdf文件出错" + ex.getMessage(), ex);
            throw new RuntimeException("读取pdf文件出错" + ex.getMessage(), ex);
        } finally {
            closeQuietly(in);
        }
        return content.toString();
    }

    /**
     * Reads a plain-text file using the platform default charset
     * ({@link FileReader}).
     *
     * <p>Each line read is terminated with {@code '\r'} in the result, and
     * the whole result is trimmed.
     *
     * @param filePath path of the text file
     * @return the file content; {@code ""} if the file is missing
     * @throws RuntimeException if the file cannot be read
     */
    public String readTxt(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(filePath));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\r');
            }
        } catch (Exception e) {
            log.debug("读取文本文件出错" + e.getMessage(), e);
            throw new RuntimeException("读取文本文件出错" + e.getMessage(), e);
        } finally {
            closeQuietly(reader);
        }
        return content.toString().trim();
    }

    /**
     * Reads the text of a Word document by concatenating the text of every
     * paragraph in the document range.
     *
     * @param filePath path of the Word document
     * @return the concatenated paragraph text; {@code ""} if the file is
     *         missing
     * @throws RuntimeException if the document cannot be read
     */
    public String readWord(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        StringBuilder text = new StringBuilder();
        InputStream in = null;
        try {
            in = new FileInputStream(filePath);
            Range range = new HWPFDocument(in).getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                text.append(paragraph.text());
            }
        } catch (Exception ex) {
            log.debug("读取word文件出错" + ex.getMessage(), ex);
            throw new RuntimeException("读取word文件出错" + ex.getMessage(), ex);
        } finally {
            closeQuietly(in);
        }
        // The previous post-processing ran replaceAll(".*?", ""),
        // replaceAll("\\d*?", "") and replaceAll("[]", ""). The first two
        // only ever match the empty string, so they changed nothing; the
        // third is not a valid regular expression (unclosed character class)
        // and threw PatternSyntaxException for every document.
        // NOTE(review): those patterns look like they lost characters at
        // some point (possibly Word control characters). If such stripping is
        // wanted, reinstate it with explicit \\uXXXX escapes.
        return text.toString();
    }

    /**
     * Reads an HTML file and reduces it to its text content: the text of
     * every text node plus the target of every link, concatenated in
     * document order, with {@code &nbsp;} entities removed.
     *
     * @param filePath path of the HTML file
     * @return the extracted text; {@code ""} if the file is missing
     * @throws RuntimeException if the file cannot be read or parsed
     */
    public String readHtmlText(String filePath) {
        if (isMissing(filePath)) {
            return "";
        }
        Parser parser = Parser.createParser(readHtml(filePath), "GBK");
        OrFilter anyOf = new OrFilter();
        anyOf.setPredicates(new NodeFilter[]{
                new NodeClassFilter(TextNode.class),
                new NodeClassFilter(LinkTag.class),
                new NodeClassFilter(MetaTag.class),
                new NodeClassFilter(StyleTag.class),
                new NodeClassFilter(Div.class),
                new NodeClassFilter(FormTag.class)});
        NodeList nodeList;
        try {
            nodeList = parser.parse(anyOf);
        } catch (ParserException e) {
            log.debug("html过滤出错(readHtmlText)" + e.getMessage(), e);
            throw new RuntimeException("html过滤出错(readHtmlText)"
                    + e.getMessage(), e);
        }
        StringBuilder result = new StringBuilder();
        Node[] nodes = nodeList.toNodeArray();
        for (int i = 0; i < nodes.length; i++) {
            Node node = nodes[i];
            // Reset for every node: only text and link nodes contribute, and
            // other matched tags must not re-append the previous node's text.
            String line = null;
            if (node instanceof TextNode) {
                line = ((TextNode) node).getText();
            } else if (node instanceof LinkTag) {
                line = ((LinkTag) node).getLink();
            }
            if (isTrimEmpty(line)) {
                continue;
            }
            result.append(line);
        }
        return result.toString().replaceAll("&nbsp;", "");
    }

    /**
     * Tells whether a path cannot be read because it is {@code null} or
     * names nothing that exists on disk.
     *
     * @param filePath path to check, may be {@code null}
     * @return {@code true} if there is no file to read
     */
    private static boolean isMissing(String filePath) {
        return filePath == null || !new File(filePath).exists();
    }

    /**
     * Closes a resource if it was opened, logging a close failure instead of
     * propagating it so that it cannot hide the reader's real outcome.
     *
     * @param resource resource to close, may be {@code null}
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            log.warn("Failed to close resource", e);
        }
    }

    /**
     * Best-effort release of the document held by a parser on which
     * {@code parse()} has already succeeded.
     *
     * <p>NOTE(review): this assumes {@code getPDDocument()} hands back a
     * wrapper around the parser's single underlying document, so closing a
     * second wrapper releases the same resources. Confirm against the PDFBox
     * version in use.
     *
     * @param parser parser whose document should be closed
     */
    private static void closeParsedDocument(PDFParser parser) {
        try {
            parser.getPDDocument().close();
        } catch (Exception e) {
            log.warn("Failed to close PDF document", e);
        }
    }

    /**
     * Tells whether a string is {@code null}, empty, or empty once trimmed.
     *
     * @param astr string to test, may be {@code null}
     * @return {@code true} if the string carries no visible content
     */
    private boolean isTrimEmpty(String astr) {
        return isBlank(astr) || isBlank(astr.trim());
    }

    /**
     * Tells whether a string is {@code null} or has length zero.
     *
     * @param astr string to test, may be {@code null}
     * @return {@code true} if the string is {@code null} or empty
     */
    private static boolean isBlank(String astr) {
        return astr == null || astr.length() == 0;
    }
}

 所用到的包在附件中。

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics