关于lucene处理word、excel、ppt、txt、pdf等格式文件

JAVA海洋

浏览: 601119 次
性别:
来自: 太原

最近访客更多访客>>

kristy_yy

jin_config

xulong

vision_xie

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java Web 开发

Excel lucene ITeye J#

使用 Lucene 建立索引之前，需要先从 Word、Excel、PPT、TXT、PDF 等格式的文件中提取纯文本内容，提取文本的代码如下：

/**
 * Static helpers that extract plain text from Word, Excel, PowerPoint, PDF
 * and plain-text files, given the path of the file on disk.
 *
 * Every method opens the file itself and releases the underlying file handle
 * before returning, whether extraction succeeds or fails.
 */
public class ReadFile {

	/**
	 * Extracts the text of a Word 2003 (.doc) document.
	 *
	 * @param path path of the file to read
	 * @return the text reported by {@code WordExtractor.getText()}
	 * @throws Exception if the file cannot be opened or parsed
	 */
	public static String readWord(String path) throws Exception {
		InputStream inputStream = new FileInputStream(path);
		try {
			WordExtractor extractor = new WordExtractor(inputStream);
			return extractor.getText();
		} finally {
			// The original never closed the stream, leaking a file handle per call.
			inputStream.close();
		}
	}

	/**
	 * Extracts the text of a Word 2007 (.docx) document.
	 *
	 * @param path path of the file to read
	 * @return the text reported by {@code XWPFWordExtractor.getText()}
	 * @throws IOException if the file cannot be read
	 * @throws OpenXML4JException if the OOXML package is invalid
	 * @throws XmlException if the document XML cannot be parsed
	 */
	public static String readWord2007(String path) throws IOException, OpenXML4JException, XmlException {
		OPCPackage opcPackage = POIXMLDocument.openPackage(path);
		try {
			POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
			return ex.getText();
		} finally {
			// revert() releases the package without writing anything back;
			// close() would try to save the (unmodified) package to disk.
			opcPackage.revert();
		}
	}

	/**
	 * Extracts the text of an Excel 2003 (.xls) workbook. Formula cells are
	 * reported as their formula text and sheet names are omitted.
	 *
	 * @param path path of the file to read
	 * @return the extracted text, or {@code null} if the file does not exist
	 *         (kept for backward compatibility; the stack trace is printed)
	 * @throws IOException if the file exists but cannot be read or parsed
	 */
	public static String ReadExcel(String path) throws IOException {
		InputStream inputStream = null;
		String content = null;
		try {
			inputStream = new FileInputStream(path);
			HSSFWorkbook wb = new HSSFWorkbook(inputStream);
			ExcelExtractor extractor = new ExcelExtractor(wb);
			extractor.setFormulasNotResults(true);
			extractor.setIncludeSheetNames(false);
			content = extractor.getText();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			if (inputStream != null) {
				inputStream.close();
			}
		}
		return content;
	}

	/**
	 * Extracts the text of an Excel 2007 (.xlsx) workbook by visiting every
	 * cell of every row of every sheet. Cell values are concatenated without
	 * any separator between cells, rows or sheets.
	 *
	 * @param path path of the file to read
	 * @return the concatenated cell contents
	 * @throws IOException if the file cannot be read or parsed
	 */
	public static String readExcel2007(String path) throws IOException {
		StringBuilder content = new StringBuilder();
		// Build the XSSFWorkbook directly from the file path.
		XSSFWorkbook xwb = new XSSFWorkbook(path);
		try {
			// Iterate over the sheets.
			for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
				XSSFSheet xSheet = xwb.getSheetAt(numSheet);
				if (xSheet == null) {
					continue;
				}
				// Iterate over the rows; getLastRowNum() is the index of the last row.
				for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
					XSSFRow xRow = xSheet.getRow(rowNum);
					if (xRow == null) {
						continue;
					}
					// Iterate over the cells; getLastCellNum() is already
					// "last index + 1", so the bound is exclusive.
					for (int cellNum = 0; cellNum < xRow.getLastCellNum(); cellNum++) {
						XSSFCell xCell = xRow.getCell(cellNum);
						if (xCell == null) {
							continue;
						}
						appendCell(content, xCell);
					}
				}
			}
		} finally {
			// Release the file handle without writing anything back to the file.
			xwb.getPackage().revert();
		}

		return content.toString();
	}

	/**
	 * Appends one cell's content to {@code content}, choosing the accessor
	 * that matches the cell type so that formula and error cells do not make
	 * {@code getStringCellValue()} throw.
	 */
	private static void appendCell(StringBuilder content, XSSFCell xCell) {
		int cellType = xCell.getCellType();
		if (cellType == XSSFCell.CELL_TYPE_BOOLEAN) {
			content.append(xCell.getBooleanCellValue());
		} else if (cellType == XSSFCell.CELL_TYPE_NUMERIC) {
			content.append(xCell.getNumericCellValue());
		} else if (cellType == XSSFCell.CELL_TYPE_FORMULA) {
			// Report the formula text, matching ReadExcel's setFormulasNotResults(true).
			content.append(xCell.getCellFormula());
		} else if (cellType == XSSFCell.CELL_TYPE_ERROR) {
			content.append(xCell.getErrorCellString());
		} else {
			content.append(xCell.getStringCellValue());
		}
	}

	/**
	 * Extracts the text of a PowerPoint (.ppt) presentation.
	 *
	 * This method is best-effort: any failure is printed to standard output
	 * and whatever text was collected so far (possibly empty) is returned.
	 *
	 * @param path path of the file to read
	 * @return the concatenated text of all text runs on all slides
	 */
	public static String readPowerPoint(String path) {
		StringBuilder content = new StringBuilder();
		InputStream inputStream = null;
		try {
			inputStream = new FileInputStream(path);
			// Build the SlideShow from the single stream opened above; the
			// original opened a second, never-closed stream on the same file.
			SlideShow ss = new SlideShow(new HSLFSlideShow(inputStream));
			Slide[] slides = ss.getSlides();// every slide of the presentation
			for (int i = 0; i < slides.length; i++) {
				TextRun[] t = slides[i].getTextRuns();// text runs hold the slide's text
				for (int j = 0; j < t.length; j++) {
					content.append(t[j].getText());// accumulate the text
				}
			}
		} catch (Exception ex) {
			System.out.println(ex.toString());
		} finally {
			if (inputStream != null) {
				try {
					inputStream.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}
		return content.toString();
	}

	/**
	 * Extracts the text of a PDF document.
	 *
	 * @param path path of the file to read
	 * @return the text written by {@code PDFTextStripper}
	 * @throws IOException if the file cannot be opened, read or parsed
	 */
	public static String readPdf(String path) throws IOException {
		FileInputStream fis = new FileInputStream(path);
		PDDocument pdfDocument = null;
		try {
			pdfDocument = PDDocument.load(fis);
			PDFTextStripper stripper = new PDFTextStripper();
			StringWriter writer = new StringWriter();
			stripper.writeText(pdfDocument, writer);
			return writer.toString();
		} finally {
			// Failures propagate to the caller as the declared IOException;
			// the original terminated the whole JVM with System.exit(1).
			try {
				if (pdfDocument != null) {
					// Closing the PDDocument also closes its COSDocument.
					pdfDocument.close();
				}
			} finally {
				fis.close();
			}
		}
	}

	/**
	 * Reads a plain-text file that is encoded in GBK.
	 *
	 * Each line is followed by a carriage return ({@code "\r"}) in the result,
	 * and the final string is trimmed.
	 *
	 * @param path path of the file to read
	 * @return the trimmed file content
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String readTxt(String path) throws IOException {
		StringBuilder sb = new StringBuilder();
		InputStream is = new FileInputStream(path);
		try {
			// The file is decoded as GBK; any other charset garbles the Chinese text.
			BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK"));
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append("\r");
			}
		} finally {
			// Closing the underlying stream releases the file handle on every path.
			is.close();
		}
		return sb.toString().trim();
	}

}

使用jar包如下：