`
maomao1
  • 浏览: 1467 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

各种针对文档操作的代码示例

 
阅读更多
做之前,要先下载 pdfbox 和 poi 的 jar 包,网上很多,很容易找到,代码 demo 如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
*  读取各种文件的类
*  作用:用于读取各种文件的内容
* @author sang
*
*/
public class ReadFile {
private static final Logger log=Logger.getLogger(ReadFile.class);
/**
 * Command-line entry point: extracts the text of a PDF, prints it to
 * stdout followed by a separator line, and writes it to the log.
 *
 * @param args optional; when {@code args[0]} is present it is used as the
 *             PDF path, otherwise the original sample path is used
 */
public static void main(String args[]){
    // Fall back to the sample path so running without arguments behaves as before.
    String path = (args != null && args.length > 0) ? args[0] : "D:\\temp\\106-113.p3.pdf";
    String content = readPdf(path);
    System.out.println(content);
    System.out.println("****************************************************************************************");
    log.info(content);
}
/**
 * Reads the text of a Word document through POI's {@code HWPFDocument}.
 *
 * The text of every paragraph in the document range is concatenated in
 * order. A failure is reported on stdout (instead of being silently
 * swallowed) and whatever was collected up to that point is returned.
 *
 * @param path path of the Word file to read
 * @return the concatenated paragraph text, trimmed; an empty string when
 *         nothing could be read
 * @author sang
 */
public static String readWord(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(fis);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            content.append(paragraph.text());
        }
    } catch (Exception e) {
        // Report the failure the same way readPdf does rather than hiding it.
        System.out.println("读取word文件出现异常");
        e.printStackTrace();
    } finally {
        // Always release the file handle, whether or not reading succeeded.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return content.toString().trim();
}

/**
 * Reads the text of a PDF file using PDFBox's {@code PDFParser} and
 * {@code PDFTextStripper}.
 *
 * On failure a message and the stack trace are printed and whatever was
 * collected so far (normally nothing) is returned.
 *
 * @param path path of the PDF file to read
 * @return the extracted text, trimmed; an empty string when extraction failed
 * @author sang
 */
public static String readPdf(String path) {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    PDDocument document = null;
    try {
        fis = new FileInputStream(path);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        // Keep one reference so the document that was read is the one that gets closed.
        document = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        content.append(stripper.getText(document));
    } catch (Exception e) {
        System.out.println("读取pdf文件出现异常");
        e.printStackTrace();
    } finally {
        // Release the document and the stream even when parsing or extraction threw.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return content.toString().trim();
}

/**
 * Reads the text of a PDF file (variant 2): parses the file, wraps the
 * parser's {@code COSDocument} in a {@code PDDocument} and strips its text.
 *
 * I/O failures are printed via their stack trace. In that case an empty
 * string is returned; previously the method threw a
 * {@code NullPointerException} from {@code trim()} on the unset result.
 *
 * @param path path of the PDF file to read
 * @return the extracted text, trimmed; an empty string when reading failed
 */
public static String readpaffile(String path) {
    String docText = "";
    FileInputStream fis = null;
    PDDocument document = null;
    try {
        fis = new FileInputStream(new File(path));
        PDFParser parser = new PDFParser(fis);
        parser.parse();

        COSDocument cosdoc = parser.getDocument();
        document = new PDDocument(cosdoc);

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);
        if (text != null) {
            docText = text;
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the document and the stream on every path.
        if (document != null) {
            try {
                document.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return docText.trim();
}

/**
 * Reads an HTML file from disk as GBK-encoded text.
 *
 * Each line is appended followed by {@code "\n"}, so a non-empty result
 * always ends with a newline. The charset is fixed to GBK; it has to match
 * the encoding declared by the HTML file or the text comes out garbled.
 * On an I/O failure a message including the exception is printed and
 * whatever was read so far is returned.
 *
 * @param urlString path of the HTML file (a file-system path, not a URL)
 * @return the file content with {@code "\n"} line endings; an empty string
 *         when the file could not be opened
 * @author sang
 */
public static String readHtml(String urlString) {
    StringBuilder content = new StringBuilder();
    FileInputStream fis = null;
    BufferedReader reader = null;
    try {
        fis = new FileInputStream(new File(urlString));
        reader = new BufferedReader(new InputStreamReader(fis, "GBK"));

        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append("\n");
        }
    } catch (IOException e) {
        // Include the cause so the failure can actually be diagnosed.
        System.out.println("读取html出现异常:" + e);
    } finally {
        // Closing the reader closes the stream; fall back to the raw stream
        // when the reader was never created (e.g. unsupported charset).
        try {
            if (reader != null) {
                reader.close();
            } else if (fis != null) {
                fis.close();
            }
        } catch (IOException ignored) {
            // Nothing useful can be done if close itself fails.
        }
    }
    return content.toString();
}

/**
 * Reads a plain-text file line by line.
 *
 * Lines are joined with a single carriage return ({@code "\r"}) and the
 * result is trimmed. The file is decoded with the platform default charset
 * because it is opened through {@code FileReader}. On an I/O failure a
 * message including the exception is printed and whatever was read so far
 * is returned.
 *
 * @param path path of the text file to read
 * @return the file content with {@code "\r"} between lines, trimmed; an
 *         empty string when the file could not be opened
 * @author sang
 */
public static String readTxt(String path) {
    StringBuilder content = new StringBuilder();
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            // NOTE(review): a lone "\r" separator is unusual (readHtml uses "\n");
            // kept as-is because callers may depend on it - confirm intent.
            content.append(line).append("\r");
        }
    } catch (IOException e) {
        // Include the cause so the failure can actually be diagnosed.
        System.out.println("读取txt出现异常:" + e);
    } finally {
        // Closing the BufferedReader also closes the wrapped FileReader.
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return content.toString().trim();
}

/**
 * Reads the text of a PowerPoint (.ppt) file through POI's HSLF API.
 *
 * For every slide the text of each text run is appended, followed by the
 * slide title when one is set. A failure is reported on stdout and the
 * text collected so far is returned; despite the {@code throws} clause,
 * exceptions raised while reading are caught here.
 *
 * @param url path of the PowerPoint file to read
 * @return the concatenated slide text; an empty string when nothing could
 *         be read
 * @throws Exception declared for backward compatibility with existing callers
 * @author sang
 */
public static String ReadPPt(String url) throws Exception {
    StringBuffer content = new StringBuffer("");
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(url);
        SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
        Slide[] slides = ss.getSlides();
        for (int i = 0; i < slides.length; i++) {
            TextRun[] runs = slides[i].getTextRuns();
            for (int j = 0; j < runs.length; j++) {
                content.append(runs[j].getText());
            }
            // getTitle() may return null; appending it blindly would add
            // the literal text "null" to the result.
            // NOTE(review): the title may already be among the text runs above,
            // in which case it appears twice - confirm whether that is intended.
            String title = slides[i].getTitle();
            if (title != null) {
                content.append(title);
            }
        }
    } catch (Exception ex) {
        System.out.println("读取ppt异常:" + ex.toString());
    } finally {
        // Always release the file handle, whether or not reading succeeded.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return content.toString();
}

/**
 * Reads the cell contents of an Excel (.xls) workbook through POI's HSSF API.
 *
 * Every sheet, row and cell is visited in order and the cell values are
 * concatenated without separators. String, numeric, boolean and formula
 * cells are handled according to their cell type; previously any
 * non-string cell made {@code getStringCellValue()} throw, which aborted
 * the whole read. A failure is reported on stdout and the text collected
 * so far is returned; despite the {@code throws} clause, exceptions raised
 * while reading are caught here.
 *
 * @param url path of the Excel file to read
 * @return the concatenated cell contents; an empty string when nothing
 *         could be read
 * @throws Exception declared for backward compatibility with existing callers
 * @author sang
 */
public static String ReadExcel(String url) throws Exception {
    StringBuffer content = new StringBuffer();
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(url);
        HSSFWorkbook workbook = new HSSFWorkbook(fis);
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets);
            if (aSheet == null) {
                continue;
            }
            for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                HSSFRow aRow = aSheet.getRow(rowNumOfSheet);
                if (aRow == null) {
                    continue;
                }
                // getLastCellNum() is the last cell index PLUS ONE, so the bound is exclusive.
                for (int cellNumOfRow = 0; cellNumOfRow < aRow.getLastCellNum(); cellNumOfRow++) {
                    HSSFCell aCell = aRow.getCell(cellNumOfRow);
                    if (aCell == null) {
                        continue;
                    }
                    // Pick the accessor that matches the cell type; calling
                    // getStringCellValue() on a numeric cell throws.
                    switch (aCell.getCellType()) {
                        case HSSFCell.CELL_TYPE_STRING:
                            content.append(aCell.getStringCellValue());
                            break;
                        case HSSFCell.CELL_TYPE_NUMERIC:
                            content.append(aCell.getNumericCellValue());
                            break;
                        case HSSFCell.CELL_TYPE_BOOLEAN:
                            content.append(aCell.getBooleanCellValue());
                            break;
                        case HSSFCell.CELL_TYPE_FORMULA:
                            content.append(aCell.getCellFormula());
                            break;
                        default:
                            // Blank and error cells contribute no text.
                            break;
                    }
                }
            }
        }
    } catch (Exception e) {
        System.out.println(" 读取excel异常 :  " + e);
    } finally {
        // Always release the file handle, whether or not reading succeeded.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close itself fails.
            }
        }
    }
    return content.toString();
}

}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics