java pdf

nolan022

浏览: 169037 次
性别:
来自: 青岛

最近访客更多访客>>

james110

smallcici

wuhangirl

AlexTing

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java

Java DOS Apache lucene J#

引用 :http://www.iteye.com/post/599330
各位好：在javaeye好长时间了，一直在各大网站学习各位的经验很感谢各位，目前我遇到一个关于lucene索引的问题，在国内和国外的网站上找了很久也没找到一个比较满意的解决办法，所以在这里想问问大家，希望有过这方面的经验的朋友给些帮助，最好能有些比较好的代码或可行性建议，我的代码大概如下




import com.messagesolution.message.viewer.util.HtmlDocument;
import com.messagesolution.util.logger.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.textmining.text.extraction.WordExtractor;

import java.io.*;


/**
 * Command-line utility that extracts the text of PDF, Word, Excel and HTML
 * documents and writes it to a plain-text file (for example so the result can
 * be fed to a full-text indexer).
 *
 * <p>Every <code>convertXxx</code> method returns <code>true</code> when the
 * target file was written and <code>false</code> when anything went wrong; no
 * exception is propagated to the caller.</p>
 *
 * <p>Extracted text is written using {@link #OUTPUT_ENCODING}. The previous
 * implementation used <code>DataOutputStream.writeBytes</code>, which keeps only
 * the low eight bits of each character and therefore destroyed every
 * character outside Latin-1.</p>
 */
public class DocumentConverter
{
    /** Character encoding of every text file produced by this class. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Extracts the text of a PDF file and writes it to <code>tofile</code>.
     *
     * @param fromfile path of the PDF document to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> if text was extracted and written;
     *         <code>false</code> if the PDF could not be read, contained no
     *         text, or the output could not be written
     */
    public static boolean convertPDF(String fromfile, String tofile)
    {
        PDFParser parser = null;
        FileInputStream in = null;
        try
        {
            String s = null;
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                in = new FileInputStream(new File(fromfile));
                parser = new PDFParser(in);
                parser.parse();
                s = stripper.getText(parser.getDocument());
                if (StringToolKit.isEmpty(s)) {
                    Logger.getInstance().error("read string of pdf is empty");
                    return false;       // nothing to write
                }
            } catch (Exception e) {
                Logger.getInstance().error("read pdf or convert it error");
                e.printStackTrace();
                return false;
            }

            try {
                writeTextFile(tofile, s);
            } catch (Exception e) {
                Logger.getInstance().error("write converted txt error");
                e.printStackTrace();
                return false;
            }
        }
        catch (Throwable t)
        {
            reportFailure("convertPDF", fromfile, t);
            return false;   // something went wrong during the conversion
        }
        finally
        {
            // Each resource is released in its own try block so that a failure
            // while closing the parsed document can no longer leak the stream.
            if (parser != null) {
                try {
                    parser.getDocument().close();
                } catch (Exception ex) {
                    Logger.getInstance().error(ex.toString());
                }
            }
            closeQuietly(in);
        }

        return true;
    }

    /**
     * Extracts the text of a Word document and writes it to <code>tofile</code>.
     *
     * @param fromfile path of the .doc file to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> on success, <code>false</code> on any failure
     */
    public static boolean convertDOC(String fromfile, String tofile)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(new File(fromfile));
            WordExtractor extractor = new WordExtractor();
            String s = extractor.extractText(fis);

            writeTextFile(tofile, s);
        }
        catch (Throwable t)
        {
            reportFailure("convertDOC", fromfile, t);
            return false;   // something went wrong during the conversion
        }
        finally
        {
            closeQuietly(fis);
        }

        return true;
    }

    /**
     * Converts an HTML file to text by delegating to
     * <code>HtmlDocument.convertHtml</code> and prints the value it returns.
     *
     * @param fromfile path of the HTML file to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> if the delegate completed without throwing,
     *         <code>false</code> otherwise
     */
    public static boolean convertHTML(String fromfile, String tofile)
    {
        try
        {
            String htmlCharset = HtmlDocument.convertHtml(fromfile, tofile);
            System.out.println("htmlCharset: " + htmlCharset);
        }
        catch (Throwable t)
        {
            reportFailure("convertHTML", fromfile, t);
            return false;   // something went wrong during the conversion
        }

        return true;
    }

    /**
     * Placeholder for PowerPoint conversion, which is not implemented.
     *
     * @param fromfile ignored
     * @param tofile   ignored
     * @return always <code>false</code>
     */
    public static boolean convertPPT(String fromfile, String tofile)
    {
        System.err.println("convertPPT not supported yet!");
        Thread.dumpStack();
        return false;
    }

    /**
     * Extracts the string cells of every sheet of an Excel workbook and writes
     * them to <code>tofile</code>.
     *
     * @param fromfile path of the .xls file to read
     * @param tofile   path of the text file to create
     * @return <code>true</code> on success, <code>false</code> on any failure
     */
    public static boolean convertXLS(String fromfile, String tofile)
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(new File(fromfile));
            HSSFWorkbook wb = new HSSFWorkbook(fis);

            writeTextFile(tofile, extractStringCells(wb));
        }
        catch (Throwable t)
        {
            reportFailure("convertXLS", fromfile, t);
            return false;   // something went wrong during the conversion
        }
        finally
        {
            closeQuietly(fis);
        }

        return true;
    }

    /**
     * Concatenates the string cells of a workbook: cells are separated by a
     * single space and each sheet is terminated by a newline.
     *
     * <p>Only string cells are kept; numbers, formulas, booleans and error
     * cells are ignored because they carry little meaning outside their
     * tabular context.</p>
     */
    private static String extractStringCells(HSSFWorkbook wb)
    {
        StringBuffer sb = new StringBuffer();
        int numSheets = wb.getNumberOfSheets();
        for (int i = 0; i < numSheets; ++i)
        {
            HSSFSheet sheet = wb.getSheetAt(i);
            // getLastRowNum() is the 0-based index of the last row, so the
            // bound must be inclusive or the final row is silently dropped.
            int lastRowIndex = sheet.getLastRowNum();
            for (int j = 0; j <= lastRowIndex; ++j)
            {
                HSSFRow row = sheet.getRow(j);
                if (row == null)
                    continue;

                // getLastCellNum() is already "last index plus one".
                int numCells = row.getLastCellNum();
                for (int k = 0; k < numCells; ++k)
                {
                    HSSFCell cell = row.getCell((short) k);
                    if (cell == null || cell.getCellType() != HSSFCell.CELL_TYPE_STRING)
                        continue;

                    String str = cell.getStringCellValue();
                    str = str.trim();
                    str = replace(str, "\n", "");
                    sb.append(str).append(" ");
                }
            }
            sb.append("\n"); // break on each sheet
        }
        return sb.toString();
    }

    /**
     * Writes <code>text</code> to <code>tofile</code> using
     * {@link #OUTPUT_ENCODING}, replacing any existing file.
     *
     * @throws IOException if the file cannot be created or written
     * @throws NullPointerException if <code>text</code> is <code>null</code>
     */
    private static void writeTextFile(String tofile, String text) throws IOException
    {
        FileOutputStream fos = new FileOutputStream(new File(tofile));
        Writer out = null;
        try
        {
            out = new BufferedWriter(new OutputStreamWriter(fos, OUTPUT_ENCODING));
            out.write(text);
            // Flush explicitly so that a write failure is reported to the
            // caller instead of being hidden by the quiet close below.
            out.flush();
        }
        finally
        {
            if (out != null)
                closeQuietly(out);   // also closes the underlying stream
            else
                closeQuietly(fos);
        }
    }

    /**
     * Shared failure reporting for the <code>convertXxx</code> methods: logs
     * out-of-memory conditions as fatal and prints every failure to stderr.
     */
    private static void reportFailure(String method, String fromfile, Throwable t)
    {
        if (t instanceof OutOfMemoryError)
            Logger.getInstance().fatal("OutOfMemoryError occurred in " + method + " for file: " + fromfile, t);
        System.err.println("Exception occurred in " + method + ", t: " + t);
        t.printStackTrace();
    }

    /** Closes <code>in</code> if it is not null; a close failure is logged, not thrown. */
    private static void closeQuietly(InputStream in)
    {
        if (in == null)
            return;
        try {
            in.close();
        } catch (IOException ex) {
            Logger.getInstance().error(ex.toString());
        }
    }

    /** Closes <code>out</code> if it is not null; a close failure is logged, not thrown. */
    private static void closeQuietly(OutputStream out)
    {
        if (out == null)
            return;
        try {
            out.close();
        } catch (IOException ex) {
            Logger.getInstance().error(ex.toString());
        }
    }

    /** Closes <code>out</code> if it is not null; a close failure is logged, not thrown. */
    private static void closeQuietly(Writer out)
    {
        if (out == null)
            return;
        try {
            out.close();
        } catch (IOException ex) {
            Logger.getInstance().error(ex.toString());
        }
    }

    /**
     * Replaces every occurrence of <code>oldString</code> in <code>line</code>
     * with <code>newString</code>. Hand-written because
     * <code>String.replace(CharSequence, CharSequence)</code> is not available
     * on older JDKs; it could be moved into a shared utility class.
     *
     * @param line      text to search; <code>null</code> yields <code>null</code>
     * @param oldString text to look for; when <code>null</code> or empty,
     *                  <code>line</code> is returned unchanged
     * @param newString replacement text
     * @return the text with all occurrences replaced, or <code>line</code>
     *         itself when nothing matched
     */
    private final static String replace(String line, String oldString, String newString)
    {
        if (line == null) {
            return null;
        }
        if (oldString == null || oldString.length() == 0) {
            return line;
        }

        int match = line.indexOf(oldString);
        if (match < 0) {
            return line;
        }

        int oLength = oldString.length();
        StringBuffer buf = new StringBuffer(line.length());
        int copiedUpTo = 0;
        while (match >= 0) {
            buf.append(line.substring(copiedUpTo, match)).append(newString);
            copiedUpTo = match + oLength;
            match = line.indexOf(oldString, copiedUpTo);
        }
        buf.append(line.substring(copiedUpTo));
        return buf.toString();
    }

    /**
     * Command-line entry point.
     *
     * @param args <code>args[0]</code> is the source format
     *             (<code>pdf</code>, <code>doc</code>, <code>xls</code>,
     *             <code>ppt</code> or <code>html</code>), <code>args[1]</code>
     *             the file to convert and <code>args[2]</code> the text file
     *             to create
     */
    public static void main(String[] args)
    {
        if (args == null || args.length < 3) {
            System.err.println("Usage: DocumentConverter <pdf|doc|xls|ppt|html> <fromfile> <tofile>");
            return;
        }

        String action = args[0];
        String f1 = args[1];
        String f2 = args[2];

        long start = System.currentTimeMillis();
        if ("pdf".equals(action))
            convertPDF(f1, f2);
        else if ("doc".equals(action))
            convertDOC(f1, f2);
        else if ("xls".equals(action))
            convertXLS(f1, f2);
        else if ("ppt".equals(action))
            convertPPT(f1, f2);
        else if ("html".equals(action))   // was a second, unreachable "ppt" test
            convertHTML(f1, f2);
        else {
            System.err.println("Unknown action: " + action);
            return;
        }

        long end = System.currentTimeMillis();
        System.out.println(action + " convert " + f1 + " took " + ((end-start)/1000) + " seconds.");
    }

}

main方法需要输入三个参数：第一个是待转换文档的格式，第二个是文档存放的路径，第三个是输出文档存放的位置。

然后对输出的文档进行索引, 平均每个文档在1M-5M之间，

问题：在进行文档转换的时候，pdf、word、xls 都非常慢。本来想写一个 threadpool 来进行文档的转换，可是测试数据表明多线程转换还不如单线程快，而且也容易出现 OutOfMemory。后来我又想了一个办法，把大的 pdf、word、xls 进行切分，可是写了一个 java 的切分成小文档的方法，只能对 txt 文档进行转换；word 和 pdf 因为里面有很多格式和样式的东西都是二进制的，再合成一个大的文档就合并不回去了（c++ 或 .net 倒是有办法切分）。所以希望有过索引大量 pdf、word、xls 文档经验的朋友给些帮助，能快速处理。目前的数据量大概是1T(大概是100G)，服务器配置大概是4个cpu、4G内存，虚拟机开到了1.2个G，用的是 jdk1.4，再大也开不了了，谢谢帮助。

分享到：

java String to byte[] | java月份时间（第一天，最后一天）

2008-07-13 13:39
浏览 1103
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java pdf

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

java pdf

评论

发表评论

相关推荐

java调用google map api 根据经纬度读取经纬度地址

java 读取http url joson 格式

【转】用Lucene的SpellChecker实现Google的“您是不是要找”功能

Java 接口和抽象类区别

php 环境搭建

abstract class vc interface

二分查找vc线性查找

java 取字符串中汉字之前的部门

java 数组 操作

查询代码网站

java String to byte[]

java月份时间（第一天，最后一天）

java中文件操作大全

较好的Java网站收集

java 自定义排序

2008年Java开发者最迫切的五个期望

Java精华积累:每个初学者都应该搞懂的问题！

如何让EditPlus可以编译执行Java程序

阴阳转换的类！ 算法支持1900-2050

整理了sun网站java环境下获取网卡信息的资料

最近访客更多访客>>

java 数组操作

阴阳转换的类！算法支持1900-2050