`
464872333
  • 浏览: 89625 次
  • 性别: Icon_minigender_1
  • 来自: 安徽
社区版块
存档分类
最新评论

lucene进行全文检索的一个简单例子

阅读更多

    最近在研究用lucene检索文档的问题,参考了网上一些人的例子,但是结果只能检索英文。有人说要使用中文分词器,我也用了,结果还是一样,不能检索中文。呵呵……后来经过一些高手的指点,解决了中文检索的问题。我用的lucene版本为3.0.2,中文分词器是IKAnalyzer 3.2。下面是我的一些代码,仅供参考。
第一步:
 建立文件索引:

   /**
    * Builds a Lucene full-text index from the files found under a data directory.
    *
    * <p>Files ending in {@code .doc} are read with POI's {@code WordExtractor}, files
    * ending in {@code .pdf} are converted to text by the external xpdf
    * {@code pdftotext} program, and every other file is read as GBK-encoded text.
    * Content is analyzed with {@code IKAnalyzer} (a Chinese word segmenter).
    */
   public class IndexProcesser {

       /** Directory in which the generated index files are stored. */
       private static final String INDEX_STORE_PATH = "G:\\学习\\Lucene相关\\IndexWriter";

       /** Root directory of the documents to be indexed. */
       private static final String DATA_DIR = "G:\\学习\\Lucene相关\\IndexWriter\\searchFolder";

       /** Location of the xpdf {@code pdftotext} executable used for PDF files. */
       private static final String PATH_TO_XPDF = "C:\\xpdftest\\xpdf\\pdftotext.exe";

       /**
        * Indexes the files under {@code dataDir} (recursively) and stores the index
        * in {@code indexDir}, replacing any index already there.
        *
        * @param indexDir directory that receives the index files
        * @param dataDir  directory whose files are indexed
        * @return the number of documents in the index after indexing
        * @throws Exception if {@code dataDir} is not an existing directory or the
        *                   index cannot be written
        */
       public static int createIndex(File indexDir, File dataDir) throws Exception {
           if (!dataDir.exists() || !dataDir.isDirectory()) {
               throw new IOException(dataDir + " does not exist or is not a directory");
           }
           // IK analyzer; other Chinese analyzers could be substituted here.
           Analyzer analyzer = new IKAnalyzer();
           IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
                   analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
           try {
               writer.setMergeFactor(1000);               // merge factor
               writer.setMaxBufferedDocs(1000);           // max documents buffered in memory
               writer.setMaxMergeDocs(Integer.MAX_VALUE); // max documents per merged segment
               writer.setMaxFieldLength(99999999);        // raise the field length limit
               indexDirectory(writer, dataDir);
               int numIndexed = writer.numDocs();
               writer.optimize();
               return numIndexed;
           } finally {
               // Always close the writer, even when indexing fails part-way,
               // so it is not left open.
               writer.close();
           }
       }

       /**
        * Recursively adds every file below {@code dataDir} to the index. A file that
        * fails with an {@link IOException} is reported and skipped.
        */
       private static void indexDirectory(IndexWriter writer, File dataDir) {
           File[] files = dataDir.listFiles();
           if (files == null) {
               // listFiles() returns null when the directory cannot be read.
               System.err.println("Cannot list directory, skipping: " + dataDir);
               return;
           }
           for (File f : files) {
               if (f.isDirectory()) {
                   indexDirectory(writer, f);
                   continue;
               }
               try {
                   indexFile(writer, f);
               } catch (IOException e) {
                   e.printStackTrace();
               }
           }
       }

       /**
        * Adds one file to the index as a document with a {@code filePath} field and a
        * {@code content} field. Hidden, unreadable or missing files are skipped.
        *
        * @throws IOException if the document cannot be added to the index
        */
       private static void indexFile(IndexWriter writer, File f) throws IOException {
           if (f.isHidden() || !f.canRead() || !f.exists()) {
               return;
           }
           System.out.println("indexIng>>" + f.getCanonicalPath());
           Document doc = new Document();
           doc.add(new Field("filePath", f.getAbsolutePath(), Field.Store.YES,
                   Field.Index.ANALYZED));
           doc.add(new Field("content", readFile(f), Field.Store.YES,
                   Field.Index.ANALYZED));
           writer.addDocument(doc);
       }

       /**
        * Extracts the text of a file, choosing the extraction method by file name
        * suffix: {@code .doc} via POI, {@code .pdf} via xpdf, anything else as GBK text.
        *
        * @return the extracted text; never {@code null}. If extraction fails the error
        *         is printed and whatever was read so far (possibly nothing) is returned.
        */
       private static String readFile(File f) {
           String name = f.getName();
           if (name.endsWith(".doc")) {
               return readWordFile(f);
           }
           if (name.endsWith(".pdf")) {
               return readPdfFile(f);
           }
           return readTextFile(f);
       }

       /** Reads the text of a Word document with POI's {@code WordExtractor}. */
       private static String readWordFile(File f) {
           FileInputStream is = null;
           try {
               is = new FileInputStream(f);
               String text = new WordExtractor(is).getText();
               return text == null ? "" : text;
           } catch (Exception e) {
               e.printStackTrace();
               return "";
           } finally {
               closeQuietly(is);
           }
       }

       /**
        * Converts a PDF to text by running xpdf's {@code pdftotext} and capturing its
        * standard output, decoded as UTF-8.
        */
       private static String readPdfFile(File f) {
           String[] cmd = new String[] {
               PATH_TO_XPDF, "-enc", "UTF-8", "-q", f.getAbsoluteFile().toString(), "-"
           };
           // Created before the process starts so a failed launch yields "" instead of
           // a NullPointerException in the caller.
           StringBuilder content = new StringBuilder();
           Process p = null;
           InputStreamReader reader = null;
           try {
               p = Runtime.getRuntime().exec(cmd);
               reader = new InputStreamReader(new BufferedInputStream(p.getInputStream()), "UTF-8");
               char[] buf = new char[4096];
               int n;
               while ((n = reader.read(buf)) != -1) {
                   content.append(buf, 0, n);
               }
           } catch (IOException e) {
               e.printStackTrace();
           } finally {
               closeQuietly(reader);
               if (p != null) {
                   // Release the child process and its remaining streams.
                   p.destroy();
               }
           }
           return content.toString();
       }

       /** Reads a file line by line as GBK text, joining the lines with {@code \n}. */
       private static String readTextFile(File f) {
           StringBuilder content = new StringBuilder();
           FileInputStream is = null;
           BufferedReader br = null;
           try {
               is = new FileInputStream(f);
               br = new BufferedReader(new InputStreamReader(is, "GBK"));
               for (String line = br.readLine(); line != null; line = br.readLine()) {
                   content.append(line).append("\n");
               }
           } catch (Exception e) {
               e.printStackTrace();
           } finally {
               closeQuietly(br);
               // Also covers the case where wrapping the stream in a reader failed.
               closeQuietly(is);
           }
           return content.toString();
       }

       /** Closes a resource if it is non-null, reporting (not propagating) any failure. */
       private static void closeQuietly(java.io.Closeable c) {
           if (c == null) {
               return;
           }
           try {
               c.close();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }

       /** Indexes {@code DATA_DIR} into {@code INDEX_STORE_PATH} and prints the elapsed time. */
       public static void main(String[] args) {
           long start = new Date().getTime();
           int numIndexed = 0;
           try {
               numIndexed = createIndex(new File(INDEX_STORE_PATH), new File(DATA_DIR));
           } catch (Exception e) {
               e.printStackTrace();
           }
           long end = new Date().getTime();

           System.out.println("Indexing " + numIndexed + " files took "
                   + (end - start) + "  milliseconds");
       }

   }

 
//xpdf配置说明:
  1.从http://www.foolabs.com/xpdf/download.html上下载xpdf3.02(xpdf-3.02pl2-win32.zip)和xpdf-chinese-simplified.tar.gz。
  2.将xpdf-3.02pl2-win32.zip解压放入c:/xpdf,同时将xpdf-chinese-simplified.tar.gz解压放入到该文件夹内。
  3.打开解压后的xpdf-chinese-simplified文件夹下的add-to-xpdfrc文件,将其内容拷贝到xpdfrc.txt中,

 

 

修改如下代码:
 #----- begin Chinese Simplified support package (2004-jul-27)
cidToUnicode Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode
unicodeMap ISO-2022-CN C:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap
unicodeMap EUC-CN  C:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap
unicodeMap GBK  C:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap
cMapDir  Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/CMap
toUnicodeDir   C:/xpdf/xpdf-chinese-simplified/CMap
fontDir c:/windows/fonts
displayCIDFontTT Adobe-GB1 c:/windows/fonts/simhei(truetype)
textEOL CR+LF
#----- end Chinese Simplified support package

 注意:“C:/xpdf”部分的路径要和你本机xpdf的实际解压路径一致;代码中PATH_TO_XPDF指向的pdftotext.exe路径也要相应修改。

下面就是检索了:
/**
 * Runs a keyword query against the {@code content} field of a Lucene index and
 * prints the {@code filePath} of each matching document.
 */
public class Search {

    /** Directory that holds the index to search. */
    private static final String INDEX_DIR = "G:\\学习\\Lucene相关\\IndexWriter";

    /** Keyword searched for when the class is run from {@link #main}. */
    private static final String KEY_WORD = "努力";

    /** Maximum number of hits retrieved per search. */
    private static final int TOP_NUM = 100;

    /**
     * Searches the index in {@code indexDir} for {@code key} and prints the hit
     * count, the elapsed search time and the stored file path of each hit.
     *
     * @param indexDir directory containing the index
     * @param key      keyword(s) to look for in the {@code content} field
     * @throws Exception if the index cannot be opened or the search fails
     */
    public static void doSearch(File indexDir, String key) throws Exception {
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(indexDir), true);
        try {
            String field = "content";
            // Query for the caller's keyword, not the class-level default.
            Query query = IKQueryParser.parse(field, key);

            long start = new Date().getTime();
            TopDocs hits = searcher.search(query, TOP_NUM);
            long end = new Date().getTime();

            System.out.println("共找到文档数:" + hits.totalHits);
            System.out.println("搜索完毕用时:" + (end - start) + "毫秒");
            if (hits.totalHits == 0) {
                System.out.println("没有找到您需要的结果!");
                return;
            }

            ScoreDoc[] scoreDocs = hits.scoreDocs;
            for (int i = 0; i < scoreDocs.length; i++) {
                try {
                    Document doc = searcher.doc(scoreDocs[i].doc);
                    System.out.print("这是第" + (i + 1) + "个检索结果,文件路径为:");
                    System.out.println(doc.get("filePath"));
                } catch (Exception e) {
                    // Keep listing the remaining hits, but do not hide the failure.
                    System.err.println("Failed to load search result " + (i + 1));
                    e.printStackTrace();
                }
            }
        } finally {
            // Close the searcher even when parsing or searching throws.
            searcher.close();
        }
    }

    /**
     * Searches {@code INDEX_DIR} for {@code KEY_WORD}.
     *
     * @throws Exception if the index directory is missing or the search fails
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        // isDirectory() is false for a non-existent path, so one check suffices.
        if (!indexDir.isDirectory()) {
            throw new Exception(indexDir
                    + " does not exist or is not a directory。");
        }
        doSearch(indexDir, KEY_WORD);
    }
}
 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics