`
thomas0988
  • 浏览: 473129 次
  • 性别: Icon_minigender_1
  • 来自: 南阳
社区版块
存档分类
最新评论

Lucene 3.0.2使用方法(转)写的不错

阅读更多

引用地址:http://clucene.org/lucene/63

================================================================

到官网下载lucene 3.0.2 官网地址:http://lucene.apache.org/

官网下载地址:http://repo1.maven.org/maven2/org/apache/lucene/

下载以下包:

lucene-core-3.0.2.jar

lucene-demos-3.0.2.jar

lucene-analyzers-3.0.2.jar

lucene-fast-vector-highlighter-3.0.2.jar

lucene-highlighter-3.0.2.jar

lucene-memory-3.0.2.jar

中文分词使用google IKAnalyzer 官网地址:http://code.google.com/p/ik-analyzer/

IKAnalyzer3.2.5Stable.jar

创建索引,添加txt内容文件代码:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexerOK {
private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引存放目录
private static String DATA_DIR = “c:\\Lucene\\file1″;// 小文件存放的目录

public static void main(String[] args) throws Exception {

   long start = new Date().getTime();
   int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));// 调用index方法
   long end = new Date().getTime();
   System.out.println(“Indexing ” + numIndexed + ” files took ”
     + (end – start) + ” milliseconds”);
}

/**
* 索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量
*
* @param indexDir
* @param dataDir
* @return int
* @throws IOException
*/
public static int index(File indexDir, File dataDir) throws IOException {

   if (!dataDir.exists() || !dataDir.isDirectory()) {
    throw new IOException(dataDir
      + ” does not exist or is not a directory”);
   }
   Analyzer analyzer = new IKAnalyzer();// 采用的分词器

   //第三个参数 为true表示新建,false表示添加到原有索引中
   IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
     analyzer, false, IndexWriter.MaxFieldLength.LIMITED);

   indexDirectory(writer, dataDir);// 调用indexDirectory方法
   int numIndexed = writer.numDocs();
   writer.optimize();
   writer.close();
   return numIndexed;
}

/**
* 循环遍历目录下的所有.txt文件并进行索引
*
* @param writer
* @param dir
* @throws IOException
*/
private static void indexDirectory(IndexWriter writer, File dir)
    throws IOException {

   File[] files = dir.listFiles();

   for (int i = 0; i < files.length; i++) {
    File f = files[i];
    if (f.isDirectory()) {
     indexDirectory(writer, f); // recurse
    } else if (f.getName().endsWith(“.txt”)) {
     indexFile(writer, f);
    }
   }
}

/**
* 对单个txt文件进行索引
*
* @param writer
* @param f
* @throws IOException
*/
private static void indexFile(IndexWriter writer, File f)
    throws IOException {

   if (f.isHidden() || !f.exists() || !f.canRead()) {
    return;
   }

   System.out.println(“Indexing ” + f.getCanonicalPath());

   Document doc = new Document();
   // doc.add(new Field(“contents”, new FileReader(f)));
   doc.add(new Field(“filename”, f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));

   String temp = FileReaderAll(f.getCanonicalPath(), “GBK”);
   System.out.println(temp);

   doc.add(new Field(“TTT”, temp, Field.Store.YES, Field.Index.ANALYZED));

   doc.add(new Field(“path”, f.getPath(), Field.Store.YES,
     Field.Index.ANALYZED));

   doc.add(new Field(“modified”, DateTools.timeToString(f.lastModified(),
     DateTools.Resolution.MINUTE), Field.Store.YES,
     Field.Index.ANALYZED));

   FileInputStream fis = new FileInputStream(f);
   // 按照 UTF-8 编码方式将字节流转化为字符流
   InputStreamReader isr = new InputStreamReader(fis, “utf-8″);
   // 从字符流中获取文本并进行缓冲
   BufferedReader br = new BufferedReader(isr);

   doc.add(new Field(“contents”, br));

   writer.setUseCompoundFile(false);
   writer.addDocument(doc);
}

public static String FileReaderAll(String FileName, String charset)
    throws IOException {
   BufferedReader reader = new BufferedReader(new InputStreamReader(
     new FileInputStream(FileName), charset));
   String line = new String();
   String temp = new String();

   while ((line = reader.readLine()) != null) {
    temp += line;
   }
   reader.close();
   return temp;
}

}

 

管理lucene代码:

/**
 * Simple value holder for one search hit: the document's id, filesystem
 * path, text contents, timestamp string and file name. Plain mutable
 * JavaBean with a getter/setter pair per property; not thread-safe.
 */
public class SearchDocBean {

 private String id;
 private String path;
 private String contents;
 private String dateTime;
 private String fileName;

 /** Returns the document id. */
 public String getId() {
    return id;
 }

 /** Sets the document id. */
 public void setId(String id) {
    this.id = id;
 }

 /** Returns the filesystem path of the document. */
 public String getPath() {
    return path;
 }

 /** Sets the filesystem path of the document. */
 public void setPath(String path) {
    this.path = path;
 }

 /** Returns the text contents of the document. */
 public String getContents() {
    return contents;
 }

 /** Sets the text contents of the document. */
 public void setContents(String contents) {
    this.contents = contents;
 }

 /** Returns the document's timestamp string. */
 public String getDateTime() {
    return dateTime;
 }

 /** Sets the document's timestamp string. */
 public void setDateTime(String dateTime) {
    this.dateTime = dateTime;
 }

 /** Returns the document's file name. */
 public String getFileName() {
    return fileName;
 }

 /** Sets the document's file name. */
 public void setFileName(String fileName) {
    this.fileName = fileName;
 }

}

 

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class ManageIndexFile {
private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引存放目录

public static void DeleteIndex(SearchDocBean bean) throws IOException {
   Directory dir = FSDirectory.open(new File(INDEX_DIR));
   IndexReader reader = IndexReader.open(dir, false);
   Term term = new Term(“modified”, bean.getId());
   int count = reader.deleteDocuments(term);
   reader.close();
   System.out.println(“Successful Delete ” + count + ” path==” + bean.getId());

}

public static void DeleteIndex(int[] posIDS) throws IOException {
   Directory dir = FSDirectory.open(new File(INDEX_DIR));
   IndexReader reader = IndexReader.open(dir, false);
   for (int i = 0; i < posIDS.length; i++) {
    Term term = new Term(“posID”, Integer.toString(posIDS[i]));
    reader.deleteDocuments(term);
   }
   reader.close();

}

public static void UpdateIndex(SearchDocBean bean) throws IOException {
   Directory dir = FSDirectory.open(new File(INDEX_DIR));
   IndexReader reader = IndexReader.open(dir, false);
   Term term = new Term(“modified”, bean.getId());
   reader.deleteDocuments(term);
   reader.close();

   IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
     INDEX_DIR)), new StandardAnalyzer(Version.LUCENE_CURRENT),
     true, IndexWriter.MaxFieldLength.LIMITED);
   Document doc = new Document();

   doc.add(new Field(“modified”, bean.getId(), Field.Store.YES,
     Field.Index.NOT_ANALYZED));
   writer.addDocument(doc);
   writer.optimize();
   writer.close();

}

public static void AddIndex(SearchDocBean bean,
    Connection conn) throws IOException, SQLException {
   Analyzer analyzer = new IKAnalyzer();// 采用的分词器

   IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
     INDEX_DIR)), analyzer, false,
     IndexWriter.MaxFieldLength.LIMITED);

   Document doc = new Document();
   doc.add(new Field(“filename”, bean.getFileName(), Field.Store.YES,
     Field.Index.ANALYZED));

   doc.add(new Field(“path”, bean.getPath(), Field.Store.YES,
     Field.Index.ANALYZED));

   doc.add(new Field(“dateTime”, bean.getId(), Field.Store.YES,
     Field.Index.ANALYZED));

   doc.add(new Field(“TTT”, bean.getContents(), Field.Store.YES, Field.Index.ANALYZED));

   writer.setUseCompoundFile(false);
   writer.addDocument(doc);
   writer.optimize();
   writer.close();
}

}

 

分词查询加高亮显示:

import java.io.File;
import java.io.StringReader;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

public class SearchQueryOK {
private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引所在的路径
private static String KEYWORD = “人民”;// 关键词
private static int TOP_NUM = 100;// 显示前100条结果

public static void main(String[] args) throws Exception {
   File indexDir = new File(INDEX_DIR);
   if (!indexDir.exists() || !indexDir.isDirectory()) {
    throw new Exception(indexDir
      + ” does not exist or is not a directory.”);
   }
   search(indexDir, KEYWORD);// 调用search方法进行查询
}

/**
* 查询
*
* @param indexDir
* @param q
* @throws Exception
*/
public static void search(File indexDir, String q) throws Exception {
   IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true);// read-only
   String[] field = {“TTT”,”modified”,”filename”};
   long start = new Date().getTime();// start time

   // 高亮设置
   Analyzer analyzer = new IKAnalyzer();// 设定分词器
   Query query2 = IKQueryParser.parseMultiField(field, KEYWORD);
   // 实例化搜索器
   IndexSearcher isearcher1 = new IndexSearcher(FSDirectory.open(indexDir));
   // 在索引器中使用IKSimilarity相似度评估器
   isearcher1.setSimilarity(new IKSimilarity());
  
   Sort sort = new Sort(new SortField(“path”, SortField.DOC,false));
   //TermQuery q1 = new TermQuery(new Term(“filename”, “1″));
   // 搜索相似度最高的记录
   TopDocs topDocs1 = isearcher1.search(query2,null, TOP_NUM,sort);
  
   ScoreDoc[] hits3 = topDocs1.scoreDocs;
   SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
     “<span style=’color:#ff0000′>”, “</span>”);// 设定高亮显示的格式,也就是对高亮显示的词组加上前缀后缀
   Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
     new QueryScorer(query2));
   for (int i = 0; i < hits3.length; i++) {
    Document doc = is.doc(hits3[i].doc);
    String docTTT = doc.get(“TTT”);
    highlighter.setTextFragmenter(new SimpleFragmenter(docTTT.length()));// 设置每次返回的字符数.想必大家在使用搜索引擎的时候也没有一并把全部数据展示出来吧,当然这里也是设定只展示部分数据
    TokenStream tokenStream = analyzer.tokenStream(“”,
      new StringReader(docTTT));
    String str = highlighter.getBestFragment(tokenStream, docTTT);
    System.out.println(” 高亮设置: ” + str );
   
    String docModified = doc.get(“filename”);
    highlighter.setTextFragmenter(new SimpleFragmenter(docModified.length()));
   
    TokenStream tokenStream2 = analyzer.tokenStream(“”,
      new StringReader(docModified));
    String str2 = highlighter.getBestFragment(tokenStream2, docModified);
    System.out.println(” 高亮设置: ” + str2 );
   
    List<Fieldable> list = doc.getFields();
    for (int j = 0; j < list.size(); j++) {
     Fieldable fieldable = list.get(j);
     System.out.println(fieldable.name() + ” : ”
       + fieldable.stringValue() + “<br>”);
    }
   }

   long end = new Date().getTime();// end time

   System.out.println(“Found ” + hits3.length
     + ” document(s) (in ” + (end – start)
     + ” milliseconds) that matched query ‘” + q + “‘:”);
}

}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics