Lucene 3.X Custom Sorting

This is an excerpt; for the original see http://www.oschina.net/code/snippet_54100_6338

 

/* Add a few shops as index data */
addPoint(writer, "starbuck", "cafe", 2, 0);
addPoint(writer, "El Charro", "restaurant", 1, 2);
addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9);
addPoint(writer, "Los Betos", "restaurant", 9, 6);
addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8);
addPoint(writer, "central perk", "cafe", 3, 8);
writer.close();

searcher = new IndexSearcher(directory);

/* Query for shops whose type is "restaurant" */
query = new TermQuery(new Term("type", "restaurant"));
}

/* name is the shop name, type its category; x and y are the shop's coordinates */
private static void addPoint(IndexWriter writer, String name,
        String type, int x, int y) throws IOException {
    Document doc = new Document();
    doc.add(new Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("location", (x + "," + y), Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
}

public void testNearestRestaurantToHome() throws Exception {
    /* Use the custom sort, passing in the coordinates (0, 0) */
    DistanceComparator distanceComparator = new DistanceComparator(0, 0);
    Sort sort = new Sort(new SortField("location", distanceComparator));
    /* Fetch the top 3 results */

/* Extend FieldComparatorSource */
public class DistanceComparator extends FieldComparatorSource {
    private static final long serialVersionUID = 1L;
    private int x;
    private int y;

    /* The x, y coordinates are passed to the constructor */
    public DistanceComparator(int x, int y) { // #2
        this.x = x;
        this.y = y;
    }

    /* Implement newComparator to produce a new FieldComparator */
    public FieldComparator newComparator(String fieldname, int numHits,
            int sortPos, boolean reversed) throws IOException {
        return new DistanceScoreDocLookupComparator(fieldname, numHits);
    }

    /* Extend FieldComparator */
    private class DistanceScoreDocLookupComparator // #4
            extends FieldComparator {
        /* Distances of the top N documents */
        private float[] values; // #6
        /* The largest distance among the top N documents */
        private float bottom; // #7
        private String fieldName;
        /* Holds the values of the "location" field */
        private String[] currentReaderValues;

        public DistanceScoreDocLookupComparator(String fieldName, int numHits) throws IOException {
            values = new float[numHits];
            this.fieldName = fieldName;
        }

        public void setNextReader(IndexReader reader, int docBase) throws IOException {
            this.currentReaderValues = FieldCache.DEFAULT.getStrings(reader, this.fieldName);
        }

        /** Compare two distances */
        public int compare(int slot1, int slot2) { // #11
            if (values[slot1] < values[slot2])
                return -1; // #11
            if (values[slot1] > values[slot2])
                return 1; // #11
            return 0; // #11
        }

        /** Record the largest distance among the top N */
        public void setBottom(int slot) { // #12
            bottom = values[slot];
        }

        /** Get the value held in slot "slot" of the top N */
        public Comparable value(int slot) { // #15
            return new Float(values[slot]); // #15
        } // #15

        /* Compare a new doc against the largest distance */
        public int compareBottom(int doc) throws IOException {
            float distance = distance(currentReaderValues[doc]);

            if (bottom < distance)
                return -1; // #13
            if (bottom > distance)
                return 1; // #13
            return 0; // #13
        }

        /* Insert a new doc's value into the top N */
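The excerpt stops here. A minimal sketch, an assumption rather than the original author's code, of the two pieces the excerpt cuts off: copy(), which stores a new document's distance into its top-N slot, and the distance() helper, taken here as a plain Euclidean distance from the "x,y" string stored in the location field:

        /* Copy the new doc's value into slot "slot" of the top N (assumed implementation) */
        public void copy(int slot, int doc) throws IOException {
            values[slot] = distance(currentReaderValues[doc]);
        }

        /* Parse "x,y" from the location field and compute the distance to the query point (assumed) */
        private float distance(String location) {
            String[] xy = location.split(",");
            int docX = Integer.parseInt(xy[0]);
            int docY = Integer.parseInt(xy[1]);
            return (float) Math.sqrt((x - docX) * (x - docX) + (y - docY) * (y - docY));
        }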

 

Posted in lucene

Lucene 3.5 multi-threaded search explained

Reposted from a note on multi-threaded multi-index search in Lucene 3.4; 3.5 changes little relative to 3.4.

hi.baidu.com/zuimao2004/blog/item/886066727245ab018601b05a.html

Multi-threaded multi-index search:

Since 3.1.0 Lucene no longer recommends ParallelMultiSearcher. Its underlying implementation was reworked and a thread-pool mechanism was introduced: if you do not supply a pool, a default one is used. It can still be used in 3.4, but if the Deprecated strikethrough bothers you, there are two alternatives:

1. ParallelReader. Note that with ParallelReader the combined indexes must contain documents that match in number and content, including the order and manner in which they were created and modified: if you add a document to one index you must add the same document, in the same way, to the others. In this respect it differs from ParallelMultiSearcher; ParallelReader merely speeds up reading the index and does not care about anything beyond that, so strictly speaking it is not the multi-threaded multi-index search needed in practice, because real projects usually have index directories that differ both in how they are built and in what they store.

2. Starting with 3.1.0, ParallelMultiSearcher is deprecated in favor of the IndexSearcher constructor that takes an ExecutorService (see the official documentation). The ExecutorService lets you supply your own thread pool.

public IndexSearcher(IndexReader r, ExecutorService executor): each index segment is searched by its own thread from the executor pool. Note that closing the IndexSearcher does not close the executor pool; you must shut it down yourself. This also means several searchers can share one pool.
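For completeness, a minimal sketch (not from the original post) of the ExecutorService-based constructor described above, reusing the two indexes built by the code that follows; the pool size and the query term are arbitrary assumptions:

@Test
public void testExecutorServiceSearch() throws Exception {
    // A thread pool the searcher will draw from; 4 threads is an arbitrary choice
    ExecutorService executor = Executors.newFixedThreadPool(4);
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    MultiReader multiReader = new MultiReader(new IndexReader[]{reader1, reader2}, true);
    // Each index segment is searched on its own thread taken from the pool
    IndexSearcher searcher = new IndexSearcher(multiReader, executor);
    TopDocs topDocs = searcher.search(new TermQuery(new Term("bookname", "钢")), 10);
    System.out.println("total hits: " + topDocs.totalHits);
    searcher.close();      // closing the searcher does NOT shut down the pool
    multiReader.close();
    executor.shutdown();   // the pool must be shut down explicitly
}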

Code:

String INDEX_STORE_PATH1 = "./index1";
String INDEX_STORE_PATH2 = "./index2";
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);

/**
 * Create multiple index directories,
 * storing the indexes in different directories.
 */
@Test
public void createMultiIndexs() throws Exception {
    IndexWriterConfig iwc1 = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    iwc1.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriterConfig iwc2 = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    iwc2.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // Create the first index
    IndexWriter writer1 = new IndexWriter(FSDirectory.open(new File(INDEX_STORE_PATH1)), iwc1);
    Document doc1 = new Document();
    Document doc2 = new Document();
    Document doc3 = new Document();

    Field bookname1 = new Field("bookname", "钢铁是怎样炼成的", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice1 = new Field("price", "20.5", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname2 = new Field("bookname", "钢铁战士", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice2 = new Field("price", "18.4", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname3 = new Field("bookname", "钢和铁是两种金属", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice3 = new Field("price", "20.5", Field.Store.YES, Field.Index.NOT_ANALYZED);

    doc1.add(bookname1);
    doc1.add(bookprice1);

    doc2.add(bookname2);
    doc2.add(bookprice2);

    doc3.add(bookname3);
    doc3.add(bookprice3);

    writer1.addDocument(doc1);
    writer1.addDocument(doc2);
    writer1.addDocument(doc3);
    writer1.close();

    // Create the second index
    IndexWriter writer2 = new IndexWriter(FSDirectory.open(new File(INDEX_STORE_PATH2)), iwc2);
    Document doc4 = new Document();
    Document doc5 = new Document();
    Document doc6 = new Document();

    Field bookname4 = new Field("bookname", "钢要比铁有更多的碳元素", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice4 = new Field("price", "22", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname5 = new Field("bookname", "钢和铁是两种重要的金属", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice5 = new Field("price", "15.9", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname6 = new Field("bookname", "钢铁是很重要的金属材料", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice6 = new Field("price", "19.0", Field.Store.YES, Field.Index.NOT_ANALYZED);

    doc4.add(bookname4);
    doc4.add(bookprice4);

    doc5.add(bookname5);
    doc5.add(bookprice5);

    doc6.add(bookname6);
    doc6.add(bookprice6);

    writer2.addDocument(doc4);
    writer2.addDocument(doc5);
    writer2.addDocument(doc6);
    writer2.close();
}

/**
 * Multi-index search:
 * MultiSearcher is deprecated in Lucene 3.4,
 * so MultiReader is used instead.
 * @throws Exception
 */
@Test
public void testMultiReader() throws Exception {
    // Open the two IndexReaders
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    // Build the MultiReader
    MultiReader multiReader = new MultiReader(new IndexReader[]{reader1, reader2}, true);

    Term t1 = new Term("bookname", "和");
    TermDocs docs = multiReader.termDocs(t1);
    System.out.println("The search matched " + multiReader.docFreq(t1) + " documents");
    while (docs.next()) {
        System.out.println(multiReader.document(docs.doc()).toString());
        System.out.println("--------------------------------");
    }
}

/**
 * Multi-index search via ParallelReader:
 * @throws Exception
 */
@Test
public void testParallelReader() throws Exception {
    // Open the two IndexReaders
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    // Build the ParallelReader
    ParallelReader parallelReader = new ParallelReader();
    parallelReader.add(reader1);
    parallelReader.add(reader2);
    Term t1 = new Term("bookname", "和");
    TermDocs docs = parallelReader.termDocs(t1);
    System.out.println("The search matched " + parallelReader.docFreq(t1) + " documents");
    while (docs.next()) {
        System.out.println(parallelReader.document(docs.doc()).toString());
        System.out.println("--------------------------------");
    }
}
Posted in lucene

Lucene 3 IndexWriter parameters explained

IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.UNLIMITED);

1. MaxBufferedDocs

MaxBufferedDocs is disabled by default, because Lucene uses another parameter (RAMBufferSizeMB) to control the document buffer.
The two parameters can be used together: whichever threshold is hit first triggers a flush to disk, producing a new index segment.

2. RAMBufferSizeMB

Controls the memory ceiling used for buffering indexed documents; once the buffer reaches the ceiling it is flushed to disk. Generally, the larger the buffer, the faster indexing goes. This parameter is particularly useful when document sizes are hard to predict, since it avoids an OutOfMemoryError.

3. MergeFactor

setMergeFactor controls how often segments are merged: it determines how many documents go into one index block and how many blocks may accumulate on disk before they are merged into a larger one. A larger MergeFactor makes indexing faster. The default is 10; it is recommended to raise it before building a large index.
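A minimal sketch (the values are arbitrary assumptions) of tuning the three parameters above on the 3.0-style writer shown in this post; in 3.1+ the same knobs live on IndexWriterConfig and LogMergePolicy:

IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
        true, IndexWriter.MaxFieldLength.UNLIMITED);
writer.setMaxBufferedDocs(10000);  // flush once this many documents are buffered...
writer.setRAMBufferSizeMB(64.0);   // ...or once the buffer reaches 64 MB, whichever comes first
writer.setMergeFactor(50);         // merge once 50 segments accumulate at one level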

Posted in lucene

Lucene 3.5 getting-started example

1. Building the index

Core classes

IndexWriterConfig: the configuration object for building an index; it holds the Analyzer used to tokenize the input.

IndexWriter: the class that writes the index.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class FileIndexer {
    private IndexWriter indexWriter;
    // Directory where the index files are stored
    private File fileIndex = new File(FileSearchConstant.FILE_INDEX);
    // Directory holding the files to be indexed
    private File fileDir = new File(FileSearchConstant.FILE_DIR);
    // private static Logger logger = LoggerFactory.getLogger(FileIndexer.class);

    public static void main(String[] args) throws IOException {
        FileIndexer fileIndexer = new FileIndexer();
        fileIndexer.buildIndex();
    }

    public void buildIndex() throws IOException {
        boolean isCreate = true;
        // Configuration for building the index; it wraps an analyzer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35,
                                                                    new StandardAnalyzer(Version.LUCENE_35));
        // Choose whether we create a new index or append/update an existing one
        setModel(isCreate, indexWriterConfig);
        // The index writer: first argument is where the index lives, second is the config
        indexWriter = new IndexWriter(FSDirectory.open(fileIndex), indexWriterConfig);
        long startTime = System.currentTimeMillis();
        // Build the index
        indexDocs(fileDir, indexWriter);
        // Very useful when adding to an existing index: merges the scattered segment files back together
        indexWriter.forceMerge(1);
        // indexWriter.commit();
        // Close the writer
        indexWriter.close();
        long endTime = System.currentTimeMillis();
        System.out.println("cost: " + (endTime - startTime) + " ms");
    }

    private void setModel(boolean isCreate, IndexWriterConfig indexWriterConfig) {
        if (isCreate) {
            indexWriterConfig.setOpenMode(OpenMode.CREATE);
        } else {
            indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
    }

    private void indexDocs(File fileDir, IndexWriter indexWriter) throws CorruptIndexException, IOException {
        if (fileDir.canRead()) {
            if (fileDir.isDirectory()) {
                String[] listFiles = fileDir.list();
                for (String file : listFiles) {
                    indexDocs(new File(fileDir, file), indexWriter);
                }
            } else {
                // A Document represents one indexed item
                Document doc = new Document();
                // A Field is one indexed attribute; here we index the file path
                Field pathField = new Field(FileSearchConstant.PATH, fileDir.getPath(), Field.Store.YES,
                                            Field.Index.NOT_ANALYZED_NO_NORMS);
                /* Two notions matter when building an index, one of them being
                 * Document Frequency (df): how many documents contain a given term; the larger df is,
                 * the less important the term. DOCS_ONLY means only the documents a term occurs in
                 * are recorded, without frequencies or positions.
                 */
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);
                // NumericField modifyField = new NumericField(IndexerConstant.MODIFIED);
                // modifyField.setLongValue(fileDir.lastModified());
                // doc.add(modifyField);
                // Index the file contents
                Field contentField = new Field(FileSearchConstant.CONTENTS, getContents(fileDir), Field.Store.YES,
                                               Field.Index.ANALYZED);
                // FileInputStream fileInputStream = getFileInputStream(fileDir);
                // doc.add(new Field(IndexerConstant.CONTENTS, new BufferedReader(new InputStreamReader(fileInputStream,
                //     "UTF-8"))));
                // fileInputStream.close();
                contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(contentField);
                // Add the document, or update it if it may already be indexed
                if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
                    indexWriter.addDocument(doc);
                } else if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE_OR_APPEND) {
                    indexWriter.updateDocument(new Term(FileSearchConstant.PATH, fileDir.getPath()), doc);
                }
            }
        }
    }

    public String getContentByUtils(File fileDir) {
        String content = null;
        try {
            content = IOUtils.toString(new FileInputStream(fileDir), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    /*
     * Not recommended: readLine() drops the line separators, so the concatenated text
     * cannot be tokenized correctly when the index is built. Prefer getContentByUtils above.
     */
    public String getContents(File fileDir) {
        StringBuffer result = new StringBuffer();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileDir), "UTF-8"));
            String temp = null;
            while ((temp = reader.readLine()) != null) {
                result.append(temp);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (null != reader) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result.toString();
    }
}

 

 

 

2. Searching the index

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class FileSearcher {
    private String content = "haha";

    public void search() throws IOException, ParseException {
        // FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)) picks the directory
        // implementation best suited to your OS: simple file, memory-mapped, or NIO
        IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)));
        IndexSearcher searcher = new IndexSearcher(indexReader);
        // Build the query for the content we are looking for
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
        QueryParser queryParser = new QueryParser(Version.LUCENE_35, FileSearchConstant.CONTENTS, analyzer);
        // Parse the query text
        Query query = queryParser.parse(content);
        System.out.println("Searching for: " + query.toString());
        // TopDocs holds the results; 10 is the maximum number of hits to fetch. In real
        // applications you usually paginate, so take care with this value.
        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] docs = topDocs.scoreDocs;
        if (null != docs) {
            for (int i = 0; i < docs.length; i++) {
                ScoreDoc scoreDoc = docs[i];
                // The key step: load the stored document
                Document doc = searcher.doc(scoreDoc.doc);
                // Pull out the stored fields we want
                String contents = doc.get(FileSearchConstant.CONTENTS);
                String path = doc.get(FileSearchConstant.PATH);
                System.out.println(contents);
                System.out.println(path);
            }
        }
    }

    public static void main(String[] args) throws IOException, ParseException {
        FileSearcher fileSearcher = new FileSearcher();
        fileSearcher.search();
    }
}

Posted in lucene

Lucene + IKAnalyzer + Highlighter example

Using the IKAnalyzer Chinese tokenizer and the Highlighter.

package demo.test;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestIKAnalyzer {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader("永和服装饰品有限公司"));
        // The 2.x style; no longer supported since 3.0
        /*Token token = new Token();
        while (tokenStream.next(token) != null) {
            System.out.println(token.term());
        }*/
        // The 3.x style
        TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) tokenStream.getAttribute(TypeAttribute.class);

        while (tokenStream.incrementToken()) {
            System.out.print(termAtt.term());
            System.out.print(' ');
            System.out.println(typeAtt.type());
        }
    }
}


Tokenization result: 永和 和服 服装 装饰品 装饰 饰品 有限公司 有限 公司

2. Build the index with IKAnalyzer

package demo.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class CreatIndex {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        String path = "index";                  // index directory
        Analyzer analyzer = new IKAnalyzer();   // the analyzer to use
        IndexWriter iwriter = new IndexWriter(path, analyzer, true);
        File dir = new File("data");            // directory with the data files to index
        File[] files = dir.listFiles();
        for (int i = 0; i < files.length; i++) {
            Document doc = new Document();
            File file = files[i];
            FileInputStream fis = new FileInputStream(file);
            String content = "";
            BufferedReader reader = new BufferedReader(new InputStreamReader(fis));

            StringBuffer buffer = new StringBuffer("");
            content = reader.readLine();
            while (content != null) {
                buffer.append(content);
                content = reader.readLine();
            }
            doc.add(new Field("title", file.getName(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("content", buffer.toString(), Field.Store.YES, Field.Index.ANALYZED));
            iwriter.addDocument(doc);
        }
        iwriter.close();
    }
}


3. Query the index and highlight the results

package demo.test;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestHighlighter {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException, InvalidTokenOffsetsException {
        String path = "index";  // index directory
        Directory dir = FSDirectory.getDirectory(new File(path));
        IndexSearcher search = new IndexSearcher(dir);
        Term term = new Term("content", "纯粹");
        Query query = new TermQuery(term);
        TopDocs topDocs = search.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        // Plain results, without highlighting
        for (int i = 0; i < hits.length; i++) {
            Document doc = search.doc(hits[i].doc);
            System.out.print(doc.get("title") + ":");
            System.out.println(doc.get("content"));
        }
        // Highlighting setup
        Analyzer analyzer = new IKAnalyzer();  // the analyzer
        // The highlight format, i.e. the prefix/suffix wrapped around matched terms
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<B>", "</B>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        // Number of characters returned per fragment; like a search engine, only part of the
        // content is shown rather than the whole document
        highlighter.setTextFragmenter(new SimpleFragmenter(150));
        for (int i = 0; i < hits.length; i++) {
            Document doc = search.doc(hits[i].doc);
            TokenStream tokenStream = analyzer.tokenStream("", new StringReader(doc.get("content")));
            String str = highlighter.getBestFragment(tokenStream, doc.get("content"));
            System.out.println(str);
        }
    }
}


Posted in lucene

Lucene 3.3 multi-directory indexes

A recent project needed Lucene; when it started, Lucene 3.4 had not been released yet, so the then-latest 3.3 was chosen. The business background: the files to search are TXT files that grow incrementally every day and are kept forever. Each file stores structured records of the form: Id|name|address|date

Searches are by name, address, and date, and the date must support ranges spanning time periods.

The business produces one file per day, so all records within a file share the same date (guaranteed by the data provider); a day's file can reach roughly 200 MB.

Since the files grow incrementally, searches can span up to three months, and the response time must stay within 2 s, putting everything in a single index would let it grow without bound, and both searching and optimizing that index would become painful.

Given this, the following scheme was adopted:

1. Partition the indexes by file date; this requires the data provider to include the date in each day's file name.

2. Generate a directory structure such as 2011/10/2011-10-11 from the date, and store each day's index in the corresponding date folder.

The benefits are:

1) The date no longer has to be handled inside the index source; the date field can simply be dropped, which shrinks the index.

2) No range query is needed at search time. To search 2011-10-01 through 2011-10-31, generate the 31 directory paths (2011/10/2011-10-01, 2011/10/2011-10-02, and so on) and search those directories directly with a multi-directory search (a date-to-directory sketch follows this list).

3) Any day's index can be rebuilt at any time; because the indexes are per-directory, rebuilding is fast and no dedicated index optimization is needed.
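A minimal sketch (not the author's SEDateUtil/SEFileUtil helpers; the base path is an assumption) of expanding a date range into the per-day index directories described above:

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

public class IndexDirRange {
    /** Expand [from, to] (both "yyyy-MM-dd") into one index directory per day. */
    public static List<String> dirsBetween(String from, String to) throws Exception {
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        SimpleDateFormat dirFormat = new SimpleDateFormat("yyyy/MM/yyyy-MM-dd");
        List<String> dirs = new ArrayList<String>();
        Calendar day = Calendar.getInstance();
        day.setTime(dayFormat.parse(from));
        Date end = dayFormat.parse(to);
        while (!day.getTime().after(end)) {
            dirs.add("./index/" + dirFormat.format(day.getTime())); // e.g. ./index/2011/10/2011-10-11
            day.add(Calendar.DAY_OF_MONTH, 1);
        }
        return dirs;
    }
}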

/**
 * Use a different analyzer for different fields.
 *
 * @return
 */
public PerFieldAnalyzerWrapper kmsAnalyzer() {
    Analyzer standardAnalyzer = new StandardAnalyzer(LUCENE_VERSION);

    Analyzer kwAnalyzer = new KeywordAnalyzer();
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(kwAnalyzer);
    analyzer.addAnalyzer("content", standardAnalyzer);
    return analyzer;
}

// Build the IndexWriter

private IndexWriter indexWriter33(String indexPath) throws CorruptIndexException,
        LockObtainFailedException, IOException {

    File file = new File(indexPath);
    LogMergePolicy policy = new LogDocMergePolicy();

    // setUseCompoundFile makes Lucene merge the many per-segment files into a single .cfs
    // file when building the index. This reduces the number of index files and has a
    // noticeable effect on later search efficiency.
    // Compound storage (true means the compound index format)
    policy.setUseCompoundFile(true);

    // Merge factor: once this many index blocks accumulate on disk, they are merged into one larger block
    policy.setMergeFactor(5000);
    IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, this.kmsAnalyzer());
    config.setOpenMode(OpenMode.CREATE);
    config.setMergePolicy(policy);

    // Maximum number of buffered documents; size it to the available memory, larger values speed up indexing
    config.setMaxBufferedDocs(200000);

    // The directory where the index is stored
    FSDirectory directory = FSDirectory.open(file);

    // The index writer
    IndexWriter indexWriter = new IndexWriter(directory, config);

    return indexWriter;
}

/**
 * Build a Document from one line of the source file.
 *
 * @param lineRecord one line of the source file
 * @return
 */
private Document buildDocument(String lineRecord, boolean fileStatus) {
    Document doc = new Document();
    String[] columns = lineRecord.split(String.valueOf((char) 5));
    // 3 columns: an on-site keyword record
    if (columns.length == 3) {
        Field cateId = new Field("cateId", columns[2], Store.NO, Index.ANALYZED);
        doc.add(cateId);
    }
    // 5 columns: an off-site keyword record
    else if (columns.length == 5) {
        Field sessionId = new Field("sessionId", columns[2], Store.NO, Index.ANALYZED);
        Field countryId = new Field("countryId", columns[3], Store.NO, Index.ANALYZED);
        Field urlGourpId = new Field("urlGourpId", columns[4], Store.NO, Index.ANALYZED);
        doc.add(sessionId);
        doc.add(countryId);
        doc.add(urlGourpId);
    } else {
        logger.error("The file content [" + lineRecord + "] error.");
        fileStatus = false;
        return new Document();
    }
    Field id = new Field("id", columns[0], Store.YES, Index.ANALYZED);
    Field keyword = new Field("keyword", columns[1], Store.NO, Index.ANALYZED);
    // The content field is the one that gets tokenized
    Field content = new Field("content", columns[1], Store.NO, Index.ANALYZED);
    // Field date = new Field("date", columns[2], Store.YES, Index.ANALYZED);

    doc.add(id);
    doc.add(keyword);
    doc.add(content);
    // doc.add(date);
    return doc;
}

public void createIndex(String srcPath, String desPath) {
    // Flag marking whether the file content is well-formed
    boolean fileStatus = true;
    String path = null;
    // Collect all *.dat files
    List<File> fileList = SEFileUtil.getSrcFiles(SEFileUtil.pathToFile(srcPath),
            FILE_SUFFIX_DAT);

    // Collect all *.lock files
    List<File> lockFileList = SEFileUtil.getSrcFiles(SEFileUtil.pathToFile(srcPath),
            FILE_SUFFIX_LOCK);

    // Build the indexes
    label0: for (File file : fileList) {
        IndexWriter writer = null;
        BufferedReader br = null;
        // Build the index writer
        try {
            String prxFileName = file.getName().substring(0, file.getName().indexOf("_"));

            // Skip files that are still being generated
            if (lockFileList != null && !lockFileList.isEmpty()) {
                for (File lockFile : lockFileList) {
                    String preLockFileName = lockFile.getName().substring(0,
                            file.getName().indexOf("_"));
                    if (preLockFileName.equalsIgnoreCase(prxFileName)) {
                        lockFileList.remove(lockFile);
                        continue label0;
                    }
                }
            }
            // Build the path where this file's index will be stored
            path = SEFileUtil.buildFilePath(desPath, prxFileName, "yyyyMMdd");
            if (logger.isDebugEnabled()) {
                logger.debug("The index file path: " + path);
            }
            writer = this.indexWriter33(SEFileUtil.createDirectory(path));
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
            String record = null;

            // Write the index
            while (StringUtils.isNotBlank(record = br.readLine())) {
                writer.addDocument(this.buildDocument(record, fileStatus));
            }
            writer.optimize();
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
            return;
        } finally {
            this.close(writer, br);
        }
        // Do not delete the source file when its content failed to parse
        if (fileStatus) {
            if (StringUtils.isNotBlank(this.getIndexCopyToIP())) {
                String[] ipArray = this.getIndexCopyToIP().split("\\|");
                for (String ip : ipArray) {
                    int exitValue = this.copyIndex(ip.trim(), path);
                    if (0 != exitValue) {
                        logger.error("^_^ Copy index directory [" + path + "] to [" + ip
                                + "] failed.");
                    }
                }
            }
            // Delete the source file
            boolean flag = SEFileUtil.deleteFile(file);
            if (!flag) {
                logger.error("Delete file failed: " + file.getPath());
            }
        }
    }
}

Searching comes next. My requirements are fairly simple, so the search code is simple too; the one notable point is that the results only need the ID value from each hit.

Not knowing Lucene well at first, I loaded the entire Document for each hit, which made retrieval slow; switching to MapFieldSelector later made it several times faster. Its usage is shown below.

The multi-directory search follows.

/**
 * The main query method: builds the search criteria and the multi-directory readers.
 */

public List<Long> searchIndex(Map<String, String> paramMap) {
    Long startTime = null;
    if (logger.isDebugEnabled()) {
        startTime = System.currentTimeMillis();
        logger.debug("^_^ start search: " + paramMap);
    }
    List<Long> ids = null;
    // Search parameters
    String keyword = paramMap.get("keyword");       // keyword
    String cateId = paramMap.get("cateId");         // category id
    String matchFlag = paramMap.get("matchFlag");   // match mode: 0 = exact, 1 = fuzzy
    String cateType = paramMap.get("cateType");     // 02 = posting category, 03 = display category
    String siteWord = paramMap.get("siteWord");     // 0 = on-site keyword, 1 = off-site keyword
    String sessionId = paramMap.get("sessionId");   // source
    String countryId = paramMap.get("countryId");   // country
    String urlGourpId = paramMap.get("urlGourpId"); // url group
    String fromDate = paramMap.get("startDate");    // start date
    String toDate = paramMap.get("endDate");        // end date

    // Resolve the base search directory
    String searchPath = this.getSearchPath(siteWord, cateType);

    // Compute every date in the range
    List<String> dateStringList = SEDateUtil.getDateRange(fromDate, toDate);
    // IndexReader[] subReaders = new IndexReader[dateStringList.size()];
    List<IndexReader> subReadersList = new ArrayList<IndexReader>();
    boolean flag = true;
    try {
        // Open a reader for each date directory
        for (int i = 0; i < dateStringList.size(); i++) {
            // Build the full path of that day's index directory
            String fullPath = SEFileUtil.buildFilePath(searchPath, dateStringList.get(i),
                    "yyyy-MM-dd");
            File file = SEFileUtil.pathToFile(fullPath);
            if (!file.isDirectory()) {
                if (logger.isDebugEnabled()) {
                    logger.debug("The directory is not exist: " + fullPath);
                }
                continue;
            }
            FSDirectory directory = FSDirectory.open(new File(fullPath));
            IndexReader subReader = IndexReader.open(directory);
            flag = false;
            subReadersList.add(subReader);
        }
        if (flag) {
            return null;
        }
        IndexReader[] subReaders = subReadersList
                .toArray(new IndexReader[subReadersList.size()]);
        if (logger.isDebugEnabled()) {
            logger.debug("Build search directory consume time: "
                    + (System.currentTimeMillis() - startTime));
            startTime = System.currentTimeMillis();
        }
        // Run the search
        ids = this.getSearchResult(subReaders, matchFlag, keyword, cateId, sessionId,
                countryId, urlGourpId);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (null != subReadersList) {
            subReadersList = null;
        }
    }
    if (logger.isDebugEnabled()) {
        Long endTime = (System.currentTimeMillis() - startTime);
        logger.debug("search end. Consume Time(s): " + endTime);
    }
    if (null != ids && !ids.isEmpty()) {
        // Sort the ids in ascending order
        Collections.sort(ids, new IndicatorComparator());
    }
    return ids;
}

/**
 * Build the query from the parameters and run it against the combined readers.
 */
private List<Long> getSearchResult(IndexReader[] subReaders, String matchFlag, String keyword,
        String cateId, String sessionId, String countryId,
        String urlGourpId) throws ParseException,
        CorruptIndexException, Exception {
    List<Long> result = null;
    PerFieldAnalyzerWrapper analyzer = buildIndexJob.kmsAnalyzer();
    IndexReader multiReader = new MultiReader(subReaders);

    BooleanQuery query = new BooleanQuery();
    // Fuzzy match: tokenize the keyword and search the "content" field
    if ("1".equals(matchFlag) && StringUtil.isNotBlank(keyword)) {
        QueryParser queryParser = new QueryParser(BuildIndexJob.LUCENE_VERSION, "content",
                analyzer);
        // The terms are combined with OR
        queryParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        query.add(queryParser.parse(QueryParser.escape(keyword.toLowerCase())), Occur.MUST);
    }
    // Exact match: the whole keyword must match
    else if ("0".equals(matchFlag) && StringUtils.isNotBlank(keyword)) {
        Query kQuery = new TermQuery(new Term("keyword", keyword.toLowerCase()));
        query.add(kQuery, Occur.MUST);
    }
    // Match on cateId
    if (StringUtils.isNotBlank(cateId)) {
        Query bQuery = new TermQuery(new Term("cateId", cateId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(sessionId)) {
        Query bQuery = new TermQuery(new Term("sessionId", sessionId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(countryId)) {
        Query bQuery = new TermQuery(new Term("countryId", countryId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(urlGourpId)) {
        Query bQuery = new TermQuery(new Term("urlGourpId", urlGourpId));
        query.add(bQuery, Occur.MUST);
    }
    Long startTime = System.currentTimeMillis();
    IndexSearcher search = new IndexSearcher(multiReader);
    // Return at most 200,000 hits; this cap comes from the business requirements
    TopDocs topDocs = search.search(query, 200000);
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    if (logger.isDebugEnabled()) {
        logger.debug("search result: " + scoreDocs.length);
        logger.debug("search consume time: " + (System.currentTimeMillis() - startTime));
        startTime = System.currentTimeMillis();
    }
    if (scoreDocs.length <= 0) {
        return null;
    }
    result = this.getIds(scoreDocs, search);
    if (logger.isDebugEnabled()) {
        logger.debug("Reader [id] consume time: " + (System.currentTimeMillis() - startTime));
    }
    return result;
}

/**
 * Collect all the ids from the matched documents.
 *
 * @param scoreDocs
 * @param search
 * @return
 * @throws CorruptIndexException
 * @throws IOException
 */
public List<Long> getIds(ScoreDoc[] scoreDocs, IndexSearcher search)
        throws CorruptIndexException, IOException {
    List<Long> ids = new ArrayList<Long>(scoreDocs.length);
    Map<String, FieldSelectorResult> fieldSelections = new HashMap<String, FieldSelectorResult>(1);
    fieldSelections.put("id", FieldSelectorResult.LOAD);
    FieldSelector fieldSelector = new MapFieldSelector(fieldSelections);

    // Collect the ids, loading only the "id" field of each document
    for (int i = 0; i < scoreDocs.length; i++) {
        Document doc = search.doc(scoreDocs[i].doc, fieldSelector);
        ids.add(Long.valueOf(doc.getFieldable("id").stringValue()));
    }
    return ids;
}

The key point above is that only the id field is loaded when fetching each document and the other fields are skipped, which makes retrieval much faster; the larger the documents, the bigger the win.

Another note: in performance tests, Windows and Linux differ a great deal, mainly because the default on Windows uses neither concurrent I/O nor memory mapping. The relevant Lucene source is:

/** Just like {@link #open(File)}, but allows you to
*  also specify a custom {@link LockFactory}. */
public static FSDirectory open(File path, LockFactory lockFactory) throws IOException {
if ((Constants.WINDOWS || Constants.SUN_OS || Constants.LINUX)
&& Constants.JRE_IS_64BIT && MMapDirectory.UNMAP_SUPPORTED) {
return new MMapDirectory(path, lockFactory);
} else if (Constants.WINDOWS) {
return new SimpleFSDirectory(path, lockFactory);
} else {
return new NIOFSDirectory(path, lockFactory);
}
}

So the performance gains only really show up when testing on 64-bit Linux.

Reposted from: http://xmgestapo.iteye.com/blog/1217606

Posted in lucene

Lucene 3.5: deleting from the index

Lucene offers two ways to delete indexed documents: delete a single document by its documentId, or delete all documents matching a Term.

public class MyDeleteIndexes {
	public static final String STORE_PATH = "lucene_index";
	public static void deleteIndexes(String field , String keyword) throws IOException{
		long startTime = System.currentTimeMillis();
		Directory dir = FSDirectory.open(new File(STORE_PATH));
		IndexReader reader = IndexReader.open(dir,false);
		Term term = new Term(field,keyword);
		reader.deleteDocuments(term);
		// A document can also be deleted by its documentId:
		//reader.deleteDocument(1);
		reader.flush();
		reader.close();
		//System.out.println(reader.lastModified(dir));
		long endTime = System.currentTimeMillis();
		System.out.println("total time: " + (endTime - startTime) + " ms");
	}
}

Posted in lucene

Using Lucene term vectors to speed up keyword highlighting

Reposted from "Lucene TermVector usage: related search and faster highlighting": hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html
public class TermVectorTest {
    Analyzer analyzer = new SimpleAnalyzer();
    Directory ramDir = new RAMDirectory();

    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        Document doc1 = new Document();
        doc1.add(new Field("title", "java", Store.YES, Index.ANALYZED));
        doc1.add(new Field("author", "callan", Store.YES, Index.ANALYZED));
        doc1.add(new Field("subject", "学java,用java的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        Document doc2 = new Document();
        doc2.add(new Field("title", "english", Store.YES, Index.ANALYZED));
        doc2.add(new Field("author", "wcq", Store.YES, Index.ANALYZED));
        doc2.add(new Field("subject", "薄荷味23让他", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        Document doc3 = new Document();
        doc3.add(new Field("title", "asp", Store.YES, Index.ANALYZED));
        doc3.add(new Field("author", "ca", Store.YES, Index.ANALYZED));
        doc3.add(new Field("subject", "说都给我一 23523条去", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc1);
        writer.addDocument(doc2);
        writer.addDocument(doc3);
        writer.optimize();
        writer.close();
    }

    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        IndexSearcher searcher = new IndexSearcher(reader);
        Term term = new Term("title", "java");   // look for the term "java" in the title field
        TermQuery query = new TermQuery(term);
        Hits hits = searcher.search(query);
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.println(doc.get("title"));
            System.out.println(doc.get("subject"));
            System.out.println("moreLike search: ");
            morelikeSearch(reader, hits.id(i));
        }
    }

    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // Fetch the term vector of the "subject" field for this document id: the field's terms
        // after analysis, together with their frequencies, positions, and related information
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < vector.size(); i++) {
            TermQuery tq = new TermQuery(new Term("subject",
                    vector.getTerms()[i]));   // one TermQuery per token stored in the vector
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        IndexSearcher searcher = new IndexSearcher(ramDir);
        Hits hits = searcher.search(query);
        // result display code omitted
    }

    // Using term vectors to speed up highlighting
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        IndexSearcher searcher = new IndexSearcher(reader);
        TermQuery query = new TermQuery(new Term("subject", "java"));
        Hits hits = searcher.search(query);
        // Highlighting setup
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        // 100 is the length of the context returned around the keyword; set it yourself,
        // since the whole body text is never returned
        highlighter.setTextFragmenter(new SimpleFragmenter(100));
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            TermPositionVector termFreqVector = (TermPositionVector) reader.getTermFreqVector(hits.id(i), "subject");
            TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");
            TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector);
            String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));
            System.out.println(doc.get("title"));
            System.out.println(result);
        }
    }

    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }
}

Posted in java, lucene

Learning Lucene's Term Vectors

A term vector treats a text field of a document (title, body, and so on) as a multi-dimensional vector space of term frequencies: each term is one dimension, and the value along that dimension is the term's frequency in the field. A term vector can also store the positions and offsets of the terms within the field after analysis.

To use term vectors, enable the term-vector option on the field when the index is created.

The Field term-vector options (a small field-creation sketch follows this list):

TermVector.YES: record the unique terms that occurred, and their counts, in each document, but do not store any positions or offsets information.

TermVector.WITH_POSITIONS: record the unique terms and their counts, and also the positions of each occurrence of every term, but no offsets.

TermVector.WITH_OFFSETS: record the unique terms and their counts, with the offsets (start & end character position) of each occurrence of every term, but no positions.

TermVector.WITH_POSITIONS_OFFSETS: store unique terms and their counts, along with positions and offsets.

TermVector.NO: do not store any term vector information.
If Index.NO is specified for a field, then you must also specify TermVector.NO.
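A minimal sketch (the field name and text are assumptions) of turning one of these options on when the field is created:

Document doc = new Document();
// Store positions and offsets for the "subject" field so they can be reused later,
// for example to speed up highlighting
doc.add(new Field("subject", "学java,用java的人很多",
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));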

 

Once an index is built this way, the term vector can be retrieved through IndexReader using the document id and the field name:
TermFreqVector termFreqVector = reader.getTermFreqVector(id, "FieldName");
Iterating this TermFreqVector yields each term and its frequency; if offsets and positions were stored in the index, they can be read as well (see the sketch below), which shortens the time needed to highlight keywords.
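A minimal sketch (the "subject" field name is an assumption) of walking a term vector: terms and frequencies come straight from the TermFreqVector, and offsets are available when the field was indexed WITH_OFFSETS or WITH_POSITIONS_OFFSETS, in which case the vector is a TermPositionVector:

TermFreqVector vector = reader.getTermFreqVector(id, "subject");
String[] terms = vector.getTerms();
int[] freqs = vector.getTermFrequencies();
for (int i = 0; i < terms.length; i++) {
    System.out.println(terms[i] + " -> " + freqs[i]);
    // Offsets are only present if the field stored them
    if (vector instanceof TermPositionVector) {
        TermVectorOffsetInfo[] offsets = ((TermPositionVector) vector).getOffsets(i);
        if (offsets != null) {
            for (TermVectorOffsetInfo o : offsets) {
                System.out.println("  offset " + o.getStartOffset() + "-" + o.getEndOffset());
            }
        }
    }
}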

Term vectors enable many features:

1) For example, comparing the similarity of two documents. Abstract a book as a document with author and subject fields, and compare two books through those two fields.
author is a multi-valued field (a book can have several authors), so the first step is to check whether the authors match:

String[] authors = doc.getValues("author");
BooleanQuery authorQuery = new BooleanQuery(); // #3
for (int i = 0; i < authors.length; i++) { // #3
    String author = authors[i]; // #3
    authorQuery.add(new TermQuery(new Term("author", author)), BooleanClause.Occur.SHOULD); // #3
}
authorQuery.setBoost(2.0f);

The boost of this query is raised at the end to mark the condition as important (a higher weight): if the authors match, the books are already quite similar.

The second step is where the term vector comes in. Here it is used very simply: just check whether the terms in the subject field's term vector also occur in the other document's subject:

TermFreqVector vector = // #4
        reader.getTermFreqVector(id, "subject"); // #4
BooleanQuery subjectQuery = new BooleanQuery(); // #4
for (int j = 0; j < vector.size(); j++) { // #4
    TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[j]));
    subjectQuery.add(tq, BooleanClause.Occur.SHOULD); // #4
}

2) What category?
This is a step up from the previous example: how to classify documents, again using the subject field's term vector.
For two documents we can compare the angle between their term vectors in the vector space; the smaller the angle, the more similar the documents.
Since this is classification, there is a training step: we must build one term vector per category as the reference against which other documents are compared.
A map of (term, frequency) entries represents a term vector, with n entries standing for n dimensions; another map links each category to its term vector. To create a category's term vector, iterate over every document in the category, take its term vector, and add it onto the category's term vector:
private void addTermFreqToMap(Map vectorMap, TermFreqVector termFreqVector) {
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();
    for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        if (vectorMap.containsKey(term)) {
            Integer value = (Integer) vectorMap.get(term);
            vectorMap.put(term, new Integer(value.intValue() + freqs[i]));
        } else {
            vectorMap.put(term, new Integer(freqs[i]));
        }
    }
}
First pull the term and frequency arrays out of the document's term vector, then for each term add the document's frequency onto the category's entry. Done.

With a term vector for each category, we can compute the angle between a document and a category:
cos θ = (A · B) / (|A| |B|)
A · B is the dot product: multiply the two vectors dimension by dimension and sum the results.
To simplify the computation here, assume a document's term frequencies are only 0 or 1, i.e. a term either occurs or it does not:
private double computeAngle(String[] words, String category) {
    // assume words are unique and only occur once
    Map vectorMap = (Map) categoryMap.get(category);
    int dotProduct = 0;
    int sumOfSquares = 0;
    for (int i = 0; i < words.length; i++) {
        String word = words[i];
        int categoryWordFreq = 0;
        if (vectorMap.containsKey(word)) {
            categoryWordFreq = ((Integer) vectorMap.get(word)).intValue();
        }
        dotProduct += categoryWordFreq; // optimized because we assume frequency in words is 1
        sumOfSquares += categoryWordFreq * categoryWordFreq;
    }
    double denominator;
    if (sumOfSquares == words.length) {
        // avoid precision issues for special case
        denominator = sumOfSquares; // sqrt x * sqrt x = x
    } else {
        denominator = Math.sqrt(sumOfSquares) *
                Math.sqrt(words.length);
    }
    double ratio = dotProduct / denominator;
    return Math.acos(ratio);
}
This function simply implements the formula above; it is fairly straightforward.

 

3) MoreLikeThis

For finding similar documents, Lucene also provides a fairly efficient interface: MoreLikeThis.

http://lucene.apache.org/java/1_9_1/api/org/apache/lucene/search/similar/MoreLikeThis.html

With the approach above we could compute the cosine between every pair of documents and sort by it to find the most similar ones, but the computation is enormous and becomes unacceptable once the number of documents is large. There are dedicated optimizations of the cosine method that reduce the cost dramatically; that route is precise, but the bar is high.

The idea behind this interface is simple: from a document, extract the interesting terms (the ones with high tf×idf), then search with Lucene for documents containing the same terms and treat them as similar. The advantage is efficiency; the drawback is reduced precision. The interface exposes many parameters for tuning how the interesting terms are chosen.

MoreLikeThis mlt = new MoreLikeThis(ir);

// orig source of doc you want to find similarities to
Reader target = ...

Query query = mlt.like(target);

Hits hits = is.search(query);

Usage is that simple, and this is how the similar documents are obtained.

 

The interface is quite flexible: instead of calling like() directly, you can call

retrieveInterestingTerms(Reader r)

to obtain the interesting terms yourself and then process them however you need (a minimal sketch follows).
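A minimal sketch (the field name, thresholds, and sample text are assumptions; depending on the Lucene version retrieveInterestingTerms may also take the field name as a second argument):

MoreLikeThis mlt = new MoreLikeThis(ir);
mlt.setFieldNames(new String[] { "subject" });  // which fields to mine for terms
mlt.setMinTermFreq(1);                          // keep terms even if they occur only once
mlt.setMinDocFreq(1);                           // ...and even if only one document contains them
String[] interesting = mlt.retrieveInterestingTerms(new StringReader("学java,用java的人很多"));
for (String t : interesting) {
    System.out.println(t);
}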
