Lucene 3.X Custom Sorting

This is an excerpt; for the original see http://www.oschina.net/code/snippet_54100_6338

 

/* Add a few shops as index data */
addPoint(writer, "starbuck", "cafe", 2, 0);
addPoint(writer, "El Charro", "restaurant", 1, 2);
addPoint(writer, "Cafe Poca Cosa", "restaurant", 5, 9);
addPoint(writer, "Los Betos", "restaurant", 9, 6);
addPoint(writer, "Nico's Taco Shop", "restaurant", 3, 8);
addPoint(writer, "central perk", "cafe", 3, 8);
writer.close();

searcher = new IndexSearcher(directory);

/* Query for shops whose type is "restaurant" */
query = new TermQuery(new Term("type", "restaurant"));
}

/* name is the shop name, type its category; x and y are the shop's coordinates */
private static void addPoint(IndexWriter writer, String name,
        String type, int x, int y) throws IOException {
    Document doc = new Document();
    doc.add(new Field("name", name, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("type", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("location", (x + "," + y), Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
}

public void testNearestRestaurantToHome() throws Exception {
    /* Use the custom sort, passing in the coordinates (0, 0) */
    DistanceComparator distanceComparator = new DistanceComparator(0, 0);
    Sort sort = new Sort(new SortField("location", distanceComparator));
    /* Fetch the top 3 results */

/* Extend FieldComparatorSource */
public class DistanceComparator extends FieldComparatorSource {
    private static final long serialVersionUID = 1L;
    private int x;
    private int y;

    /* The x, y coordinates are passed to the constructor */
    public DistanceComparator(int x, int y) { // #2
        this.x = x;
        this.y = y;
    }

    /* Implement newComparator to produce a new FieldComparator */
    public FieldComparator newComparator(String fieldname, int numHits,
            int sortPos, boolean reversed) throws IOException {
        return new DistanceScoreDocLookupComparator(fieldname, numHits);
    }

    /* Extend FieldComparator */
    private class DistanceScoreDocLookupComparator // #4
            extends FieldComparator {
        /* Distances of the top N documents */
        private float[] values; // #6
        /* The largest distance among the top N documents */
        private float bottom; // #7
        private String fieldName;
        /* Holds the values of the "location" field */
        private String[] currentReaderValues;

        public DistanceScoreDocLookupComparator(String fieldName, int numHits) throws IOException {
            values = new float[numHits];
            this.fieldName = fieldName;
        }

        public void setNextReader(IndexReader reader, int docBase) throws IOException {
            this.currentReaderValues = FieldCache.DEFAULT.getStrings(reader, this.fieldName);
        }

        /** Compare two distances */
        public int compare(int slot1, int slot2) { // #11
            if (values[slot1] < values[slot2])
                return -1; // #11
            if (values[slot1] > values[slot2])
                return 1; // #11
            return 0; // #11
        }

        /** Record the largest distance among the top N */
        public void setBottom(int slot) { // #12
            bottom = values[slot];
        }

        /** Get the value held in slot "slot" of the top N */
        public Comparable value(int slot) { // #15
            return new Float(values[slot]); // #15
        } // #15

        /* Compare a new doc against the largest distance */
        public int compareBottom(int doc) throws IOException {
            float distance = distance(currentReaderValues[doc]);

            if (bottom < distance)
                return -1; // #13
            if (bottom > distance)
                return 1; // #13
            return 0; // #13
        }

        /* Insert a new doc's value into the top N */
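The excerpt stops here. A minimal sketch, an assumption rather than the original author's code, of the two pieces the excerpt cuts off: copy(), which stores a new document's distance into its top-N slot, and the distance() helper, taken here as a plain Euclidean distance from the "x,y" string stored in the location field:

        /* Copy the new doc's value into slot "slot" of the top N (assumed implementation) */
        public void copy(int slot, int doc) throws IOException {
            values[slot] = distance(currentReaderValues[doc]);
        }

        /* Parse "x,y" from the location field and compute the distance to the query point (assumed) */
        private float distance(String location) {
            String[] xy = location.split(",");
            int docX = Integer.parseInt(xy[0]);
            int docY = Integer.parseInt(xy[1]);
            return (float) Math.sqrt((x - docX) * (x - docX) + (y - docY) * (y - docY));
        }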

 

Posted in lucene

Lucene 3.5 multi-threaded search explained

Reposted from a note on multi-threaded multi-index search in Lucene 3.4; 3.5 changes little relative to 3.4.

hi.baidu.com/zuimao2004/blog/item/886066727245ab018601b05a.html

Multi-threaded multi-index search:

Since 3.1.0 Lucene no longer recommends ParallelMultiSearcher. Its underlying implementation was reworked and a thread-pool mechanism was introduced: if you do not supply a pool, a default one is used. It can still be used in 3.4, but if the Deprecated strikethrough bothers you, there are two alternatives:

1. ParallelReader. Note that with ParallelReader the combined indexes must contain documents that match in number and content, including the order and manner in which they were created and modified: if you add a document to one index you must add the same document, in the same way, to the others. In this respect it differs from ParallelMultiSearcher; ParallelReader merely speeds up reading the index and does not care about anything beyond that, so strictly speaking it is not the multi-threaded multi-index search needed in practice, because real projects usually have index directories that differ both in how they are built and in what they store.

2. Starting with 3.1.0, ParallelMultiSearcher is deprecated in favor of the IndexSearcher constructor that takes an ExecutorService (see the official documentation). The ExecutorService lets you supply your own thread pool.

public IndexSearcher(IndexReader r, ExecutorService executor): each index segment is searched by its own thread from the executor pool. Note that closing the IndexSearcher does not close the executor pool; you must shut it down yourself. This also means several searchers can share one pool.
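For completeness, a minimal sketch (not from the original post) of the ExecutorService-based constructor described above, reusing the two indexes built by the code that follows; the pool size and the query term are arbitrary assumptions:

@Test
public void testExecutorServiceSearch() throws Exception {
    // A thread pool the searcher will draw from; 4 threads is an arbitrary choice
    ExecutorService executor = Executors.newFixedThreadPool(4);
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    MultiReader multiReader = new MultiReader(new IndexReader[]{reader1, reader2}, true);
    // Each index segment is searched on its own thread taken from the pool
    IndexSearcher searcher = new IndexSearcher(multiReader, executor);
    TopDocs topDocs = searcher.search(new TermQuery(new Term("bookname", "钢")), 10);
    System.out.println("total hits: " + topDocs.totalHits);
    searcher.close();      // closing the searcher does NOT shut down the pool
    multiReader.close();
    executor.shutdown();   // the pool must be shut down explicitly
}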

Code:

String INDEX_STORE_PATH1 = "./index1";
String INDEX_STORE_PATH2 = "./index2";
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);

/**
 * Create multiple index directories,
 * storing the indexes in different directories.
 */
@Test
public void createMultiIndexs() throws Exception {
    IndexWriterConfig iwc1 = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    iwc1.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriterConfig iwc2 = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    iwc2.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // Create the first index
    IndexWriter writer1 = new IndexWriter(FSDirectory.open(new File(INDEX_STORE_PATH1)), iwc1);
    Document doc1 = new Document();
    Document doc2 = new Document();
    Document doc3 = new Document();

    Field bookname1 = new Field("bookname", "钢铁是怎样炼成的", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice1 = new Field("price", "20.5", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname2 = new Field("bookname", "钢铁战士", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice2 = new Field("price", "18.4", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname3 = new Field("bookname", "钢和铁是两种金属", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice3 = new Field("price", "20.5", Field.Store.YES, Field.Index.NOT_ANALYZED);

    doc1.add(bookname1);
    doc1.add(bookprice1);

    doc2.add(bookname2);
    doc2.add(bookprice2);

    doc3.add(bookname3);
    doc3.add(bookprice3);

    writer1.addDocument(doc1);
    writer1.addDocument(doc2);
    writer1.addDocument(doc3);
    writer1.close();

    // Create the second index
    IndexWriter writer2 = new IndexWriter(FSDirectory.open(new File(INDEX_STORE_PATH2)), iwc2);
    Document doc4 = new Document();
    Document doc5 = new Document();
    Document doc6 = new Document();

    Field bookname4 = new Field("bookname", "钢要比铁有更多的碳元素", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice4 = new Field("price", "22", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname5 = new Field("bookname", "钢和铁是两种重要的金属", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice5 = new Field("price", "15.9", Field.Store.YES, Field.Index.NOT_ANALYZED);

    Field bookname6 = new Field("bookname", "钢铁是很重要的金属材料", Field.Store.YES, Field.Index.ANALYZED);
    Field bookprice6 = new Field("price", "19.0", Field.Store.YES, Field.Index.NOT_ANALYZED);

    doc4.add(bookname4);
    doc4.add(bookprice4);

    doc5.add(bookname5);
    doc5.add(bookprice5);

    doc6.add(bookname6);
    doc6.add(bookprice6);

    writer2.addDocument(doc4);
    writer2.addDocument(doc5);
    writer2.addDocument(doc6);
    writer2.close();
}

/**
 * Multi-index search:
 * MultiSearcher is deprecated in Lucene 3.4,
 * so MultiReader is used instead.
 * @throws Exception
 */
@Test
public void testMultiReader() throws Exception {
    // Open the two IndexReaders
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    // Build the MultiReader
    MultiReader multiReader = new MultiReader(new IndexReader[]{reader1, reader2}, true);

    Term t1 = new Term("bookname", "和");
    TermDocs docs = multiReader.termDocs(t1);
    System.out.println("The search matched " + multiReader.docFreq(t1) + " documents");
    while (docs.next()) {
        System.out.println(multiReader.document(docs.doc()).toString());
        System.out.println("--------------------------------");
    }
}

/**
 * Multi-index search via ParallelReader:
 * @throws Exception
 */
@Test
public void testParallelReader() throws Exception {
    // Open the two IndexReaders
    IndexReader reader1 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH1)));
    IndexReader reader2 = IndexReader.open(FSDirectory.open(new File(INDEX_STORE_PATH2)));
    // Build the ParallelReader
    ParallelReader parallelReader = new ParallelReader();
    parallelReader.add(reader1);
    parallelReader.add(reader2);
    Term t1 = new Term("bookname", "和");
    TermDocs docs = parallelReader.termDocs(t1);
    System.out.println("The search matched " + parallelReader.docFreq(t1) + " documents");
    while (docs.next()) {
        System.out.println(parallelReader.document(docs.doc()).toString());
        System.out.println("--------------------------------");
    }
}
Posted in lucene

Lucene 3 IndexWriter parameters explained

IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.UNLIMITED);

1. MaxBufferedDocs

MaxBufferedDocs is disabled by default, because Lucene uses another parameter (RAMBufferSizeMB) to control the document buffer.
The two parameters can be used together: whichever threshold is hit first triggers a flush to disk, producing a new index segment.

2. RAMBufferSizeMB

Controls the memory ceiling used for buffering indexed documents; once the buffer reaches the ceiling it is flushed to disk. Generally, the larger the buffer, the faster indexing goes. This parameter is particularly useful when document sizes are hard to predict, since it avoids an OutOfMemoryError.

3. MergeFactor

setMergeFactor controls how often segments are merged: it determines how many documents go into one index block and how many blocks may accumulate on disk before they are merged into a larger one. A larger MergeFactor makes indexing faster. The default is 10; it is recommended to raise it before building a large index.
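A minimal sketch (the values are arbitrary assumptions) of tuning the three parameters above on the 3.0-style writer shown in this post; in 3.1+ the same knobs live on IndexWriterConfig and LogMergePolicy:

IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
        true, IndexWriter.MaxFieldLength.UNLIMITED);
writer.setMaxBufferedDocs(10000);  // flush once this many documents are buffered...
writer.setRAMBufferSizeMB(64.0);   // ...or once the buffer reaches 64 MB, whichever comes first
writer.setMergeFactor(50);         // merge once 50 segments accumulate at one level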

Posted in lucene

Lucene 3.5 getting-started example

1. Building the index

Core classes

IndexWriterConfig: the configuration object for building an index; it holds the Analyzer used to tokenize the input.

IndexWriter: the class that writes the index.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class FileIndexer {
    private IndexWriter indexWriter;
    // Directory where the index files are stored
    private File fileIndex = new File(FileSearchConstant.FILE_INDEX);
    // Directory holding the files to be indexed
    private File fileDir = new File(FileSearchConstant.FILE_DIR);
    // private static Logger logger = LoggerFactory.getLogger(FileIndexer.class);

    public static void main(String[] args) throws IOException {
        FileIndexer fileIndexer = new FileIndexer();
        fileIndexer.buildIndex();
    }

    public void buildIndex() throws IOException {
        boolean isCreate = true;
        // Configuration for building the index; it wraps an analyzer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35,
                                                                    new StandardAnalyzer(Version.LUCENE_35));
        // Choose whether we create a new index or append/update an existing one
        setModel(isCreate, indexWriterConfig);
        // The index writer: first argument is where the index lives, second is the config
        indexWriter = new IndexWriter(FSDirectory.open(fileIndex), indexWriterConfig);
        long startTime = System.currentTimeMillis();
        // Build the index
        indexDocs(fileDir, indexWriter);
        // Very useful when adding to an existing index: merges the scattered segment files back together
        indexWriter.forceMerge(1);
        // indexWriter.commit();
        // Close the writer
        indexWriter.close();
        long endTime = System.currentTimeMillis();
        System.out.println("cost: " + (endTime - startTime) + " ms");
    }

    private void setModel(boolean isCreate, IndexWriterConfig indexWriterConfig) {
        if (isCreate) {
            indexWriterConfig.setOpenMode(OpenMode.CREATE);
        } else {
            indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
    }

    private void indexDocs(File fileDir, IndexWriter indexWriter) throws CorruptIndexException, IOException {
        if (fileDir.canRead()) {
            if (fileDir.isDirectory()) {
                String[] listFiles = fileDir.list();
                for (String file : listFiles) {
                    indexDocs(new File(fileDir, file), indexWriter);
                }
            } else {
                // A Document represents one indexed item
                Document doc = new Document();
                // A Field is one indexed attribute; here we index the file path
                Field pathField = new Field(FileSearchConstant.PATH, fileDir.getPath(), Field.Store.YES,
                                            Field.Index.NOT_ANALYZED_NO_NORMS);
                /* Two notions matter when building an index, one of them being
                 * Document Frequency (df): how many documents contain a given term; the larger df is,
                 * the less important the term. DOCS_ONLY means only the documents a term occurs in
                 * are recorded, without frequencies or positions.
                 */
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);
                // NumericField modifyField = new NumericField(IndexerConstant.MODIFIED);
                // modifyField.setLongValue(fileDir.lastModified());
                // doc.add(modifyField);
                // Index the file contents
                Field contentField = new Field(FileSearchConstant.CONTENTS, getContents(fileDir), Field.Store.YES,
                                               Field.Index.ANALYZED);
                // FileInputStream fileInputStream = getFileInputStream(fileDir);
                // doc.add(new Field(IndexerConstant.CONTENTS, new BufferedReader(new InputStreamReader(fileInputStream,
                //     "UTF-8"))));
                // fileInputStream.close();
                contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(contentField);
                // Add the document, or update it if it may already be indexed
                if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
                    indexWriter.addDocument(doc);
                } else if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE_OR_APPEND) {
                    indexWriter.updateDocument(new Term(FileSearchConstant.PATH, fileDir.getPath()), doc);
                }
            }
        }
    }

    public String getContentByUtils(File fileDir) {
        String content = null;
        try {
            content = IOUtils.toString(new FileInputStream(fileDir), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    /*
     * Not recommended: readLine() drops the line separators, so the concatenated text
     * cannot be tokenized correctly when the index is built. Prefer getContentByUtils above.
     */
    public String getContents(File fileDir) {
        StringBuffer result = new StringBuffer();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileDir), "UTF-8"));
            String temp = null;
            while ((temp = reader.readLine()) != null) {
                result.append(temp);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            if (null != reader) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result.toString();
    }
}

 

 

 

2. Searching the index

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class FileSearcher {
    private String content = "haha";

    public void search() throws IOException, ParseException {
        // FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)) picks the directory
        // implementation best suited to your OS: simple file, memory-mapped, or NIO
        IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)));
        IndexSearcher searcher = new IndexSearcher(indexReader);
        // Build the query for the content we are looking for
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
        QueryParser queryParser = new QueryParser(Version.LUCENE_35, FileSearchConstant.CONTENTS, analyzer);
        // Parse the query text
        Query query = queryParser.parse(content);
        System.out.println("Searching for: " + query.toString());
        // TopDocs holds the results; 10 is the maximum number of hits to fetch. In real
        // applications you usually paginate, so take care with this value.
        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] docs = topDocs.scoreDocs;
        if (null != docs) {
            for (int i = 0; i < docs.length; i++) {
                ScoreDoc scoreDoc = docs[i];
                // The key step: load the stored document
                Document doc = searcher.doc(scoreDoc.doc);
                // Pull out the stored fields we want
                String contents = doc.get(FileSearchConstant.CONTENTS);
                String path = doc.get(FileSearchConstant.PATH);
                System.out.println(contents);
                System.out.println(path);
            }
        }
    }

    public static void main(String[] args) throws IOException, ParseException {
        FileSearcher fileSearcher = new FileSearcher();
        fileSearcher.search();
    }
}

Posted in lucene

Lucene + IKAnalyzer + Highlighter example

Using the IKAnalyzer Chinese tokenizer and the Highlighter.

package demo.test;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestIKAnalyzer {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader("永和服装饰品有限公司"));
        // The 2.x style; no longer supported since 3.0
        /*Token token = new Token();
        while (tokenStream.next(token) != null) {
            System.out.println(token.term());
        }*/
        // The 3.x style
        TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) tokenStream.getAttribute(TypeAttribute.class);

        while (tokenStream.incrementToken()) {
            System.out.print(termAtt.term());
            System.out.print(' ');
            System.out.println(typeAtt.type());
        }
    }
}


Tokenization result: 永和 和服 服装 装饰品 装饰 饰品 有限公司 有限 公司

2. Build the index with IKAnalyzer

package demo.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class CreatIndex {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException {
        String path = "index";                  // index directory
        Analyzer analyzer = new IKAnalyzer();   // the analyzer to use
        IndexWriter iwriter = new IndexWriter(path, analyzer, true);
        File dir = new File("data");            // directory with the data files to index
        File[] files = dir.listFiles();
        for (int i = 0; i < files.length; i++) {
            Document doc = new Document();
            File file = files[i];
            FileInputStream fis = new FileInputStream(file);
            String content = "";
            BufferedReader reader = new BufferedReader(new InputStreamReader(fis));

            StringBuffer buffer = new StringBuffer("");
            content = reader.readLine();
            while (content != null) {
                buffer.append(content);
                content = reader.readLine();
            }
            doc.add(new Field("title", file.getName(), Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("content", buffer.toString(), Field.Store.YES, Field.Index.ANALYZED));
            iwriter.addDocument(doc);
        }
        iwriter.close();
    }
}


3. Query the index and highlight the results

package demo.test;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TestHighlighter {

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws IOException, InvalidTokenOffsetsException {
        String path = "index";  // index directory
        Directory dir = FSDirectory.getDirectory(new File(path));
        IndexSearcher search = new IndexSearcher(dir);
        Term term = new Term("content", "纯粹");
        Query query = new TermQuery(term);
        TopDocs topDocs = search.search(query, 10);
        ScoreDoc[] hits = topDocs.scoreDocs;
        // Plain results, without highlighting
        for (int i = 0; i < hits.length; i++) {
            Document doc = search.doc(hits[i].doc);
            System.out.print(doc.get("title") + ":");
            System.out.println(doc.get("content"));
        }
        // Highlighting setup
        Analyzer analyzer = new IKAnalyzer();  // the analyzer
        // The highlight format, i.e. the prefix/suffix wrapped around matched terms
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<B>", "</B>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        // Number of characters returned per fragment; like a search engine, only part of the
        // content is shown rather than the whole document
        highlighter.setTextFragmenter(new SimpleFragmenter(150));
        for (int i = 0; i < hits.length; i++) {
            Document doc = search.doc(hits[i].doc);
            TokenStream tokenStream = analyzer.tokenStream("", new StringReader(doc.get("content")));
            String str = highlighter.getBestFragment(tokenStream, doc.get("content"));
            System.out.println(str);
        }
    }
}


Posted in lucene

Lucene 3.3 multi-directory indexes

A recent project needed Lucene; when it started, Lucene 3.4 had not been released yet, so the then-latest 3.3 was chosen. The business background: the files to search are TXT files that grow incrementally every day and are kept forever. Each file stores structured records of the form: Id|name|address|date

Searches are by name, address, and date, and the date must support ranges spanning time periods.

The business produces one file per day, so all records within a file share the same date (guaranteed by the data provider); a day's file can reach roughly 200 MB.

Since the files grow incrementally, searches can span up to three months, and the response time must stay within 2 s, putting everything in a single index would let it grow without bound, and both searching and optimizing that index would become painful.

Given this, the following scheme was adopted:

1. Partition the indexes by file date; this requires the data provider to include the date in each day's file name.

2. Generate a directory structure such as 2011/10/2011-10-11 from the date, and store each day's index in the corresponding date folder.

The benefits are:

1) The date no longer has to be handled inside the index source; the date field can simply be dropped, which shrinks the index.

2) No range query is needed at search time. To search 2011-10-01 through 2011-10-31, generate the 31 directory paths (2011/10/2011-10-01, 2011/10/2011-10-02, and so on) and search those directories directly with a multi-directory search (a date-to-directory sketch follows this list).

3) Any day's index can be rebuilt at any time; because the indexes are per-directory, rebuilding is fast and no dedicated index optimization is needed.
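A minimal sketch (not the author's SEDateUtil/SEFileUtil helpers; the base path is an assumption) of expanding a date range into the per-day index directories described above:

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

public class IndexDirRange {
    /** Expand [from, to] (both "yyyy-MM-dd") into one index directory per day. */
    public static List<String> dirsBetween(String from, String to) throws Exception {
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        SimpleDateFormat dirFormat = new SimpleDateFormat("yyyy/MM/yyyy-MM-dd");
        List<String> dirs = new ArrayList<String>();
        Calendar day = Calendar.getInstance();
        day.setTime(dayFormat.parse(from));
        Date end = dayFormat.parse(to);
        while (!day.getTime().after(end)) {
            dirs.add("./index/" + dirFormat.format(day.getTime())); // e.g. ./index/2011/10/2011-10-11
            day.add(Calendar.DAY_OF_MONTH, 1);
        }
        return dirs;
    }
}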

/**
 * Use a different analyzer for different fields.
 *
 * @return
 */
public PerFieldAnalyzerWrapper kmsAnalyzer() {
    Analyzer standardAnalyzer = new StandardAnalyzer(LUCENE_VERSION);

    Analyzer kwAnalyzer = new KeywordAnalyzer();
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(kwAnalyzer);
    analyzer.addAnalyzer("content", standardAnalyzer);
    return analyzer;
}

// Build the IndexWriter

private IndexWriter indexWriter33(String indexPath) throws CorruptIndexException,
        LockObtainFailedException, IOException {

    File file = new File(indexPath);
    LogMergePolicy policy = new LogDocMergePolicy();

    // setUseCompoundFile makes Lucene merge the many per-segment files into a single .cfs
    // file when building the index. This reduces the number of index files and has a
    // noticeable effect on later search efficiency.
    // Compound storage (true means the compound index format)
    policy.setUseCompoundFile(true);

    // Merge factor: once this many index blocks accumulate on disk, they are merged into one larger block
    policy.setMergeFactor(5000);
    IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, this.kmsAnalyzer());
    config.setOpenMode(OpenMode.CREATE);
    config.setMergePolicy(policy);

    // Maximum number of buffered documents; size it to the available memory, larger values speed up indexing
    config.setMaxBufferedDocs(200000);

    // The directory where the index is stored
    FSDirectory directory = FSDirectory.open(file);

    // The index writer
    IndexWriter indexWriter = new IndexWriter(directory, config);

    return indexWriter;
}

/**
 * Build a Document from one line of the source file.
 *
 * @param lineRecord one line of the source file
 * @return
 */
private Document buildDocument(String lineRecord, boolean fileStatus) {
    Document doc = new Document();
    String[] columns = lineRecord.split(String.valueOf((char) 5));
    // 3 columns: an on-site keyword record
    if (columns.length == 3) {
        Field cateId = new Field("cateId", columns[2], Store.NO, Index.ANALYZED);
        doc.add(cateId);
    }
    // 5 columns: an off-site keyword record
    else if (columns.length == 5) {
        Field sessionId = new Field("sessionId", columns[2], Store.NO, Index.ANALYZED);
        Field countryId = new Field("countryId", columns[3], Store.NO, Index.ANALYZED);
        Field urlGourpId = new Field("urlGourpId", columns[4], Store.NO, Index.ANALYZED);
        doc.add(sessionId);
        doc.add(countryId);
        doc.add(urlGourpId);
    } else {
        logger.error("The file content [" + lineRecord + "] error.");
        fileStatus = false;
        return new Document();
    }
    Field id = new Field("id", columns[0], Store.YES, Index.ANALYZED);
    Field keyword = new Field("keyword", columns[1], Store.NO, Index.ANALYZED);
    // The content field is the one that gets tokenized
    Field content = new Field("content", columns[1], Store.NO, Index.ANALYZED);
    // Field date = new Field("date", columns[2], Store.YES, Index.ANALYZED);

    doc.add(id);
    doc.add(keyword);
    doc.add(content);
    // doc.add(date);
    return doc;
}

public void createIndex(String srcPath, String desPath) {
    // Flag marking whether the file content is well-formed
    boolean fileStatus = true;
    String path = null;
    // Collect all *.dat files
    List<File> fileList = SEFileUtil.getSrcFiles(SEFileUtil.pathToFile(srcPath),
            FILE_SUFFIX_DAT);

    // Collect all *.lock files
    List<File> lockFileList = SEFileUtil.getSrcFiles(SEFileUtil.pathToFile(srcPath),
            FILE_SUFFIX_LOCK);

    // Build the indexes
    label0: for (File file : fileList) {
        IndexWriter writer = null;
        BufferedReader br = null;
        // Build the index writer
        try {
            String prxFileName = file.getName().substring(0, file.getName().indexOf("_"));

            // Skip files that are still being generated
            if (lockFileList != null && !lockFileList.isEmpty()) {
                for (File lockFile : lockFileList) {
                    String preLockFileName = lockFile.getName().substring(0,
                            file.getName().indexOf("_"));
                    if (preLockFileName.equalsIgnoreCase(prxFileName)) {
                        lockFileList.remove(lockFile);
                        continue label0;
                    }
                }
            }
            // Build the path where this file's index will be stored
            path = SEFileUtil.buildFilePath(desPath, prxFileName, "yyyyMMdd");
            if (logger.isDebugEnabled()) {
                logger.debug("The index file path: " + path);
            }
            writer = this.indexWriter33(SEFileUtil.createDirectory(path));
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
            String record = null;

            // Write the index
            while (StringUtils.isNotBlank(record = br.readLine())) {
                writer.addDocument(this.buildDocument(record, fileStatus));
            }
            writer.optimize();
            writer.commit();
        } catch (Exception e) {
            e.printStackTrace();
            return;
        } finally {
            this.close(writer, br);
        }
        // Do not delete the source file when its content failed to parse
        if (fileStatus) {
            if (StringUtils.isNotBlank(this.getIndexCopyToIP())) {
                String[] ipArray = this.getIndexCopyToIP().split("\\|");
                for (String ip : ipArray) {
                    int exitValue = this.copyIndex(ip.trim(), path);
                    if (0 != exitValue) {
                        logger.error("^_^ Copy index directory [" + path + "] to [" + ip
                                + "] failed.");
                    }
                }
            }
            // Delete the source file
            boolean flag = SEFileUtil.deleteFile(file);
            if (!flag) {
                logger.error("Delete file failed: " + file.getPath());
            }
        }
    }
}

Searching comes next. My requirements are fairly simple, so the search code is simple too; the one notable point is that the results only need the ID value from each hit.

Not knowing Lucene well at first, I loaded the entire Document for each hit, which made retrieval slow; switching to MapFieldSelector later made it several times faster. Its usage is shown below.

The multi-directory search follows.

/**
 * The main query method: builds the search criteria and the multi-directory readers.
 */

public List<Long> searchIndex(Map<String, String> paramMap) {
    Long startTime = null;
    if (logger.isDebugEnabled()) {
        startTime = System.currentTimeMillis();
        logger.debug("^_^ start search: " + paramMap);
    }
    List<Long> ids = null;
    // Search parameters
    String keyword = paramMap.get("keyword");       // keyword
    String cateId = paramMap.get("cateId");         // category id
    String matchFlag = paramMap.get("matchFlag");   // match mode: 0 = exact, 1 = fuzzy
    String cateType = paramMap.get("cateType");     // 02 = posting category, 03 = display category
    String siteWord = paramMap.get("siteWord");     // 0 = on-site keyword, 1 = off-site keyword
    String sessionId = paramMap.get("sessionId");   // source
    String countryId = paramMap.get("countryId");   // country
    String urlGourpId = paramMap.get("urlGourpId"); // url group
    String fromDate = paramMap.get("startDate");    // start date
    String toDate = paramMap.get("endDate");        // end date

    // Resolve the base search directory
    String searchPath = this.getSearchPath(siteWord, cateType);

    // Compute every date in the range
    List<String> dateStringList = SEDateUtil.getDateRange(fromDate, toDate);
    // IndexReader[] subReaders = new IndexReader[dateStringList.size()];
    List<IndexReader> subReadersList = new ArrayList<IndexReader>();
    boolean flag = true;
    try {
        // Open a reader for each date directory
        for (int i = 0; i < dateStringList.size(); i++) {
            // Build the full path of that day's index directory
            String fullPath = SEFileUtil.buildFilePath(searchPath, dateStringList.get(i),
                    "yyyy-MM-dd");
            File file = SEFileUtil.pathToFile(fullPath);
            if (!file.isDirectory()) {
                if (logger.isDebugEnabled()) {
                    logger.debug("The directory is not exist: " + fullPath);
                }
                continue;
            }
            FSDirectory directory = FSDirectory.open(new File(fullPath));
            IndexReader subReader = IndexReader.open(directory);
            flag = false;
            subReadersList.add(subReader);
        }
        if (flag) {
            return null;
        }
        IndexReader[] subReaders = subReadersList
                .toArray(new IndexReader[subReadersList.size()]);
        if (logger.isDebugEnabled()) {
            logger.debug("Build search directory consume time: "
                    + (System.currentTimeMillis() - startTime));
            startTime = System.currentTimeMillis();
        }
        // Run the search
        ids = this.getSearchResult(subReaders, matchFlag, keyword, cateId, sessionId,
                countryId, urlGourpId);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (null != subReadersList) {
            subReadersList = null;
        }
    }
    if (logger.isDebugEnabled()) {
        Long endTime = (System.currentTimeMillis() - startTime);
        logger.debug("search end. Consume Time(s): " + endTime);
    }
    if (null != ids && !ids.isEmpty()) {
        // Sort the ids in ascending order
        Collections.sort(ids, new IndicatorComparator());
    }
    return ids;
}

/**
 * Build the query from the parameters and run it against the combined readers.
 */
private List<Long> getSearchResult(IndexReader[] subReaders, String matchFlag, String keyword,
        String cateId, String sessionId, String countryId,
        String urlGourpId) throws ParseException,
        CorruptIndexException, Exception {
    List<Long> result = null;
    PerFieldAnalyzerWrapper analyzer = buildIndexJob.kmsAnalyzer();
    IndexReader multiReader = new MultiReader(subReaders);

    BooleanQuery query = new BooleanQuery();
    // Fuzzy match: tokenize the keyword and search the "content" field
    if ("1".equals(matchFlag) && StringUtil.isNotBlank(keyword)) {
        QueryParser queryParser = new QueryParser(BuildIndexJob.LUCENE_VERSION, "content",
                analyzer);
        // The terms are combined with OR
        queryParser.setDefaultOperator(QueryParser.OR_OPERATOR);
        query.add(queryParser.parse(QueryParser.escape(keyword.toLowerCase())), Occur.MUST);
    }
    // Exact match: the whole keyword must match
    else if ("0".equals(matchFlag) && StringUtils.isNotBlank(keyword)) {
        Query kQuery = new TermQuery(new Term("keyword", keyword.toLowerCase()));
        query.add(kQuery, Occur.MUST);
    }
    // Match on cateId
    if (StringUtils.isNotBlank(cateId)) {
        Query bQuery = new TermQuery(new Term("cateId", cateId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(sessionId)) {
        Query bQuery = new TermQuery(new Term("sessionId", sessionId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(countryId)) {
        Query bQuery = new TermQuery(new Term("countryId", countryId));
        query.add(bQuery, Occur.MUST);
    }
    if (StringUtils.isNotBlank(urlGourpId)) {
        Query bQuery = new TermQuery(new Term("urlGourpId", urlGourpId));
        query.add(bQuery, Occur.MUST);
    }
    Long startTime = System.currentTimeMillis();
    IndexSearcher search = new IndexSearcher(multiReader);
    // Return at most 200,000 hits; this cap comes from the business requirements
    TopDocs topDocs = search.search(query, 200000);
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    if (logger.isDebugEnabled()) {
        logger.debug("search result: " + scoreDocs.length);
        logger.debug("search consume time: " + (System.currentTimeMillis() - startTime));
        startTime = System.currentTimeMillis();
    }
    if (scoreDocs.length <= 0) {
        return null;
    }
    result = this.getIds(scoreDocs, search);
    if (logger.isDebugEnabled()) {
        logger.debug("Reader [id] consume time: " + (System.currentTimeMillis() - startTime));
    }
    return result;
}

/**
 * Collect all the ids from the matched documents.
 *
 * @param scoreDocs
 * @param search
 * @return
 * @throws CorruptIndexException
 * @throws IOException
 */
public List<Long> getIds(ScoreDoc[] scoreDocs, IndexSearcher search)
        throws CorruptIndexException, IOException {
    List<Long> ids = new ArrayList<Long>(scoreDocs.length);
    Map<String, FieldSelectorResult> fieldSelections = new HashMap<String, FieldSelectorResult>(1);
    fieldSelections.put("id", FieldSelectorResult.LOAD);
    FieldSelector fieldSelector = new MapFieldSelector(fieldSelections);

    // Collect the ids, loading only the "id" field of each document
    for (int i = 0; i < scoreDocs.length; i++) {
        Document doc = search.doc(scoreDocs[i].doc, fieldSelector);
        ids.add(Long.valueOf(doc.getFieldable("id").stringValue()));
    }
    return ids;
}

The key point above is that only the id field is loaded when fetching each document and the other fields are skipped, which makes retrieval much faster; the larger the documents, the bigger the win.

Another note: in performance tests, Windows and Linux differ a great deal, mainly because the default on Windows uses neither concurrent I/O nor memory mapping. The relevant Lucene source is:

/** Just like {@link #open(File)}, but allows you to
*  also specify a custom {@link LockFactory}. */
public static FSDirectory open(File path, LockFactory lockFactory) throws IOException {
if ((Constants.WINDOWS || Constants.SUN_OS || Constants.LINUX)
&& Constants.JRE_IS_64BIT && MMapDirectory.UNMAP_SUPPORTED) {
return new MMapDirectory(path, lockFactory);
} else if (Constants.WINDOWS) {
return new SimpleFSDirectory(path, lockFactory);
} else {
return new NIOFSDirectory(path, lockFactory);
}
}

So the performance gains only really show up when testing on 64-bit Linux.

Reposted from: http://xmgestapo.iteye.com/blog/1217606

Posted in lucene

Lucene 3.5: deleting from the index

Lucene offers two ways to delete indexed documents: delete a single document by its documentId, or delete all documents matching a Term.

public class MyDeleteIndexes {
	public static final String STORE_PATH = "lucene_index";
	public static void deleteIndexes(String field , String keyword) throws IOException{
		long startTime = System.currentTimeMillis();
		Directory dir = FSDirectory.open(new File(STORE_PATH));
		IndexReader reader = IndexReader.open(dir,false);
		Term term = new Term(field,keyword);
		reader.deleteDocuments(term);
		// A document can also be deleted by its documentId:
		//reader.deleteDocument(1);
		reader.flush();
		reader.close();
		//System.out.println(reader.lastModified(dir));
		long endTime = System.currentTimeMillis();
		System.out.println("total time: " + (endTime - startTime) + " ms");
	}
}

Posted in lucene

Using Lucene term vectors to speed up keyword highlighting

Reposted from "Lucene TermVector usage: related search and faster highlighting": hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html
public class TermVectorTest {
    Analyzer analyzer = new SimpleAnalyzer();
    Directory ramDir = new RAMDirectory();

    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        Document doc1 = new Document();
        doc1.add(new Field("title", "java", Store.YES, Index.ANALYZED));
        doc1.add(new Field("author", "callan", Store.YES, Index.ANALYZED));
        doc1.add(new Field("subject", "学java,用java的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        Document doc2 = new Document();
        doc2.add(new Field("title", "english", Store.YES, Index.ANALYZED));
        doc2.add(new Field("author", "wcq", Store.YES, Index.ANALYZED));
        doc2.add(new Field("subject", "薄荷味23让他", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        Document doc3 = new Document();
        doc3.add(new Field("title", "asp", Store.YES, Index.ANALYZED));
        doc3.add(new Field("author", "ca", Store.YES, Index.ANALYZED));
        doc3.add(new Field("subject", "说都给我一 23523条去", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc1);
        writer.addDocument(doc2);
        writer.addDocument(doc3);
        writer.optimize();
        writer.close();
    }

    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        IndexSearcher searcher = new IndexSearcher(reader);
        Term term = new Term("title", "java");   // look for the term "java" in the title field
        TermQuery query = new TermQuery(term);
        Hits hits = searcher.search(query);
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            System.out.println(doc.get("title"));
            System.out.println(doc.get("subject"));
            System.out.println("moreLike search: ");
            morelikeSearch(reader, hits.id(i));
        }
    }

    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // Fetch the term vector of the "subject" field for this document id: the field's terms
        // after analysis, together with their frequencies, positions, and related information
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < vector.size(); i++) {
            TermQuery tq = new TermQuery(new Term("subject",
                    vector.getTerms()[i]));   // one TermQuery per token stored in the vector
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        IndexSearcher searcher = new IndexSearcher(ramDir);
        Hits hits = searcher.search(query);
        // result display code omitted
    }

    // Using term vectors to speed up highlighting
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        IndexSearcher searcher = new IndexSearcher(reader);
        TermQuery query = new TermQuery(new Term("subject", "java"));
        Hits hits = searcher.search(query);
        // Highlighting setup
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
        // 100 is the length of the context returned around the keyword; set it yourself,
        // since the whole body text is never returned
        highlighter.setTextFragmenter(new SimpleFragmenter(100));
        for (int i = 0; i < hits.length(); i++) {
            Document doc = hits.doc(i);
            TermPositionVector termFreqVector = (TermPositionVector) reader.getTermFreqVector(hits.id(i), "subject");
            TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");
            TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector);
            String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));
            System.out.println(doc.get("title"));
            System.out.println(result);
        }
    }

    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }
}

Posted in java, lucene

Learning Lucene's Term Vectors

A term vector treats a text field of a document (title, body, and so on) as a multi-dimensional vector space of term frequencies: each term is one dimension, and the value along that dimension is the term's frequency in the field. A term vector can also store the positions and offsets of the terms within the field after analysis.

To use term vectors, enable the term-vector option on the field when the index is created.

The Field term-vector options (a small field-creation sketch follows this list):

TermVector.YES: record the unique terms that occurred, and their counts, in each document, but do not store any positions or offsets information.

TermVector.WITH_POSITIONS: record the unique terms and their counts, and also the positions of each occurrence of every term, but no offsets.

TermVector.WITH_OFFSETS: record the unique terms and their counts, with the offsets (start & end character position) of each occurrence of every term, but no positions.

TermVector.WITH_POSITIONS_OFFSETS: store unique terms and their counts, along with positions and offsets.

TermVector.NO: do not store any term vector information.
If Index.NO is specified for a field, then you must also specify TermVector.NO.
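A minimal sketch (the field name and text are assumptions) of turning one of these options on when the field is created:

Document doc = new Document();
// Store positions and offsets for the "subject" field so they can be reused later,
// for example to speed up highlighting
doc.add(new Field("subject", "学java,用java的人很多",
        Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));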

 

Once an index is built this way, the term vector can be retrieved through IndexReader using the document id and the field name:
TermFreqVector termFreqVector = reader.getTermFreqVector(id, "FieldName");
Iterating this TermFreqVector yields each term and its frequency; if offsets and positions were stored in the index, they can be read as well (see the sketch below), which shortens the time needed to highlight keywords.
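A minimal sketch (the "subject" field name is an assumption) of walking a term vector: terms and frequencies come straight from the TermFreqVector, and offsets are available when the field was indexed WITH_OFFSETS or WITH_POSITIONS_OFFSETS, in which case the vector is a TermPositionVector:

TermFreqVector vector = reader.getTermFreqVector(id, "subject");
String[] terms = vector.getTerms();
int[] freqs = vector.getTermFrequencies();
for (int i = 0; i < terms.length; i++) {
    System.out.println(terms[i] + " -> " + freqs[i]);
    // Offsets are only present if the field stored them
    if (vector instanceof TermPositionVector) {
        TermVectorOffsetInfo[] offsets = ((TermPositionVector) vector).getOffsets(i);
        if (offsets != null) {
            for (TermVectorOffsetInfo o : offsets) {
                System.out.println("  offset " + o.getStartOffset() + "-" + o.getEndOffset());
            }
        }
    }
}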

Term vectors enable many features:

1) For example, comparing the similarity of two documents. Abstract a book as a document with author and subject fields, and compare two books through those two fields.
author is a multi-valued field (a book can have several authors), so the first step is to check whether the authors match:

String[] authors = doc.getValues("author");
BooleanQuery authorQuery = new BooleanQuery(); // #3
for (int i = 0; i < authors.length; i++) { // #3
    String author = authors[i]; // #3
    authorQuery.add(new TermQuery(new Term("author", author)), BooleanClause.Occur.SHOULD); // #3
}
authorQuery.setBoost(2.0f);

The boost of this query is raised at the end to mark the condition as important (a higher weight): if the authors match, the books are already quite similar.

The second step is where the term vector comes in. Here it is used very simply: just check whether the terms in the subject field's term vector also occur in the other document's subject:

TermFreqVector vector = // #4
        reader.getTermFreqVector(id, "subject"); // #4
BooleanQuery subjectQuery = new BooleanQuery(); // #4
for (int j = 0; j < vector.size(); j++) { // #4
    TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[j]));
    subjectQuery.add(tq, BooleanClause.Occur.SHOULD); // #4
}

2) What category?
This is a step up from the previous example: how to classify documents, again using the subject field's term vector.
For two documents we can compare the angle between their term vectors in the vector space; the smaller the angle, the more similar the documents.
Since this is classification, there is a training step: we must build one term vector per category as the reference against which other documents are compared.
A map of (term, frequency) entries represents a term vector, with n entries standing for n dimensions; another map links each category to its term vector. To create a category's term vector, iterate over every document in the category, take its term vector, and add it onto the category's term vector:
private void addTermFreqToMap(Map vectorMap, TermFreqVector termFreqVector) {
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();
    for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        if (vectorMap.containsKey(term)) {
            Integer value = (Integer) vectorMap.get(term);
            vectorMap.put(term, new Integer(value.intValue() + freqs[i]));
        } else {
            vectorMap.put(term, new Integer(freqs[i]));
        }
    }
}
First pull the term and frequency arrays out of the document's term vector, then for each term add the document's frequency onto the category's entry. Done.

With a term vector for each category, we can compute the angle between a document and a category:
cos θ = (A · B) / (|A| |B|)
A · B is the dot product: multiply the two vectors dimension by dimension and sum the results.
To simplify the computation here, assume a document's term frequencies are only 0 or 1, i.e. a term either occurs or it does not:
private double computeAngle(String[] words, String category) {
    // assume words are unique and only occur once
    Map vectorMap = (Map) categoryMap.get(category);
    int dotProduct = 0;
    int sumOfSquares = 0;
    for (int i = 0; i < words.length; i++) {
        String word = words[i];
        int categoryWordFreq = 0;
        if (vectorMap.containsKey(word)) {
            categoryWordFreq = ((Integer) vectorMap.get(word)).intValue();
        }
        dotProduct += categoryWordFreq; // optimized because we assume frequency in words is 1
        sumOfSquares += categoryWordFreq * categoryWordFreq;
    }
    double denominator;
    if (sumOfSquares == words.length) {
        // avoid precision issues for special case
        denominator = sumOfSquares; // sqrt x * sqrt x = x
    } else {
        denominator = Math.sqrt(sumOfSquares) *
                Math.sqrt(words.length);
    }
    double ratio = dotProduct / denominator;
    return Math.acos(ratio);
}
This function simply implements the formula above; it is fairly straightforward.

 

3) MoreLikeThis

For finding similar documents, Lucene also provides a fairly efficient interface: MoreLikeThis.

http://lucene.apache.org/java/1_9_1/api/org/apache/lucene/search/similar/MoreLikeThis.html

With the approach above we could compute the cosine between every pair of documents and sort by it to find the most similar ones, but the computation is enormous and becomes unacceptable once the number of documents is large. There are dedicated optimizations of the cosine method that reduce the cost dramatically; that route is precise, but the bar is high.

The idea behind this interface is simple: from a document, extract the interesting terms (the ones with high tf×idf), then search with Lucene for documents containing the same terms and treat them as similar. The advantage is efficiency; the drawback is reduced precision. The interface exposes many parameters for tuning how the interesting terms are chosen.

MoreLikeThis mlt = new MoreLikeThis(ir);

// orig source of doc you want to find similarities to
Reader target = ...

Query query = mlt.like(target);

Hits hits = is.search(query);

Usage is that simple, and this is how the similar documents are obtained.

 

The interface is quite flexible: instead of calling like() directly, you can call

retrieveInterestingTerms(Reader r)

to obtain the interesting terms yourself and then process them however you need (a minimal sketch follows).
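A minimal sketch (the field name, thresholds, and sample text are assumptions; depending on the Lucene version retrieveInterestingTerms may also take the field name as a second argument):

MoreLikeThis mlt = new MoreLikeThis(ir);
mlt.setFieldNames(new String[] { "subject" });  // which fields to mine for terms
mlt.setMinTermFreq(1);                          // keep terms even if they occur only once
mlt.setMinDocFreq(1);                           // ...and even if only one document contains them
String[] interesting = mlt.retrieveInterestingTerms(new StringReader("学java,用java的人很多"));
for (String t : interesting) {
    System.out.println(t);
}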
