`
strayly
  • 浏览: 93655 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

转载:利用 Lucene 求相似文档

 
阅读更多
/*
* MoreLikeThis.java
*
* Created on 2008年3月11日, 下午3:31
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/

package Similarity;
import java.util.*;
import java.io.*;
import java.lang.*;
import java.text.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
/**
*
* @author Administrator
*/
public final class MoreLikeThis {
public final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
public final int DEFAULT_MIN_TERM_FREQ = 2;
public final int DEFALT_MIN_DOC_FREQ = 5;
public final boolean DEFAULT_BOOST = false;
public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
public final int DEFAULT_MIN_WORD_LENGTH = 0;
public final int DEFAULT_MAX_WORD_LENGTH = 0;
public static final Hashtable DEFAULT_STOP_WORDS = null;
private Hashtable stopWords = DEFAULT_STOP_WORDS;
public final int DEFAULT_MAX_QUERY_TERMS = 25;
private Analyzer analyzer = DEFAULT_ANALYZER;
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
private int minDocFreq = DEFALT_MIN_DOC_FREQ;
private boolean boost = DEFAULT_BOOST;
private String[] fieldNames = DEFAULT_FIELD_NAMES;
private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
private org.apache.lucene.search.Similarity similarity = new DefaultSimilarity();
private IndexReader ir;
/** Creates a new instance of MoreLikeThis */
public MoreLikeThis(IndexReader ir) {
this.ir = ir;
}

public Analyzer GetAnalyzer() {
return analyzer;
}

public void SetAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}

public int GetMinTermFreq() {
return minTermFreq;
}

public void SetMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}

public int GetMinDocFreq() {
return minDocFreq;
}

public void SetMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}

public boolean IsBoost() {
return boost;
}

public void SetBoost(boolean boost) {
this.boost = boost;
}

public String[] GetFieldNames() {
return fieldNames;
}

public void SetFieldNames(String[] fieldNames) {
this.fieldNames = fieldNames;
}

public int GetMinWordLen() {
return minWordLen;
}

public void SetMinWordLen(int minWordLen) {
this.minWordLen = minWordLen;
}

public int GetMaxWordLen() {
return maxWordLen;
}

public void SetMaxWordLen(int maxWordLen) {
this.maxWordLen = maxWordLen;
}

public void SetStopWords(Hashtable stopWords) {
this.stopWords = stopWords;
}

public Hashtable GetStopWords() {
return stopWords;
}

public int GetMaxQueryTerms() {
return maxQueryTerms;
}

public void SetMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}

public int GetMaxNumTokensParsed() {
return maxNumTokensParsed;
}

public void SetMaxNumTokensParsed(int i) {
maxNumTokensParsed = i;
}

public Query Like(int docNum)
{
if (fieldNames == null)
{
Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
Iterator e = fields.iterator();
fieldNames = new String[fields.size()];
int index = 0;
while (e.hasNext())
fieldNames[index++] = (String) e.next();
}

return CreateQuery(RetrieveTerms(docNum));
}
public Query Like(File f)
{
try
{
if (fieldNames == null)
{
Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
Iterator e = fields.iterator();
fieldNames = new String[fields.size()];
int index = 0;
while (e.hasNext())
fieldNames[index++] = (String) e.next();
}
return Like(new FileInputStream(f.getName()));
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}
public Query Like(FileInputStream is_Renamed)
{
return Like(new InputStreamReader(is_Renamed));
}

public Query Like(Reader r)
{
return CreateQuery(RetrieveTerms(r));
}
private Query CreateQuery(PriorityQueue q)
{
BooleanQuery query = new BooleanQuery();
Object cur;
int qterms = 0;
float bestScore = 0;
if(q!=null)
{
while (((cur = q.pop()) != null))
{
Object[] ar = (Object[]) cur;
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

if (boost)
{
if (qterms == 0)
{
bestScore = ((Float)ar[2]).floatValue();
}
float myScore = ((Float)ar[2]).floatValue();

tq.setBoost(myScore / bestScore);
}

try
{
query.add(tq, BooleanClause.Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses ignore)
{
break;
}

qterms++;
if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
{
break;
}
}

return query;
}

private PriorityQueue CreateQueue(Dictionary words)
{
try
{
int numDocs = ir.numDocs();
FreqQ res = new FreqQ(words.size());

Enumeration it = words.keys();
while (it.hasMoreElements())
{
String word = (String) it.nextElement();
Object tmpW=words.get(word);
int tmpI=((Int32)tmpW).x;
if(tmpI==0)
{
tmpI=1;
}
int tf = tmpI;
if (minTermFreq > 0 && tf < minTermFreq)
{
continue;
}
String topField = fieldNames[0];
int docFreq = 0;
for (int i = 0; i < fieldNames.length; i++)
{
int freq = ir.docFreq(new Term(fieldNames[i], word));
topField = (freq > docFreq) ? fieldNames[i] : topField;
docFreq = (freq > docFreq) ? freq : docFreq;
}

if (minDocFreq > 0 && docFreq < minDocFreq)
{
continue;
}

if (docFreq == 0)
{
continue;
}

float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
res.insert(new Object[]{word, topField, (float) score, (float)idf, (long) docFreq, (long) tf});
return res;
}
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}

public String DescribeParams()
{
StringBuilder sb = new StringBuilder();
sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
sb.append("\t" + "minWordLen : " + minWordLen + "\n");
sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
sb.append("\t" + "fieldNames : \"");
String delim = "";
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
sb.append(delim).append(fieldName);
delim = ", ";
}
sb.append("\n");
sb.append("\t" + "boost : " + boost + "\n");
sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
return sb.toString();
}

private PriorityQueue RetrieveTerms(int docNum)
{
try
{
Dictionary termFreqMap = new Hashtable();
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
if (vector == null)
{
Document d = ir.document(docNum);
String[] text = d.getValues(fieldName);
if (text != null)
{
for (int j = 0; j < text.length; j++)
{
AddTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
}
}
}
else
{
AddTermFrequencies(termFreqMap, vector);
}
}

return CreateQueue(termFreqMap);
}
catch(IOException e)
{
System.out.println(e);
}
return null;
}

private void AddTermFrequencies(Dictionary termFreqMap, TermFreqVector vector)
{
String[] terms = vector.getTerms();
int[] freqs = vector.getTermFrequencies();
for (int j = 0; j < terms.length; j++)
{
String term = terms[j];

if (IsNoiseWord(term))
{
continue;
}
Int32 cnt = (Int32) termFreqMap.get(term);
if (cnt == null)
{
cnt = new Int32();
termFreqMap.put(term,cnt);
cnt.x = freqs[j];
}
else
{
cnt.x += freqs[j];
}
}
}

private void AddTermFrequencies(Reader r, Dictionary termFreqMap, String fieldName)
{
try{
TokenStream ts = analyzer.tokenStream(fieldName,r);
org.apache.lucene.analysis.Token token;
int tokenCount = 0;
while ((token = ts.next()) != null)
{
String word = token.termText();
tokenCount++;
if (tokenCount > maxNumTokensParsed)
{
break;
}
if (IsNoiseWord(word))
{
continue;
}

Int32 cnt = (Int32) termFreqMap.get(word);
if (cnt == null)
{
termFreqMap.put(word,new Int32());
}
else
{
cnt.x++;
}
}
}
catch(IOException e)
{
System.out.println(e);
}
}
   
private boolean IsNoiseWord(String term)
{
int len = term.length();
if (minWordLen > 0 && len < minWordLen)
{
return true;
}
if (maxWordLen > 0 && len > maxWordLen)
{
return true;
}
if (stopWords != null && stopWords.contains(term))
{
return true;
}
return false;
}

public PriorityQueue RetrieveTerms(Reader r)
{
Dictionary words = new Hashtable();
for (int i = 0; i < fieldNames.length; i++)
{
String fieldName = fieldNames[i];
AddTermFrequencies(r, words, fieldName);
}
return CreateQueue(words);
}

public String[] RetrieveInterestingTerms(StringReader r)
{
ArrayList al = new ArrayList(maxQueryTerms);
PriorityQueue pq = RetrieveTerms(r);
Object cur;
int lim = maxQueryTerms;
while (((cur = pq.pop()) != null) && lim-- > 0)
{
Object[] ar = (Object[]) cur;
al.add(ar[0]);
}
String[] res = new String[al.size()];
return (String[]) al.toArray();
}

private class FreqQ extends PriorityQueue
{
private FreqQ(int s)
{
super.initialize(s);
}

protected boolean lessThan(Object a, Object b)
{
Object[] aa = (Object[]) a;
Object[] bb = (Object[]) b;
float fa = ((Float) aa[2]).floatValue();
float fb = ((Float) bb[2]).floatValue();
return (float) fa > (float) fb;
}
}

private class Int32
{
int x;

Int32()
{
x = 1;
}
}
}

调用测试代码:
// Usage example: find the indexed documents most similar to a text file.
String indexName = "e:\\index\\indexForML"; // path of the index directory
String fn = "c:\\0.txt"; // path of the file to find similar documents for
IndexReader r = IndexReader.open(indexName);
try {
    MoreLikeThis mlt = new MoreLikeThis(r); // the IndexReader supplies the term statistics
    mlt.SetAnalyzer(new StandardAnalyzer()); // analyzer used to tokenize the input
    mlt.SetFieldNames(new String[]{"content"}); // fields to compare against

    // Build the query; any Reader subclass can supply the text.
    Query query = null;
    FileReader input = new FileReader(fn);
    try {
        query = mlt.Like(input);
    } finally {
        input.close();
    }

    IndexSearcher searcher = new IndexSearcher(indexName);
    try {
        Hits hits = searcher.search(query); // similar documents, best match first
        int len = hits.length();
        for (int i = 0; i < Math.min(25, len); i++) // print at most the first 25 hits
        {
            Document d = hits.doc(i);
            System.out.println("score : " + hits.score(i));
            System.out.println("filename : " + d.get("fullname")); // value of the field "fullname"
            System.out.println("type : " + d.get("ttype")); // value of the field "ttype"
            System.out.println("___________________________");
        }
    } finally {
        searcher.close(); // release the searcher even if the search fails
    }
} finally {
    r.close(); // close the index reader
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics