Lucene Full-Text Search: Handling One-to-Many Deduplication


When building a feature such as "question and answer", searching by answer means several hits can map back to the same question, so the results have to be deduplicated ---> http://www.iteye.com/problems/56869
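To make the problem concrete, below is a minimal indexing sketch (illustrative only: the field names questionId and answer, the class name, and the Lucene 3.0-era API are my assumptions, not part of the original post). Each answer is indexed as its own document that carries the id of its question, so a search on the answer text can return several hits that all belong to the same question.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// Hypothetical illustration only -- not part of the original solution.
public class DuplicateIndexSketch {
	public static void main(String[] args) throws Exception {
		RAMDirectory directory = new RAMDirectory();
		IndexWriter writer = new IndexWriter(directory,
				new StandardAnalyzer(Version.LUCENE_30),
				IndexWriter.MaxFieldLength.UNLIMITED);

		// one question, two answers: each answer becomes its own document,
		// and both documents carry the same questionId value
		String questionId = "Q-1001";
		String[] answers = { "use a ThreadLocal to hold state", "extend DuplicateFilter" };
		for (String answer : answers) {
			Document doc = new Document();
			doc.add(new Field("questionId", questionId,
					Field.Store.YES, Field.Index.NOT_ANALYZED));
			doc.add(new Field("answer", answer,
					Field.Store.YES, Field.Index.ANALYZED));
			writer.addDocument(doc);
		}
		writer.close();
		// a query that matches both answers now returns two hits pointing to
		// the same question -- exactly the duplication the classes below remove
	}
}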

 

 

Solution:

1. Write a thread-scoped manager that stores the duplicate documents found by the current query.

 

/*
 * Copyright (c)  www.fdauto.com
 */
package com.fdauto.bws.business.module.lucene.duplicate;

import java.util.ArrayList;
import java.util.List;

/**
 * Manages a List shared within a single thread (via ThreadLocal).
 * 
 * @author 09817(wu_quanyin)
 * @date 2011-01-06 17:02:44
 * @version 1.0
 */
public class DuplicateManager {

	private static ThreadLocal<List<DuplicateModel>> 
					threadLocalList = new ThreadLocal<List<DuplicateModel>>() {
		// initialValue() runs only once per thread, no matter how often get() is called
		protected synchronized List<DuplicateModel> initialValue() {
			return new ArrayList<DuplicateModel>();
		}
	};

	public static List<DuplicateModel> getCurrentThreadList() {
		return threadLocalList.get();
	}
	/** Clears the list stored for the current thread */
	public static void removeCurrentThreadListValue(){
		threadLocalList.remove();
	}

	public static void main(String[] args) throws Exception {
		
		Thread thread1=new Thread(){
			public void run(){
				DuplicateManager.getCurrentThreadList().add(new DuplicateModel());
				System.out.println(DuplicateManager.getCurrentThreadList().size());
				DuplicateManager.removeCurrentThreadListValue();
				DuplicateManager.removeCurrentThreadListValue();
				System.out.println("The current thread's list has been cleared. "+DuplicateManager.getCurrentThreadList().size());
			}
		};
		
		
		Thread thread2=new Thread(){
			public void run(){
				System.out.println(DuplicateManager.getCurrentThreadList().size());
			}
		};
		
		thread1.start();
		Thread.sleep(1000L);
		thread2.start();
	
		System.out.println("-------------------------------");
		Runnable runnable1=new Runnable(){
			public void run(){
				DuplicateManager.getCurrentThreadList().add(new DuplicateModel());
				System.out.println(DuplicateManager.getCurrentThreadList().size());
			}
		};
		
		Thread thread3=new Thread(runnable1);
		Thread thread4=new Thread(runnable1);
		
		thread3.start();
		Thread.sleep(1000L);
		thread4.start();
	}
	
	
}

2. Define a model that stores the docs sharing the same field value, with a flag so each group appears only once.

 

/*
 * Copyright (c)  www.fdauto.com
 */
package com.fdauto.bws.business.module.lucene.duplicate;

import java.util.ArrayList;
import java.util.List;

/**
 * Holds a group of documents whose field shares the same value.
 * 
 * @author 09817(wu_quanyin)
 * @date 2011-01-06 17:02:44
 * @version 1.0
 */
public class DuplicateModel {

	
	private int mainDoc;
	// marks whether this duplicate group has already been matched; once checked it is skipped
	private boolean isChecked=false;
	private List<Integer> assistantDocs=new ArrayList<Integer>();
	
	public DuplicateModel(){}
	public DuplicateModel(int mainDoc){
		this.mainDoc=mainDoc;
	}
	
	public int getMainDoc() {
		return mainDoc;
	}
	public void setMainDoc(int mainDoc) {
		this.mainDoc = mainDoc;
	}
	public boolean isChecked() {
		return isChecked;
	}
	public void setChecked(boolean isChecked) {
		this.isChecked = isChecked;
	}
	public List<Integer> getAssistantDocs() {
		return assistantDocs;
	}
	public void setAssistantDocs(List<Integer> assistantDocs) {
		this.assistantDocs = assistantDocs;
	}
	
	
}

3. Adapt Lucene's DuplicateFilter logic into a custom Filter.

 

 

/*
 * Copyright (c)  www.fdauto.com
 */
package com.fdauto.bws.business.module.lucene.duplicate;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;

/**
 * Groups the documents of each duplicated term and stores the groups in the
 * thread-local list so that DuplicateQuery can compare against them later.
 * 
 * @author wu_quanyin(09817)
 * @version 1.0
 * @date 2011-01-06 10:54:01
 */
public class DuplicateExtendFilter extends Filter {

	private static final long serialVersionUID = 1601875802819099276L;

	String fieldName;

	public DuplicateExtendFilter(String fieldName) {
		this.fieldName = fieldName;
	}

	@Override
	public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
		return correctBits(reader);
	}

	private OpenBitSet correctBits(IndexReader reader) throws IOException {

		OpenBitSet bits = new OpenBitSet(reader.maxDoc()); // assume all bits are invalid at first
		Term startTerm = new Term(fieldName);
		TermEnum te = reader.terms(startTerm);
		if (te != null) {
			Term currTerm = te.term();
			// term field names are interned, so == comparison is safe here
			while ((currTerm != null)
					&& (currTerm.field() == startTerm.field())) {
				int firstDoc = -1;

				// set non duplicates
				TermDocs td = reader.termDocs(currTerm);
				// the first doc of each term is always kept
				if (td.next()) {
					firstDoc = td.doc();
					bits.set(firstDoc);
				}

				// -------------- when a term has duplicates, record the whole group for later comparison
				DuplicateModel duplicateModel = new DuplicateModel();
				boolean isDuplicate = false;
				while (td.next()) {
					isDuplicate = true;
					duplicateModel.getAssistantDocs().add(td.doc());
				}
				if (isDuplicate) {
					duplicateModel.setMainDoc(firstDoc);
					duplicateModel.getAssistantDocs().add(firstDoc);
					DuplicateManager.getCurrentThreadList().add(duplicateModel);
				} else {
					duplicateModel = null;
				}
				// -----------------------

				if (!te.next()) {
					break;
				}
				currTerm = te.term();
			}
		}
		return bits;
	}

	public String getFieldName() {
		return fieldName;
	}

	public void setFieldName(String fieldName) {
		this.fieldName = fieldName;
	}

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if ((obj == null) || (obj.getClass() != this.getClass()))
			return false;
		DuplicateExtendFilter other = (DuplicateExtendFilter) obj;
		return (fieldName == other.fieldName || (fieldName != null && fieldName
				.equals(other.fieldName)));
	}

	@Override
	public int hashCode() {
		int hash = 217;
		hash = 31 * hash + fieldName.hashCode();
		return hash;
	}
	

}
 

 

4. Extend FilteredQuery and apply the deduplication logic inside the query.

 

/*
 * Copyright (c)  www.fdauto.com
 */
package com.fdauto.bws.business.module.lucene.duplicate;

import java.io.IOException;
import java.util.Iterator;
import java.util.Set;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ToStringUtils;

/**
 * Used together with DuplicateExtendFilter: walks the recorded duplicate
 * groups and lets each group through only once.
 * 
 * @author 09817(wu_quanyin)
 * @date 2010-12-25 09:02:44
 * @version 1.0
 */
public class DuplicateQuery extends Query {

	private static final long serialVersionUID = 4610299680666587077L;
	Query query;
	Filter filter;

	/**
	 * Constructs a new query which applies a filter to the results of the
	 * original query. Filter.getDocIdSet() will be called every time this query
	 * is used in a search.
	 * 
	 * @param query
	 *            Query to be filtered, cannot be <code>null</code>.
	 * @param filter
	 *            Filter to apply to query results, cannot be <code>null</code>.
	 */
	public DuplicateQuery(Query query, Filter filter) {
		this.query = query;
		this.filter = filter;
	}

	/**
	 * Returns a Weight that applies the filter to the enclosed query's Weight.
	 * This is accomplished by overriding the Scorer returned by the Weight.
	 */
	@Override
	public Weight createWeight(final Searcher searcher) throws IOException {
		final Weight weight = query.createWeight(searcher);
		final Similarity similarity = query.getSimilarity(searcher);
		return new Weight() {
			private static final long serialVersionUID = 3001781092877864947L;
			private float value;

			// pass these methods through to enclosed query's weight
			@Override
			public float getValue() {
				return value;
			}

			@Override
			public float sumOfSquaredWeights() throws IOException {
				return weight.sumOfSquaredWeights() * getBoost() * getBoost();
			}

			@Override
			public void normalize(float v) {
				weight.normalize(v);
				value = weight.getValue() * getBoost();
			}

			@Override
			public Explanation explain(IndexReader ir, int i)
					throws IOException {
				Explanation inner = weight.explain(ir, i);
				if (getBoost() != 1) {
					Explanation preBoost = inner;
					inner = new Explanation(inner.getValue() * getBoost(),
							"product of:");
					inner.addDetail(new Explanation(getBoost(), "boost"));
					inner.addDetail(preBoost);
				}
				Filter f = DuplicateQuery.this.filter;
				DocIdSet docIdSet = f.getDocIdSet(ir);
				DocIdSetIterator docIdSetIterator = docIdSet == null ? DocIdSet.EMPTY_DOCIDSET
						.iterator()
						: docIdSet.iterator();
				if (docIdSetIterator == null) {
					docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator();
				}
				if (docIdSetIterator.advance(i) == i) {
					return inner;
				} else {
					Explanation result = new Explanation(0.0f,
							"failure to match filter: " + f.toString());
					result.addDetail(inner);
					return result;
				}
			}

			// return this query
			@Override
			public Query getQuery() {
				return DuplicateQuery.this;
			}

			// return a filtering scorer
			@Override
			public Scorer scorer(IndexReader indexReader,
					boolean scoreDocsInOrder, boolean topScorer)
					throws IOException {
				final Scorer scorer = weight.scorer(indexReader, true, false);
				if (scorer == null) {
					return null;
				}
				DocIdSet docIdSet = filter.getDocIdSet(indexReader);
				if (docIdSet == null) {
					return null;
				}
				final DocIdSetIterator docIdSetIterator = docIdSet.iterator();

				if (docIdSetIterator == null) {
					return null;
				}

				return new Scorer(similarity) {

					private int doc = -1;

					private int advanceToCommon(int scorerDoc, int disiDoc)
							throws IOException {
						while (scorerDoc != disiDoc) {
							if (scorerDoc < disiDoc) {

								scorerDoc = scorer.advance(disiDoc);
							} else {
								disiDoc = docIdSetIterator.advance(scorerDoc);
							}

						}
						return scorerDoc;
					}

					@Override
					public int nextDoc() throws IOException {
						int scorerDoc = -1;
						
						// compare every candidate doc against the recorded duplicate groups
						while ((scorerDoc = scorer.nextDoc()) != NO_MORE_DOCS) {

							boolean ignoreDocument = false;
							
							for (Iterator<DuplicateModel> duplicateIterator = DuplicateManager
									.getCurrentThreadList().iterator(); duplicateIterator
									.hasNext();) {
								DuplicateModel duplicateModel = duplicateIterator.next();
								if(duplicateModel.getAssistantDocs()
											.contains(scorerDoc)){
									if(!duplicateModel.isChecked()){
										duplicateModel.setChecked(true);
										return scorerDoc;
									}else{
										// this group has already been returned once; skip the doc
										ignoreDocument=true;
										break;
									}
								}
								
							}
							
							// skip docs whose duplicate group has already been returned
							if (ignoreDocument) {
								continue;
							}

							return scorerDoc;

						}
						return NO_MORE_DOCS;

					}

					@Override
					public int docID() {
						return doc;
					}

					@Override
					public int advance(int target) throws IOException {
						int disiDoc, scorerDoc;
						return doc = (disiDoc = docIdSetIterator
								.advance(target)) != NO_MORE_DOCS
								&& (scorerDoc = scorer.advance(disiDoc)) != NO_MORE_DOCS
								&& advanceToCommon(scorerDoc, disiDoc) != NO_MORE_DOCS ? scorer
								.docID()
								: NO_MORE_DOCS;
					}

					@Override
					public float score() throws IOException {
						return getBoost() * scorer.score();
					}
				};
			}
		};
	}

	/** Rewrites the wrapped query. */
	@Override
	public Query rewrite(IndexReader reader) throws IOException {
		Query rewritten = query.rewrite(reader);
		if (rewritten != query) {
			DuplicateQuery clone = (DuplicateQuery) this.clone();
			clone.query = rewritten;
			return clone;
		} else {
			return this;
		}
	}

	public Query getQuery() {
		return query;
	}

	public Filter getFilter() {
		return filter;
	}

	// inherit javadoc
	@Override
	public void extractTerms(Set<Term> terms) {
		getQuery().extractTerms(terms);
	}

	/** Prints a user-readable version of this query. */
	@Override
	public String toString(String s) {
		StringBuilder buffer = new StringBuilder();
		buffer.append("filtered(");
		buffer.append(query.toString(s));
		buffer.append(")->");
		buffer.append(filter);
		buffer.append(ToStringUtils.boost(getBoost()));
		return buffer.toString();
	}

	/** Returns true iff <code>o</code> is equal to this. */
	@Override
	public boolean equals(Object o) {
		if (o instanceof DuplicateQuery) {
			DuplicateQuery fq = (DuplicateQuery) o;
			return (query.equals(fq.query) && filter.equals(fq.filter) && getBoost() == fq
					.getBoost());
		}
		return false;
	}

	/** Returns a hash code value for this object. */
	@Override
	public int hashCode() {
		return query.hashCode() ^ filter.hashCode()
				+ Float.floatToRawIntBits(getBoost());
	}
}
Usage:

DuplicateExtendFilter filter = new DuplicateExtendFilter(uniqueField);
DuplicateQuery duplicateQuery = new DuplicateQuery(query, filter);
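For context, here is a slightly fuller, hypothetical usage sketch (the default search field "answer", the unique field value, the helper class, and the Lucene 3.0-era API are my assumptions, not from the original post; it is placed in the same package as the classes above). It also clears the thread-local duplicate list once the search is done, since DuplicateExtendFilter appends a group to that list every time getDocIdSet is called and pooled threads would otherwise carry the state into the next search.

package com.fdauto.bws.business.module.lucene.duplicate;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

// Hypothetical helper: wraps a search with the duplicate-removing query and
// cleans up the per-thread duplicate list afterwards.
public class DuplicateSearchExample {

	public static TopDocs searchDeduplicated(Directory directory, String uniqueField,
			String queryText) throws Exception {
		IndexReader reader = IndexReader.open(directory, true);
		IndexSearcher searcher = new IndexSearcher(reader);
		try {
			Query query = new QueryParser(Version.LUCENE_30, "answer",
					new StandardAnalyzer(Version.LUCENE_30)).parse(queryText);

			DuplicateExtendFilter filter = new DuplicateExtendFilter(uniqueField);
			DuplicateQuery duplicateQuery = new DuplicateQuery(query, filter);

			// only one document per group of identical uniqueField values is returned
			return searcher.search(duplicateQuery, 10);
		} finally {
			// DuplicateExtendFilter stores its duplicate groups in a ThreadLocal;
			// clear it so a pooled thread does not leak the state into the next search
			DuplicateManager.removeCurrentThreadListValue();
			searcher.close();
			reader.close();
		}
	}
}

Clearing the list in a finally block is just one option; the important point is that the ThreadLocal state must not leak between searches.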
#5 skambc 2011-10-19
wu_quanyin wrote:
skambc wrote:
The DuplicateFilter that ships with Lucene doesn't seem very useful in practice: it only keeps one document per duplicated field value, so query results are lost. It really isn't much use.

Exactly, which is why I reworked it. With the setup above you can get the desired result, though it does cost some performance.

Hi, my QQ is 499530853. Add me if it's convenient; I've been using Lucene recently and would like to ask you a few things. Thanks.
#4 wu_quanyin 2011-10-19
skambc wrote:
The DuplicateFilter that ships with Lucene doesn't seem very useful in practice: it only keeps one document per duplicated field value, so query results are lost. It really isn't much use.

Exactly, which is why I reworked it. With the setup above you can get the desired result, though it does cost some performance.
#3 skambc 2011-10-19
The DuplicateFilter that ships with Lucene doesn't seem very useful in practice: it only keeps one document per duplicated field value, so query results are lost. It really isn't much use.
#2 wu_quanyin 2011-10-11
confident_f wrote:
From this article it's clear you have studied Lucene's internals in depth. About Lucene deduplication, I'd like to ask: what exactly does the dedup process look like? For example, when calling search(query, n), does it remove the duplicates among those n hits?
Thanks

To understand this, you can first read my earlier question:
http://www.iteye.com/problems/56869
#1 confident_f 2011-10-11
From this article it's clear you have studied Lucene's internals in depth. About Lucene deduplication, I'd like to ask: what exactly does the dedup process look like? For example, when calling search(query, n), does it remove the duplicates among those n hits?
Thanks
