Lucene MoreLikeThisQuery 例子

rabbit9898

浏览: 147444 次
性别:
来自: 北京

最近访客更多访客>>

adnapllits

i4u

fanci

AILIKES

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

lucene

lucene Apache PHP maven Java

要做一个“与文章标题相关的新闻”功能，本来想简单处理：把标题分词、去除停用词，再做一个布尔查询。朋友建议试试 Lucene 自带的 MoreLikeThisQuery，试了一下，觉得效果还可以，贴上示例代码（MoreLikeThisQuery 位于 contrib 下的 queries 模块）：

pom文件：

<!-- Maven build for the MoreLikeThisQuery example: pulls in lucene-core and the
     lucene-queries artifact, and compiles with Java 1.6 source/target, UTF-8 sources. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>lucene-test</groupId>
	<artifactId>lucene-test</artifactId>
	<version>0.1-SNAPSHOT</version>
	<name>lucene-test</name>
	<dependencies>

		<!-- Lucene core library. -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>2.9.2</version>  <!-- author's note: "3.0.0" (presumably an alternative version; TODO confirm) -->
		</dependency>

		<!-- Second Lucene artifact; the Java example imports
		     org.apache.lucene.search.similar.MoreLikeThisQuery (presumably provided here; TODO confirm). -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queries</artifactId>
			<version>2.9.2</version>  <!-- author's note: "3.0.0 lucene-queries-2.9.2-dev.jar" (meaning not stated; TODO confirm) -->
		</dependency>


	</dependencies>
	<build>
		<plugins>
			<!-- Compile for Java 1.6 and read source files as UTF-8. -->
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<configuration>
					<source>1.6</source>
					<target>1.6</target>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
		</plugins>
	</build>
</project>

Java文件：

package lucene.test;
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;

//ref doc: 
//http://www.iteye.com/topic/586043
//http://www.cnblogs.com/forfuture1978/archive/2010/05/19/1738803.html
//http://www.javadocexamples.com/java_source/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java.html
	
/**
 * Small command-line demo of Lucene's {@code MoreLikeThisQuery}.
 *
 * <p>Depending on the {@code isIndex} flag in {@link #main(String[])}, the program either
 * writes six sample job postings into an index under {@code ./Index}, or searches that index
 * for postings whose {@code Name} field is similar to a free-text phrase and prints each hit
 * with its score.
 */
public class LuceneTestLike {

    /** Location of the index, relative to the working directory. */
    private static final String INDEX_PATH = "./Index";

    /** Maximum number of hits requested from the searcher. */
    private static final int MAX_HITS = 10;

    /** Sample job postings, one row per document: {Name, Info, Time}. */
    private static final String[][] SAMPLE_DOCS = {
        {"java 开发人员", "招聘 网站开发人员,要求一年或以上工作经验", "20100201"},
        {"高级开发人员(java 方向)", "需要有四年或者以上的工作经验,有大型项目实践,java基本扎实", "20100131"},
        {"php 开发工程师", "主要是维护公司的网站php开发,能独立完成网站的功能", "20100201"},
        {"linux 管理员", "管理及维护公司的linux服务器,职责包括完成mysql数据备份及日常管理,apache的性能调优等", "20100201"},
        {"lucene开发工程师", "需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实", "20100131"},
        {"php 软件工程师", "具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳", "20100130"},
    };

    /**
     * Entry point. Builds the index or runs the similarity search, depending on the
     * hard-coded {@code isIndex} flag. Any exception is printed to standard error.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);

            boolean isIndex = false;    // true: build the index; false: run the search

            if (isIndex) {
                buildIndex(analyzer);
            } else {
                searchSimilar(analyzer);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Writes every row of {@link #SAMPLE_DOCS} into the index at {@link #INDEX_PATH}.
     * The writer and the directory are closed even if adding a document fails.
     *
     * @param analyzer analyzer used to tokenize the analyzed fields
     * @throws IOException if the index cannot be opened or written
     */
    private static void buildIndex(StandardAnalyzer analyzer) throws IOException {
        NIOFSDirectory directory = new NIOFSDirectory(new File(INDEX_PATH));
        try {
            IndexWriter writer = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
            try {
                for (String[] row : SAMPLE_DOCS) {
                    writer.addDocument(newJobDocument(row[0], row[1], row[2]));
                }
            } finally {
                writer.close();
            }
        } finally {
            directory.close();
        }
        System.out.println("数据索引完成");
    }

    /**
     * Builds one job-posting document. {@code Name} and {@code Info} are stored and analyzed;
     * {@code Time} is stored but indexed as a single un-analyzed term.
     *
     * @param name job title
     * @param info job description
     * @param time posting date as a {@code yyyyMMdd} string
     * @return the populated document
     */
    private static Document newJobDocument(String name, String info, String time) {
        Document doc = new Document();
        doc.add(new Field("Name", name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("Info", info, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("Time", time, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Runs a {@code MoreLikeThisQuery} against the {@code Name} field and prints the hits,
     * followed by the elapsed time. The searcher and the directory are closed even if the
     * search fails.
     *
     * @param analyzer analyzer handed to the query for tokenizing the "like" text
     * @throws IOException if the index cannot be opened or read
     */
    private static void searchSimilar(StandardAnalyzer analyzer) throws IOException {
        NIOFSDirectory directory = new NIOFSDirectory(new File(INDEX_PATH));
        try {
            IndexSearcher searcher = new IndexSearcher(directory, true);
            try {
                // Other phrases tried by the author: "lucene java", "开发工程师".
                String likeText = "php 开发工程师";

                // Alternative tried by the author: { "Name", "Info" }.
                String[] moreLikeFields = { "Name" };
                MoreLikeThisQuery query = new MoreLikeThisQuery(likeText, moreLikeFields, analyzer);
                // Stop words could be configured via query.setStopWords(...); not done here.

                // Terms occurring fewer times than this in the source text are ignored.
                query.setMinTermFrequency(1);

                // Upper bound on the number of terms in the generated query.
                query.setMaxQueryTerms(5);

                // Ignore words that do not occur in at least this many documents
                // (the original notes DEFAULT_MIN_DOC_FREQ = 5).
                query.setMinDocFreq(1);

                System.out.println("搜索条件:" + query.toString());

                long start = System.currentTimeMillis();
                TopDocs topDocs = searcher.search(query, MAX_HITS);

                for (ScoreDoc hit : topDocs.scoreDocs) {
                    int docId = hit.doc;
                    // NOTE(review): the original author states that from Lucene 3.0 the score
                    // can no longer be read via ScoreDoc.score; unverified here. The score is
                    // therefore taken from the Explanation, as in the original example.
                    Explanation explanation = searcher.explain(query, docId);

                    Document doc = searcher.doc(docId);
                    String name = doc.get("Name");
                    String info = doc.get("Info");
                    String time = doc.get("Time");

                    float score = explanation.getValue();
                    // Print explanation.toString() to see the detailed score breakdown.

                    System.out.println("DocId:" + docId + "\tScore:" + score + "\tName:" + name
                            + "\tTime:" + time + "\tInfo:" + info);
                }
                long end = System.currentTimeMillis();
                System.out.println("搜索用时:" + (end - start) + "ms");
            } finally {
                searcher.close();
            }
        } finally {
            directory.close();
        }
    }

}

运行结果：

搜索条件:like:php 开发工程师
DocId:2 Score:1.1971036 Name:php 开发工程师 Time:20100201 Info:主要是维护公司的网站php开发,能独立完成网站的功能
DocId:5 Score:0.82631415 Name:php 软件工程师 Time:20100130 Info:具有大量的php开发经验,如熟悉 java 开发,数据库管理则更佳
DocId:4 Score:0.6882751 Name:lucene开发工程师 Time:20100131 Info:需要两年或者以上的从事lucene java 开发工作的经验,需要对算法,排序规则等有相关经验,java水平及基础要扎实
DocId:0 Score:0.038315877 Name:java 开发人员 Time:20100201 Info:招聘 网站开发人员,要求一年或以上工作经验
DocId:1 Score:0.027368484 Name:高级开发人员(java 方向) Time:20100131 Info:需要有四年或者以上的工作经验,有大型项目实践,java基本扎实
搜索用时:47ms

1
顶

0
踩

分享到：

Lucene MoreLikeThisQuery 例子备注 | lucene 3.0 分词例子转载

2011-01-04 13:42
浏览 2792
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论