`

在Lucene中应用paoding进行分词

阅读更多

1、下载paoding(庖丁解牛)
http://code.google.com/p/paoding/downloads/list
2、设置系统环境变量PAODING_DIC_HOME指向词典安装目录
如:PAODING_DIC_HOME=z:\data\paoding\dic
3、解压paoding-analysis-2.0.4-beta,解压后再用压缩软件打开paoding-analysis.jar文件,单个解压出paoding-analysis.properties文件,在paoding-analysis.properties文件中添加或修改paoding.dic.home的值。并将paoding-analysis-2.0.4-beta解压包中的dic下的文件拷到PAODING_DIC_HOME目录下
如:paoding.dic.home=z:\data\paoding\dic
然后再将修改后的文件放回paoding-analysis.jar包
4、创建一个词库目录,目录必须跟paoding.dic.home的值一致,如:
paoding.dic.home=z:\data\paoding\dic
再创建索引目录(需与测试代码中使用的索引路径一致),
如:z:\data\paoding\test_index
5、创建一个java project,将paoding-analysis-2.0.4-beta根目录下的包引入,注意,此时引入的paoding-analysis.jar包应是你修改过的包文件。
6、测试代码:

package com.paoding.index;   
  
  
import java.io.IOException;   
  
import net.paoding.analysis.analyzer.PaodingAnalyzer;   
  
import org.apache.lucene.analysis.Analyzer;   
import org.apache.lucene.analysis.TokenStream;   
import org.apache.lucene.document.Document;   
import org.apache.lucene.document.Field;   
import org.apache.lucene.index.CorruptIndexException;   
import org.apache.lucene.index.IndexReader;   
import org.apache.lucene.index.IndexWriter;   
import org.apache.lucene.index.TermPositionVector;   
import org.apache.lucene.queryParser.ParseException;   
import org.apache.lucene.queryParser.QueryParser;   
import org.apache.lucene.search.Hits;   
import org.apache.lucene.search.IndexSearcher;   
import org.apache.lucene.search.Query;   
import org.apache.lucene.search.Searcher;   
import org.apache.lucene.search.highlight.Formatter;   
import org.apache.lucene.search.highlight.Highlighter;   
import org.apache.lucene.search.highlight.QueryScorer;   
import org.apache.lucene.search.highlight.TokenGroup;   
import org.apache.lucene.search.highlight.TokenSources;   
import org.apache.lucene.store.LockObtainFailedException;   
/**
 * Minimal end-to-end demo of the Paoding Chinese analyzer with Lucene:
 * builds a one-document index, searches it, and prints the stored text
 * with the matching terms wrapped in {@code <b>} tags.
 */
public class Index {

    /** Directory on disk where the demo index is created (any existing index is overwritten). */
    private static final String INDEX_PATH = "Z:/data/paoding/test_index";

    /** Name of the single field that is indexed, searched and highlighted. */
    private static final String FIELD_NAME = "content";

    /**
     * Runs the demo: index one document, then search and highlight it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Obtain the Paoding Chinese analyzer; the same instance is used for indexing and querying.
        Analyzer analyzer = new PaodingAnalyzer();
        try {
            buildIndex(analyzer);
            System.out.println("Indexed success!");
            searchAndHighlight(analyzer, "书法");
        } catch (CorruptIndexException e) {
            System.err.println("Index at " + INDEX_PATH + " is corrupt");
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            System.err.println("Could not obtain the write lock for " + INDEX_PATH);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("I/O error while accessing the index at " + INDEX_PATH);
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("Could not parse the search query");
            e.printStackTrace();
        }
    }

    /**
     * Creates the index at {@link #INDEX_PATH} and adds the single sample document.
     *
     * @param analyzer analyzer used to tokenize the document text
     * @throws IOException if the index cannot be created or written
     */
    private static void buildIndex(Analyzer analyzer) throws IOException {
        IndexWriter writer = new IndexWriter(INDEX_PATH, analyzer, true);
        try {
            Document doc = new Document();
            // Positions and offsets are stored in the term vector because the
            // highlighter below rebuilds its token stream from that vector.
            Field field = new Field(FIELD_NAME, "书法和国的书!", Field.Store.YES,
                    Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
            doc.add(field);
            writer.addDocument(doc);
        } finally {
            // Close even when adding the document fails, so the writer is not left open.
            writer.close();
        }
    }

    /**
     * Searches the index for {@code queryText} and prints the highlighted best
     * fragments of the first hit. Prints {@code hits.length=0} and returns when
     * nothing matches.
     *
     * @param analyzer  analyzer used to parse the query
     * @param queryText query string to search for in the {@link #FIELD_NAME} field
     * @throws IOException    if the index cannot be read
     * @throws ParseException if {@code queryText} is not a valid query
     */
    private static void searchAndHighlight(Analyzer analyzer, String queryText)
            throws IOException, ParseException {
        IndexReader reader = IndexReader.open(INDEX_PATH);
        try {
            QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
            Query query = parser.parse(queryText);
            Searcher searcher = new IndexSearcher(reader);
            Hits hits = searcher.search(query);
            if (hits.length() == 0) {
                System.out.println("hits.length=0");
                // Nothing to highlight; asking for hit 0 would fail.
                return;
            }

            Document hitDoc = hits.doc(0);
            String text = hitDoc.get(FIELD_NAME);
            // Fetch the term vector of the document that was actually hit,
            // rather than assuming it has document id 0.
            TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
                    hits.id(0), FIELD_NAME);
            TokenStream ts = TokenSources.getTokenStream(tpv);

            // Wrap only token groups that contributed to the score in <b> tags.
            Formatter formatter = new Formatter() {
                public String highlightTerm(String srcText, TokenGroup g) {
                    if (g.getTotalScore() <= 0) {
                        return srcText;
                    }
                    return "<b>" + srcText + "</b>";
                }
            };
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            String result = highlighter.getBestFragments(ts, text, 5, "…");
            System.out.println("result:\n\t" + result);
        } finally {
            // Close on every exit path, including the early return and exceptions.
            reader.close();
        }
    }

}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics