To use a custom Chinese tokenizer (e.g. jcseg) with Carrot2, follow these steps:
1. Download the carrot2 source code and import it into Eclipse
#git clone git://github.com/carrot2/carrot2.git
#cd carrot2
#ant -p
#ant eclipse
2. Import jcseg into Eclipse and add it as a dependency of the carrot2-util-text subproject.
3. modify org.carrot2.text.linguistic.DefaultTokenizerFactory.java
// Step 3: patch DefaultTokenizerFactory so that CHINESE_SIMPLIFIED is served by
// the jcseg-backed adapter (InokChineseTokenizerAdapter, defined in step 4)
// instead of the stock Lucene-based Chinese tokenizer.
private static EnumMap<LanguageCode, IFactory<ITokenizer>> createDefaultTokenizers() {
    EnumMap<LanguageCode, IFactory<ITokenizer>> map = Maps
            .newEnumMap(LanguageCode.class);
    // By default, we use our own tokenizer for all languages.
    IFactory<ITokenizer> whitespaceTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
            ExtendedWhitespaceTokenizer.class);
    // Factory producing the custom jcseg adapter for Chinese text.
    IFactory<ITokenizer> chineseTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
            InokChineseTokenizerAdapter.class);
    // Start with the whitespace tokenizer for every language...
    for (LanguageCode lc : LanguageCode.values()) {
        map.put(lc, whitespaceTokenizerFactory);
    }
    // Chinese and Thai are exceptions, we use adapters around tokenizers
    // from Lucene.
    map.put(LanguageCode.CHINESE_SIMPLIFIED, chineseTokenizerFactory);
    ..... // remainder of the original method body elided in the article
}
4. create new class org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter.java
package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.lionsoul.jcseg.analyzer.JcsegFilter;
import org.lionsoul.jcseg.analyzer.JcsegTokenizer;
import org.lionsoul.jcseg.core.ADictionary;
import org.lionsoul.jcseg.core.DictionaryFactory;
import org.lionsoul.jcseg.core.ISegment;
import org.lionsoul.jcseg.core.IWord;
import org.lionsoul.jcseg.core.JcsegException;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import org.lionsoul.jcseg.core.SegmentFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Adapter that exposes the jcseg Chinese segmenter through both the Lucene
 * {@link Tokenizer} API and Carrot2's {@link ITokenizer} contract, so it can be
 * plugged into {@code DefaultTokenizerFactory} for CHINESE_SIMPLIFIED (step 3).
 */
public class InokChineseTokenizerAdapter extends Tokenizer implements ITokenizer {

    private final static Logger logger = LoggerFactory
            .getLogger(InokChineseTokenizerAdapter.class);

    // jcseg segmenter doing the actual Chinese word splitting.
    private ISegment segmentor;
    private OffsetAttribute offsetAtt;
    private CharTermAttribute termAtt = null;
    // Scratch buffer; reset to the current term in nextToken().
    private final MutableCharArray tempCharSequence;

    /**
     * Creates the adapter with jcseg's default configuration and dictionary.
     * The superclass is seeded with an empty reader; real input arrives later
     * via {@link #reset(Reader)}.
     *
     * @throws JcsegException if jcseg cannot create the segmenter
     * @throws IOException    if the dictionary cannot be loaded
     */
    public InokChineseTokenizerAdapter() throws JcsegException, IOException {
        super(new StringReader(""));
        JcsegTaskConfig config = new JcsegTaskConfig();
        ADictionary dic = DictionaryFactory.createDefaultDictionary(config);
        this.tempCharSequence = new MutableCharArray(new char[0]);
        // Mode 1 = jcseg's "complex" segmentation mode — TODO confirm against
        // the jcseg 1.9.x SegmentFactory docs.
        segmentor = SegmentFactory.createJcseg(1, new Object[] { config, dic });
        segmentor.reset(input);
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
    }

    /**
     * Re-targets both this tokenizer and the underlying jcseg segmenter at a
     * new input reader.
     *
     * NOTE(review): Lucene 4.x replaced Tokenizer.reset(Reader) with
     * setReader(Reader); confirm this @Override actually compiles against the
     * Lucene version bundled with carrot2 3.10.
     */
    @Override
    public void reset(Reader reader) throws IOException {
        super.reset();
        segmentor.reset(reader);
    }

    /**
     * Carrot2-side token pump: advances the Lucene stream and maps the token
     * to a Carrot2 token-type flag.
     *
     * @return {@code TT_TERM} / {@code TT_PUNCTUATION}, or {@code TT_EOF} when
     *         the input is exhausted
     */
    @Override
    public short nextToken() throws IOException {
        final boolean hasNextToken = incrementToken();
        if (hasNextToken) {
            short flags = 0;
            final char[] image = termAtt.buffer();
            final int length = termAtt.length();
            tempCharSequence.reset(image, 0, length);
            // Single-character tokens are flagged as punctuation so Carrot2's
            // preprocessing drops them. NOTE(review): this also discards
            // meaningful single-character Chinese words — confirm intended.
            if (length == 1) {
                flags = ITokenizer.TT_PUNCTUATION;
            } else {
                flags = ITokenizer.TT_TERM;
            }
            return flags;
        }
        return ITokenizer.TT_EOF;
    }

    /** Copies the current term image into the caller-supplied buffer. */
    @Override
    public void setTermBuffer(MutableCharArray array) {
        array.reset(termAtt.buffer(), 0, termAtt.length());
    }

    /**
     * Lucene-side token pump: pulls the next word from jcseg and populates the
     * term and offset attributes.
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        IWord word = segmentor.next();
        if (word != null) {
            termAtt.append(word.getValue());
            // NOTE(review): assumes IWord.getLength() is the term length in
            // chars; if it differs from getValue().length() the term buffer is
            // truncated/padded — verify against the jcseg API.
            termAtt.setLength(word.getLength());
            offsetAtt.setOffset(word.getPosition(), word.getPosition()
                    + word.getLength());
            return true;
        } else {
            end();
            return false;
        }
    }
}
5. recompile and build jars in carrot2
#cd carrot2
a. modify build.xml to add jcseg jars
<!-- Jars placed on the test classpath. The "lib/jcseg-*.jar" include is the
     line added for jcseg (it is already matched by "lib/**/*.jar", but is kept
     explicit to document the dependency). -->
<patternset id="lib.test">
<include name="core/**/*.jar" />
<include name="lib/**/*.jar" />
<include name="lib/jcseg-*.jar" />
<exclude name="lib/org.slf4j/slf4j-nop*" />
<include name="applications/carrot2-dcs/**/*.jar" />
<include name="applications/carrot2-webapp/lib/*.jar" />
<include name="applications/carrot2-benchmarks/lib/*.jar" />
</patternset>
<!-- Jars bundled into the core distribution; "lib/jcseg-*.jar" is the
     jcseg addition (redundant with "lib/**/*.jar" but kept explicit). -->
<patternset id="lib.core">
<include name="lib/**/*.jar" />
<include name="core/carrot2-util-matrix/lib/*.jar" />
<include name="lib/jcseg-*.jar" />
<patternset refid="lib.core.excludes" />
</patternset>
<!-- Minimal core distribution. Unlike lib.core, there is no catch-all
     "lib/**/*.jar" here, so the explicit "lib/jcseg-*.jar" include is
     REQUIRED for jcseg to end up in the mini jar. -->
<patternset id="lib.core.mini">
<include name="lib/**/mahout-*.jar" />
<include name="lib/jcseg-*.jar" />
<include name="lib/**/mahout.LICENSE" />
<include name="lib/**/colt.LICENSE" />
<include name="lib/**/commons-lang*" />
<include name="lib/**/guava*" />
<include name="lib/**/jackson*" />
<include name="lib/**/lucene-snowball*" />
<include name="lib/**/lucene.LICENSE" />
<include name="lib/**/hppc-*.jar" />
<include name="lib/**/hppc*.LICENSE" />
<include name="lib/**/slf4j-api*.jar" />
<include name="lib/**/slf4j-nop*.jar" />
<include name="lib/**/slf4j.LICENSE" />
<include name="lib/**/attributes-binder-*.jar" />
</patternset>
Note: lib/jcseg-*.jar
b. cp jcseg-analyzer-1.9.5.jar and jcseg-core-1.9.5.jar to carrot2/lib/
c. recompile and build the jar
#ant jar
d. cp tmp/jar/carrot2-core-3.10.0-SNAPSHOT.jar to solr/WEB-INF/lib/
Note: you should copy the jars in contrib/clustering/lib/, the jcseg jars, the lexicon directory, and the jcseg.properties file to solr/WEB-INF/lib/.
Warning: the most important configuration in solrconfig.xml is the tokenizerFactory attribute:
<str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>
<searchComponent name="clustering" enable="true" class="solr.clustering.ClusteringComponent" > <lst name="engine"> <str name="name">lingo</str> <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="carrot.resourcesDir">clustering/carrot2</str> <str name="MultilingualClustering.defaultLanguage">CHINESE_SIMPLIFIED</str> <str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str> </lst> </searchComponent>
相关推荐
solr的carrot2需要用到的文件solr-integration-strategies-gh-pages carrot3.9webapp,还有tomcat还有solr4.81请自己下载
最新可用已配置好solr的carrot2插件,tomcat里面需配置好solr具体到http://carrot2.github.io/solr-integration-strategies/carrot2-3.8.0/index.html查看
Apache Solr 4 Cookbook Apache Solr 4 Cookbook Apache Solr 4 Cookbook Apache Solr 4 Cookbook Apache Solr 4 Cookbook
Spring Data for Apache Solr API。 Spring Data for Apache Solr 开发文档
Apache Solr Essentials is a fast-paced guide to help you quickly learn the process of creating a scalable, efficient, and powerful search application. The book starts off by explaining the ...
Apache Solr for Indexing Data
This book is for developers who already know how to use Solr and are looking at procuring advanced strategies for improving their search using Solr. This book is also for people who work with ...
apache solr搜索系统的.Net实现
Apache Solr High Performance is a practical guide that will help you explore and take full advantage of the robust nature of Apache Solr so as to achieve optimized Solr instances, especially in terms ...
Apache Solr Search
apache solr 官方文档(英文原版) 包含详细的安装、Schema配置、solrConfig配置、管理页面使用等.
Apache Solr 3 Enterprise Search Server 部分中文翻译 从博客上面保存下来的。是网页版,方便大家查看
apache solr 源文件 版本为3.6.1 让你能够更好地了解solr实现,更好的使用solr
Mastering Apache Solr 7.x An expert guide to advancing, optimizing, and scaling your enterprise search 英文azw3 本资源转载自网络,如有侵权,请联系上传者或csdn删除 查看此书详细信息请在美国亚马逊官网...
《apachesolr7官方指南》
Apache Solr 1.3.0发布,Apache Solr是一个性能强大的,基于 Lucene 的全文搜索的 开源企业级搜索服务器,拥有XML/HTTP,JSON APIs,hit highlighting, faceted search, caching, replication,web管理界面等很多功能...
You will understand the concepts and internals of Apache Solr and tune the results for your client’s search needs. The book explains each essential concept―backed by practical and industry examples...
Apache Solr(solr-8.11.1.tgz)Binary releases 二进制版本
apache solr guide 4.7