Apache SOLR and Carrot2 integration strategies 2

ylzhj02

浏览: 234919 次
性别:
来自: 成都

最近访客更多访客>>

daqin

bbpopeye

也许on

learnmore

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Solr
Carrot
Lucene

To use a custom Chinese tokenizer (e.g. jcseg) with Carrot2 clustering in Solr, follow the steps below.

1. Download the Carrot2 source code and import it into Eclipse

#git clone git://github.com/carrot2/carrot2.git

#cd carrot2

#ant -p

#ant eclipse

2. Import jcseg into Eclipse and add it as a dependency of the carrot2-util-text subproject.

3. modify org.carrot2.text.linguistic.DefaultTokenizerFactory.java

/**
 * Builds the per-language table of tokenizer factories.
 * <p>
 * Every {@link LanguageCode} is first mapped to a factory producing
 * {@code ExtendedWhitespaceTokenizer}; the entry for
 * {@code CHINESE_SIMPLIFIED} is then overwritten with a factory producing
 * {@code InokChineseTokenizerAdapter}. The remainder of the method is
 * elided in this listing (see the {@code .....} marker).
 */
private static EnumMap<LanguageCode, IFactory<ITokenizer>> createDefaultTokenizers() {
		EnumMap<LanguageCode, IFactory<ITokenizer>> map = Maps
				.newEnumMap(LanguageCode.class);

		// Default factory, shared by every language that has no override below.
		IFactory<ITokenizer> whitespaceTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
				ExtendedWhitespaceTokenizer.class);
 		
		// Factory for the custom adapter used for Simplified Chinese.
		IFactory<ITokenizer> chineseTokenizerFactory = new NewClassInstanceFactory<ITokenizer>(
				InokChineseTokenizerAdapter.class);

		// Start with the whitespace tokenizer for all known language codes.
		for (LanguageCode lc : LanguageCode.values()) {
			map.put(lc, whitespaceTokenizerFactory);
		}

		// Simplified Chinese is an exception: replace the default entry with
		// the custom adapter factory created above.
	
		map.put(LanguageCode.CHINESE_SIMPLIFIED, chineseTokenizerFactory);
.....
}

4. create new class org.carrot2.text.linguistic.lucene.InokChineseTokenizerAdapter.java

package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.lionsoul.jcseg.analyzer.JcsegFilter;
import org.lionsoul.jcseg.analyzer.JcsegTokenizer;
import org.lionsoul.jcseg.core.ADictionary;
import org.lionsoul.jcseg.core.DictionaryFactory;
import org.lionsoul.jcseg.core.ISegment;
import org.lionsoul.jcseg.core.IWord;
import org.lionsoul.jcseg.core.JcsegException;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import org.lionsoul.jcseg.core.SegmentFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Adapter that drives a jcseg {@link ISegment} and exposes its output both as
 * a Lucene {@link Tokenizer} (via {@link #incrementToken()}) and as a Carrot2
 * {@link ITokenizer} (via {@link #nextToken()} and
 * {@link #setTermBuffer(MutableCharArray)}).
 * <p>
 * The class has a public no-argument constructor so that it can be created
 * reflectively by a class-instance factory.
 */
public class InokChineseTokenizerAdapter extends Tokenizer implements
		ITokenizer {
	private static final Logger logger = LoggerFactory
			.getLogger(InokChineseTokenizerAdapter.class);

	/** jcseg segmenter that produces the words returned by this adapter. */
	private final ISegment segmentor;

	/** Start/end offsets of the current token. */
	private final OffsetAttribute offsetAtt;

	/** Text of the current token. */
	private final CharTermAttribute termAtt;

	/**
	 * Creates the adapter with a default jcseg configuration and dictionary,
	 * initially attached to an empty input.
	 *
	 * @throws JcsegException if the segmenter cannot be created
	 * @throws IOException if resetting the segmenter on the initial input fails
	 */
	public InokChineseTokenizerAdapter() throws JcsegException, IOException {
		// Start on an empty reader; the real input arrives through reset(Reader).
		super(new StringReader(""));

		JcsegTaskConfig config = new JcsegTaskConfig();
		ADictionary dic = DictionaryFactory.createDefaultDictionary(config);

		// NOTE(review): the literal 1 selects the jcseg segmentation mode;
		// presumably JcsegTaskConfig.SIMPLE_MODE - confirm against the jcseg
		// version in use.
		segmentor = SegmentFactory.createJcseg(1, new Object[] { config, dic });
		segmentor.reset(input);

		termAtt = addAttribute(CharTermAttribute.class);
		offsetAtt = addAttribute(OffsetAttribute.class);
	}

	/**
	 * Points the segmenter at a new input.
	 *
	 * @param reader the text to tokenize next
	 * @throws IOException if the underlying reset fails
	 */
	@Override
	public void reset(Reader reader) throws IOException {
		super.reset();
		segmentor.reset(reader);
	}

	/**
	 * Advances to the next token and classifies it.
	 * <p>
	 * A token consisting of a single character that is neither a letter nor a
	 * digit is reported as punctuation; every other token is reported as a
	 * term. Single-character words (for example one-ideograph Chinese words)
	 * are letters and are therefore reported as terms rather than being
	 * discarded as punctuation.
	 *
	 * @return {@link ITokenizer#TT_PUNCTUATION}, {@link ITokenizer#TT_TERM},
	 *         or {@link ITokenizer#TT_EOF} when the input is exhausted
	 * @throws IOException if reading the next token fails
	 */
	@Override
	public short nextToken() throws IOException {
		if (!incrementToken()) {
			return ITokenizer.TT_EOF;
		}

		final char[] image = termAtt.buffer();
		final int length = termAtt.length();

		if (length == 1 && !Character.isLetterOrDigit(image[0])) {
			return ITokenizer.TT_PUNCTUATION;
		}
		return ITokenizer.TT_TERM;
	}

	/**
	 * Makes {@code array} view the characters of the current token.
	 *
	 * @param array the mutable character array to reset onto the token text
	 */
	@Override
	public void setTermBuffer(MutableCharArray array) {
		array.reset(termAtt.buffer(), 0, termAtt.length());
	}

	/**
	 * Pulls the next word from the segmenter into the term and offset
	 * attributes.
	 *
	 * @return {@code true} if a word was produced, {@code false} once the
	 *         segmenter returns {@code null}
	 * @throws IOException if the segmenter fails to read its input
	 */
	@Override
	public boolean incrementToken() throws IOException {
		clearAttributes();

		final IWord word = segmentor.next();
		if (word == null) {
			end();
			return false;
		}

		termAtt.append(word.getValue());
		// NOTE(review): assumes word.getLength() equals the length of
		// word.getValue(); confirm for the jcseg version in use.
		termAtt.setLength(word.getLength());

		final int start = word.getPosition();
		offsetAtt.setOffset(start, start + word.getLength());
		return true;
	}

}

5. recompile and build jars in carrot2

#cd carrot2

a. modify build.xml to add jcseg jars

 <!-- Pattern set "lib.test": jar files under core/, lib/ and the listed
      application directories, excluding the slf4j-nop binding. The
      lib/jcseg-*.jar include is the line added for jcseg; in Ant patterns
      "**" also matches zero directories, so lib/**/*.jar already covers
      jars placed directly in lib/ and the extra include is redundant here
      but harmless. -->
 <patternset id="lib.test">
    <include name="core/**/*.jar" />
    <include name="lib/**/*.jar" />
    <include name="lib/jcseg-*.jar" />
    <exclude name="lib/org.slf4j/slf4j-nop*" />
    <include name="applications/carrot2-dcs/**/*.jar" />
    <include name="applications/carrot2-webapp/lib/*.jar" />
    <include name="applications/carrot2-benchmarks/lib/*.jar" />
  </patternset>

  <!-- Pattern set "lib.core": all jars under lib/ plus the
       carrot2-util-matrix jars, minus whatever "lib.core.excludes" (defined
       elsewhere in build.xml) removes. The lib/jcseg-*.jar include is the
       line added for jcseg. -->
  <patternset id="lib.core">
    <include name="lib/**/*.jar" />
    <include name="core/carrot2-util-matrix/lib/*.jar" />
    <include name="lib/jcseg-*.jar" />
    <patternset refid="lib.core.excludes" />
  </patternset>

  <!-- Pattern set "lib.core.mini": an explicit list of individual
       libraries and their license files. Nothing here matches all jars
       under lib/, so the lib/jcseg-*.jar include below is what pulls the
       jcseg jars into this set. -->
  <patternset id="lib.core.mini">
    <include name="lib/**/mahout-*.jar" />
    <include name="lib/jcseg-*.jar" />
    <include name="lib/**/mahout.LICENSE" />
    <include name="lib/**/colt.LICENSE" />
    <include name="lib/**/commons-lang*" />
    <include name="lib/**/guava*" />
    <include name="lib/**/jackson*" />
    <include name="lib/**/lucene-snowball*" />
    <include name="lib/**/lucene.LICENSE" />
    <include name="lib/**/hppc-*.jar" />
    <include name="lib/**/hppc*.LICENSE" />

    <include name="lib/**/slf4j-api*.jar" />
    <include name="lib/**/slf4j-nop*.jar" />
    <include name="lib/**/slf4j.LICENSE" />

    <include name="lib/**/attributes-binder-*.jar" />
  </patternset>

Note: lib/jcseg-*.jar

b. cp jcseg-analyzer-1.9.5.jar and jcseg-core-1.9.5.jar to carrot2/lib/

c. Recompile and build the jars

#ant jar

d. cp tmp/jar/carrot2-core-3.10.0-SNAPSHOT.jar to solr/WEB-INF/lib/

Note: you should also copy the jars in contrib/clustering/lib/, the jcseg jars, the lexicon directory and the jcseg.properties file to solr/WEB-INF/lib/.

Warning: the most important setting in solrconfig.xml is the tokenizerFactory attribute, which must be defined as follows:

<str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>

  <!-- Clustering search component with a single engine named "lingo". -->
  <searchComponent name="clustering"
                   enable="true"
                   class="solr.clustering.ClusteringComponent" >
    <lst name="engine">
      <str name="name">lingo</str>
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <!-- CHINESE_SIMPLIFIED is the language code that the modified
           DefaultTokenizerFactory maps to the jcseg-backed adapter. -->
      <str name="MultilingualClustering.defaultLanguage">CHINESE_SIMPLIFIED</str>
      <!-- Points the preprocessing pipeline at the modified tokenizer factory. -->
      <str name="PreprocessingPipeline.tokenizerFactory">org.carrot2.text.linguistic.DefaultTokenizerFactory</str>
    </lst>
</searchComponent>

查看图片附件

分享到：

Build and deploy workbench | Apache SOLR and Carrot2 integration stra ...

2014-11-04 17:39
浏览 974
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论