`
m635674608
  • 浏览: 4930624 次
  • 性别: Icon_minigender_1
  • 来自: 南京
社区版块
存档分类
最新评论

elasticsearch 1.1.0 mmseg 英文数字分词

 
阅读更多

elasticsearch 1.1.0  mmseg 插件的版本是1.2.2 版本。该版本没有解决英文数字分词问题。

比如  user123。分词后 user123

 

解决1:

mmseg插件升级 elasticsearch-analysis-mmseg-1.4.0。

https://github.com/medcl/elasticsearch-analysis-mmseg/commit/61b5e8199425c845a3060fe39f40e59868dd364b 

index:
  analysis: 
    tokenizer:
      mmseg_maxword:
        type: mmseg
        seg_type: max_word
      mmseg_complex:
        type: mmseg
        seg_type: complex
    analyzer:
      mmseg_maxword:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      mmseg:
        type: custom
        filter:
        - lowercase
        - cut_letter_digit
        tokenizer: mmseg_maxword
      mmseg_complex:
        type: custom
        filter:
        - lowercase
        tokenizer: mmseg_complex
#index.analysis.analyzer.default.type : "org.elasticsearch.index.analysis.MMsegAnalyzerProvider"
#index.analysis.analyzer.default.type : "ik"
index.analysis.analyzer.default.type : "mmseg"

 

解决2:

         修改1.2.2 版本jar包中的MMSegAnalyzer,然后替换class

         

package com.chenlb.mmseg4j.analysis;

import java.io.File;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;

/**
 * 榛樿浣跨敤 max-word
 *
 * @see {@link SimpleAnalyzer}, {@link ComplexAnalyzer}, {@link MaxWordAnalyzer}
 *
 * @author chenlb
 */
public class MMSegAnalyzer extends Analyzer {

	protected Dictionary dic;

	/**
	 * @see Dictionary#getInstance()
	 */
	public MMSegAnalyzer() {
		dic = Dictionary.getInstance();
	}

	/**
	 * @param path 璇嶅簱璺緞
	 * @see Dictionary#getInstance(String)
	 */
	public MMSegAnalyzer(String path) {
		dic = Dictionary.getInstance(path);
	}

	/**
	 * @param path 璇嶅簱鐩綍
	 * @see Dictionary#getInstance(File)
	 */
	public MMSegAnalyzer(File path) {
		dic = Dictionary.getInstance(path);
	}

	public MMSegAnalyzer(Dictionary dic) {
		super();
		this.dic = dic;
	}

	protected Seg newSeg() {
		return new MaxWordSeg(dic);
	}

	public Dictionary getDict() {
		return dic;
	}

	/*@Override
	protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
		return new TokenStreamComponents(new MMSegTokenizer(newSeg(), reader));
	}*/
	@Override
	protected TokenStreamComponents createComponents(String fieldName,
			Reader reader) {
		Tokenizer t = new MMSegTokenizer(newSeg(), reader);
		return new TokenStreamComponents(t, new CutLetterDigitFilter(t));
	}
	
	
}

 

       

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics