`
sungang_1120
  • 浏览: 309514 次
  • 性别: Icon_minigender_1
  • 来自: 成都
社区版块
存档分类

JAVA 中文分词正向最大匹配和逆向最大匹配算法实现

 
阅读更多

可以先下载一个词库,这里用的是搜狗的词库。

 

Segmentation.java

package cn.strong;

 

import java.util.HashMap;

 

/**
 * Dictionary-based word segmenter offering forward maximum matching
 * ({@link #Fmm(String)}) and backward (reverse) maximum matching
 * ({@link #Bmm(String)}).
 *
 * <p>Both algorithms greedily pick the longest dictionary word at the current
 * position; when no dictionary word matches, a single character is emitted so
 * that segmentation always makes progress. Words in the result are separated
 * by {@code "/"} with no leading or trailing separator.
 *
 * <p>Matching works on Java {@code char} units, exactly like
 * {@link String#substring(int, int)}.
 */
public class Segmentation {

    /** Dictionary of known words; only its keys are consulted. */
    private final HashMap<?, ?> mapDic;

    /**
     * Set of word lengths that occur in the dictionary, keyed by {@link Integer};
     * only its keys are consulted. Used to skip candidate lengths that cannot match.
     */
    private final HashMap<?, ?> len;

    /**
     * Creates a segmenter backed by the given lookup tables.
     *
     * @param mapDic map whose keys are the dictionary words
     * @param len    map whose keys are the {@link Integer} lengths of the dictionary words
     */
    public Segmentation(HashMap mapDic, HashMap len) {
        this.mapDic = mapDic;
        this.len = len;
    }

    /**
     * Segments {@code source} with forward maximum matching: starting at the left
     * edge, the longest dictionary word beginning at the current position is taken,
     * then matching resumes right after it.
     *
     * @param source the text to segment; {@code null} or empty yields {@code ""}
     * @return the words joined by {@code "/"}
     */
    public String Fmm(String source) {
        if (source == null || source.length() == 0) {
            return "";
        }
        final int total = source.length();
        StringBuilder result = new StringBuilder(total * 2);
        int start = 0;
        while (start < total) {
            // Shrink the candidate from the right until it is a dictionary word
            // or only one character is left (which is always accepted).
            int end = total;
            while (end - start > 1 && !isWord(source, start, end)) {
                end--;
            }
            if (start > 0) {
                result.append('/');
            }
            result.append(source, start, end);
            start = end;
        }
        return result.toString();
    }

    /**
     * Segments {@code source} with backward maximum matching: starting at the right
     * edge, the longest dictionary word ending at the current position is taken,
     * then matching resumes right before it.
     *
     * @param source the text to segment; {@code null} or empty yields {@code ""}
     * @return the words, in reading order, joined by {@code "/"}
     */
    public String Bmm(String source) {
        if (source == null || source.length() == 0) {
            return "";
        }
        final int total = source.length();
        // Words are discovered right-to-left; at most one word per character.
        String[] words = new String[total];
        int count = 0;
        int end = total;
        while (end > 0) {
            // Shrink the candidate from the left until it is a dictionary word
            // or only one character is left (which is always accepted).
            int start = 0;
            while (end - start > 1 && !isWord(source, start, end)) {
                start++;
            }
            words[count++] = source.substring(start, end);
            end = start;
        }

        // Emit in reading order, i.e. the reverse of discovery order.
        StringBuilder result = new StringBuilder(total * 2);
        for (int i = count - 1; i >= 0; i--) {
            result.append(words[i]);
            if (i > 0) {
                result.append('/');
            }
        }
        return result.toString();
    }

    /**
     * Tells whether {@code source[start, end)} is a dictionary word. The length
     * table is checked first, using the candidate's length (not its end index),
     * so that no substring is created for lengths the dictionary does not contain.
     */
    private boolean isWord(String source, int start, int end) {
        return len.containsKey(Integer.valueOf(end - start))
                && mapDic.containsKey(source.substring(start, end));
    }
}

GenerateDictionary.java

package cn.strong;

 

import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.util.HashMap;

 

/**
 * Loads a word list from a text file into the lookup tables used for
 * dictionary-based segmentation.
 */
public class GenerateDictionary {

    /**
     * Reads {@code filename} line by line and records every non-empty line as a
     * dictionary entry.
     *
     * <p>For each entry {@code s}, {@code hm} receives {@code s -> s.length()} and
     * {@code len} receives {@code s.length() -> s}; for a given length, {@code len}
     * keeps only the last entry read, so it effectively serves as the set of lengths
     * present in the dictionary. Empty lines are skipped because a zero-length
     * "word" can never be a useful match.
     *
     * <p>The file is decoded with the platform default charset (behaviour of
     * {@link FileReader}). NOTE(review): confirm this matches the word list's encoding.
     *
     * @param filename path of the word list, one entry per line
     * @param hm       map to fill with entry -> entry length
     * @param len      map to fill with entry length -> an entry of that length
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading or closing the file fails
     */
    // Raw map parameters are part of the existing public signature.
    @SuppressWarnings({"unchecked", "rawtypes"})
    public void GenHashDic(String filename, HashMap hm, HashMap len) throws FileNotFoundException, IOException {
        BufferedReader in = new BufferedReader(new FileReader(filename));
        try {
            String s;
            // One iteration per line of the word list.
            while ((s = in.readLine()) != null) {
                if (s.length() == 0) {
                    continue;
                }
                hm.put(s, s.length());
                len.put(s.length(), s);
            }
        } finally {
            // Always release the file handle, even if reading fails.
            in.close();
        }
    }
}

Test.java测试

package cn.Test;

 

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.HashMap;

 

import cn.strong.GenerateDictionary;

import cn.strong.Segmentation;

 

public class Test {

 

public static void main(String[] args) throws IOException {

 

String filename = "sougou.txt";

HashMap hm = new HashMap();

HashMap len = new HashMap();

GenerateDictionary genDic = new GenerateDictionary();

Segmentation seg;

 

long genStart = System.currentTimeMillis();

genDic.GenHashDic(filename, hm, len);

System.out.println("GenHashDic 消耗时间:"+(System.currentTimeMillis() - genStart));

 

System.out.println("请输入您需要分解的语句:");

 

InputStreamReader reader = new InputStreamReader(System.in);

BufferedReader br = new BufferedReader(reader);

String data = "";

data = br.readLine();

seg = new Segmentation(hm, len);

long emmStart = System.currentTimeMillis();

String FmmTarget = seg.Fmm(data);

System.out.println("FMM 算法共花费时间为:"+(System.currentTimeMillis()-emmStart));

long bmmStart = System.currentTimeMillis();

String BmmTarget = seg.Bmm(data);

System.out.println("BMM 算法共花费时间为:"+(System.currentTimeMillis()-bmmStart));

 

System.out.println("FMM算法统计结果为: " + FmmTarget);

System.out.println("BMM算法统计结果为: " + BmmTarget);

}

 

}

 GenHashDic 消耗时间:1337

请输入您需要分解的语句:

中国四川省成都市青羊区少城路9号人民公园

FMM 算法共花费时间为:0

BMM 算法共花费时间为:0

FMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园

BMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics