JAVA 中文分词正向最大匹配和逆向最大匹配算法实现

sungang_1120

浏览: 309514 次
性别:
来自: 成都

最近访客更多访客>>

bxl994

eplang

zhongzunfa

sdyjmc

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

细细品味-算法
细细品味-地理编码

可以先下载一个词库，这里使用的是搜狗的词库。

Segmentation.java

package cn.strong;

import java.util.HashMap;

/**
 * Dictionary-based word segmenter offering forward maximum matching
 * ({@link #Fmm(String)}) and backward maximum matching ({@link #Bmm(String)}).
 *
 * <p>Both methods only read the two maps supplied at construction time and
 * keep no per-call state in the instance.
 */
public class Segmentation {

    /** Dictionary of known words; only its keys are consulted. */
    private final HashMap<?, ?> mapDic;

    /** Index of word lengths that occur in the dictionary; only its keys are consulted. */
    private final HashMap<?, ?> len;

    /**
     * Creates a segmenter backed by the given dictionary.
     *
     * @param mapDic map whose keys are the dictionary words
     * @param len    map whose keys are the (boxed {@code Integer}) lengths of
     *               the dictionary words
     */
    public Segmentation(HashMap<?, ?> mapDic, HashMap<?, ?> len) {
        this.mapDic = mapDic;
        this.len = len;
    }

    /**
     * Segments {@code source} with forward maximum matching: starting at the
     * left edge, the longest dictionary word beginning at the current position
     * is taken; if none matches, a single character is taken.
     *
     * @param source the text to segment
     * @return the segments in reading order, joined by {@code "/"}; an empty
     *         string when {@code source} is empty
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public String Fmm(String source) {
        final int total = source.length();
        StringBuilder out = new StringBuilder(total * 2);
        int start = 0;
        while (start < total) {
            // Shrink the window from the right until it is a dictionary word
            // or only one character is left (always accepted as a segment).
            int end = total;
            while (end - start > 1 && !isWord(source, start, end)) {
                end--;
            }
            if (out.length() > 0) {
                out.append('/');
            }
            out.append(source, start, end);
            start = end;
        }
        return out.toString();
    }

    /**
     * Segments {@code source} with backward maximum matching: starting at the
     * right edge, the longest dictionary word ending at the current position
     * is taken; if none matches, a single character is taken.
     *
     * @param source the text to segment
     * @return the segments in reading order, joined by {@code "/"}; an empty
     *         string when {@code source} is empty
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public String Bmm(String source) {
        final int total = source.length();
        // Segments are discovered right-to-left; there can be at most one per character.
        String[] segments = new String[total];
        int count = 0;
        int end = total;
        while (end > 0) {
            // Shrink the window from the left until it is a dictionary word
            // or only one character is left (always accepted as a segment).
            int start = 0;
            while (end - start > 1 && !isWord(source, start, end)) {
                start++;
            }
            segments[count++] = source.substring(start, end);
            end = start;
        }

        // Emit in reading order, i.e. the reverse of discovery order.
        StringBuilder out = new StringBuilder(total * 2);
        for (int i = count - 1; i >= 0; i--) {
            out.append(segments[i]);
            if (i > 0) {
                out.append('/');
            }
        }
        return out.toString();
    }

    /**
     * Tells whether {@code source[start, end)} is a dictionary word. The
     * length index is checked first, using the window LENGTH (not its end
     * offset), so no substring is built for lengths the dictionary lacks.
     */
    private boolean isWord(String source, int start, int end) {
        return len.containsKey(end - start)
                && mapDic.containsKey(source.substring(start, end));
    }
}

GenerateDictionary.java

package cn.strong;

import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.util.HashMap;

/**
 * Loads a one-word-per-line dictionary file into the two lookup maps used by
 * the segmenter.
 */
public class GenerateDictionary {

    /**
     * Reads {@code filename} line by line and records every non-empty line as
     * a dictionary word.
     *
     * <p>For each word, {@code hm} receives {@code word -> word.length()} and
     * {@code len} receives {@code word.length() -> word}. Because {@code len}
     * is keyed by length, it ends up holding only the last word read for each
     * length; its key set is the set of word lengths present in the file.
     *
     * <p>Empty lines are skipped so that the empty string never becomes a
     * dictionary "word" of length 0.
     *
     * <p>NOTE(review): {@link FileReader} decodes the file with the platform
     * default charset — confirm this matches the dictionary file's encoding.
     *
     * @param filename path of the dictionary file
     * @param hm       map to fill with {@code word -> length}
     * @param len      map to fill with {@code length -> word}
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading or closing the file fails
     */
    // Raw HashMap parameters are kept so existing callers compile unchanged.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public void GenHashDic(String filename, HashMap hm, HashMap len)
            throws FileNotFoundException, IOException {
        BufferedReader in = new BufferedReader(new FileReader(filename));
        try {
            String line;
            // Runs once per line of the dictionary file.
            while ((line = in.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                hm.put(line, line.length());
                len.put(line.length(), line);
            }
        } finally {
            // Always release the file handle, even if reading fails midway.
            in.close();
        }
    }
}

Test.java测试

package cn.Test;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.HashMap;

import cn.strong.GenerateDictionary;

import cn.strong.Segmentation;

public class Test {

public static void main(String[] args) throws IOException {

String filename = "sougou.txt";

HashMap hm = new HashMap();

HashMap len = new HashMap();

GenerateDictionary genDic = new GenerateDictionary();

Segmentation seg;

long genStart = System.currentTimeMillis();

genDic.GenHashDic(filename, hm, len);

System.out.println("GenHashDic 消耗时间："+(System.currentTimeMillis() - genStart));

System.out.println("请输入您需要分解的语句：");

InputStreamReader reader = new InputStreamReader(System.in);

BufferedReader br = new BufferedReader(reader);

String data = "";

data = br.readLine();

seg = new Segmentation(hm, len);

long emmStart = System.currentTimeMillis();

String FmmTarget = seg.Fmm(data);

System.out.println("FMM 算法共花费时间为："+(System.currentTimeMillis()-emmStart));

long bmmStart = System.currentTimeMillis();

String BmmTarget = seg.Bmm(data);

System.out.println("BMM 算法共花费时间为："+(System.currentTimeMillis()-bmmStart));

System.out.println("FMM算法统计结果为: " + FmmTarget);

System.out.println("BMM算法统计结果为: " + BmmTarget);

}

GenHashDic 消耗时间：1337

请输入您需要分解的语句：

中国四川省成都市青羊区少城路9号人民公园

FMM 算法共花费时间为：0

BMM 算法共花费时间为：0

FMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园

BMM算法统计结果为: 中国/四川省/成都市/青羊区/少/城/路/9/号/人民/公园

分享到：

JAVA反射机制通过反射 Field类获取和 ... | JAVA 枚举类型简单介绍

2013-01-20 01:11
浏览 1579
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论