`
yjhexy
  • 浏览: 327522 次
  • 性别: Icon_minigender_1
  • 来自: 火星
社区版块
存档分类
最新评论

JAVA 正则表达式初探

阅读更多

一直听说 Apache 有个叫 ORO 的正则表达式库比 Java 自带的好用,于是稍微仔细地研究了一番:

先写了一些很简单的代码:

package com.yajun;

import java.util.ArrayList;
import java.util.List;

import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Substitution;
import org.apache.oro.text.regex.Util;

/**
 * Small convenience wrapper around the Jakarta ORO Perl5 regular-expression
 * classes. Each instance is bound to the single pattern that was compiled in
 * the constructor; every method below works against that pattern.
 * <p>
 * Note: all public methods still accept a {@code patternString} argument for
 * backward compatibility, but it is ignored. The pattern passed to the
 * constructor is always the one that is used.
 *
 * @author yajun.wuyj 2009-11-15 07:55:34 PM
 */
public class MyPerlMatchUtil {

	/** Pattern compiled once in the constructor and shared by all methods. */
	private final Pattern pattern;

	/**
	 * Compiles {@code patternString} with a {@link Perl5Compiler}.
	 *
	 * @param patternString Perl5 regular expression to compile
	 * @throws RuntimeException wrapping the {@link MalformedPatternException}
	 *             if the expression cannot be compiled
	 */
	public MyPerlMatchUtil(String patternString) {
		PatternCompiler compiler = new Perl5Compiler();
		try {
			this.pattern = compiler.compile(patternString);
		} catch (MalformedPatternException e) {
			System.err.println("正则表达式出错 ");
			throw new RuntimeException(e);
		}
	}

	/**
	 * Tests whether the input matches the pattern using
	 * {@code Perl5Matcher.matches}.
	 *
	 * @param inputString text to test
	 * @param patternString ignored; the constructor's pattern is used
	 * @return the result of {@code Perl5Matcher.matches} for this input
	 */
	public boolean match(String inputString, String patternString) {
		return new Perl5Matcher().matches(inputString, pattern);
	}

	/**
	 * Returns every group of a successful {@code matches} call, in group-index
	 * order starting at index 0.
	 *
	 * @param inputString text to match
	 * @param patternString ignored; the constructor's pattern is used
	 * @return the groups of the match, or an empty list when
	 *         {@code matches} fails; never {@code null}
	 */
	public List<String> getMatchedKey(String inputString, String patternString) {
		List<String> groupList = new ArrayList<String>();
		Perl5Matcher matcher = new Perl5Matcher();
		if (matcher.matches(inputString, pattern)) {
			int groupNum = matcher.getMatch().groups();
			for (int i = 0; i < groupNum; i++) {
				groupList.add(matcher.getMatch().group(i));
			}
		}
		return groupList;
	}

	/**
	 * Tests whether the input contains the pattern using
	 * {@code Perl5Matcher.contains}.
	 *
	 * @param inputString text to search
	 * @param patternString ignored; the constructor's pattern is used
	 * @return the result of {@code Perl5Matcher.contains} for this input
	 */
	public boolean contains(String inputString, String patternString) {
		return new Perl5Matcher().contains(inputString, pattern);
	}

	/**
	 * Searches the input through a {@link PatternMatcherInput}.
	 * <p>
	 * NOTE(review): the method name suggests the search can be continued with
	 * successive calls, but a fresh matcher and a fresh
	 * {@code PatternMatcherInput} are created on every call, so no position is
	 * carried over between calls. Supporting continuation would require the
	 * caller to own the {@code PatternMatcherInput}, which the current
	 * signature does not allow.
	 *
	 * @param inputString text to search
	 * @param patternString ignored; the constructor's pattern is used
	 * @return the result of {@code Perl5Matcher.contains} on the new input
	 */
	public boolean nextContains(String inputString, String patternString) {
		Perl5Matcher matcher = new Perl5Matcher();
		PatternMatcherInput input = new PatternMatcherInput(inputString);
		return matcher.contains(input, pattern);
	}

	/**
	 * Replaces every occurrence of the pattern in {@code sourceString}.
	 * <p>
	 * The replacement is a {@link Perl5Substitution} created with
	 * {@code Perl5Substitution.INTERPOLATE_ALL}, so group references such as
	 * {@code $1} in {@code desString} are expanded. The previous code passed
	 * {@code Util.SUBSTITUTE_ALL} as the interpolation count; that constant
	 * belongs to {@code Util.substitute} and, used here, disabled
	 * interpolation altogether.
	 *
	 * @param sourceString text to perform the replacement on
	 * @param patternString ignored; the constructor's pattern is used
	 * @param desString replacement expression (Perl5 substitution syntax)
	 * @return the text with all matches replaced
	 */
	public String replace(String sourceString, String patternString,
			String desString) {
		PatternMatcher matcher = new Perl5Matcher();
		Perl5Substitution replacer = new Perl5Substitution(desString,
				Perl5Substitution.INTERPOLATE_ALL);
		// Util.substitute declares no checked exceptions, so no wrapping
		// try/catch is needed; runtime exceptions propagate unchanged.
		return Util.substitute(matcher, pattern, replacer, sourceString,
				Util.SUBSTITUTE_ALL);
	}
}

 写完之后才发现 ORO 本身就提供了与上述功能类似的工具类,而且功能肯定比我写的更完善、更强大,比方说它的工具类会用 LRU 算法缓存常用的已编译正则表达式。吐血,白写了……

使用他提供的类写了如下简单的代码:

package com.yajun;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.oro.text.perl.Perl5Util;

/**
 * Small driver that exercises ORO's {@link Perl5Util}: it reads a text file
 * line by line, prints every line that matches a pattern together with the
 * result of a substitution, and finally prints the elapsed time in
 * milliseconds.
 *
 * @author yajun.wuyj
 */
public class Perl5MatcherTester {

	/**
	 * Shared across all calls to {@link #match(String, String)}. Creating a
	 * new Perl5Util per line would throw away whatever compiled expressions
	 * the utility keeps between calls, forcing a recompile for every line.
	 */
	private final Perl5Util util = new Perl5Util();

	/**
	 * Prints {@code line} and its substituted form when it matches
	 * {@code pattern}; prints nothing otherwise.
	 *
	 * @param line text to test
	 * @param pattern Perl5 match expression, e.g. {@code "/foo/"}
	 */
	public void match(String line, String pattern) {
		if (util.match(pattern, line)) {
			System.out.println(line);
			System.out.println(util.substitute("s/美女/{我爱美女}/g", line));
		}
	}

	/**
	 * Runs the matcher over every line of the hard-coded input file.
	 *
	 * @param args unused
	 * @throws IOException if the file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		long start = System.currentTimeMillis();
		// One tester (and therefore one Perl5Util) for the whole file.
		Perl5MatcherTester test = new Perl5MatcherTester();
		BufferedReader br = new BufferedReader(new FileReader(new File(
				"E:/work/balanceofworld/balance/balance.textparser/meinv.txt")));
		try {
			String line;
			while ((line = br.readLine()) != null) {
				test.match(line, "/美女/");
			}
		} finally {
			// Closing the BufferedReader also closes the wrapped FileReader.
			br.close();
		}
		long end = System.currentTimeMillis();
		System.out.println(end - start);
	}
}

 

代码一下变得少了很多。

 

也做了一下性能比较,可能是因为我用的是小文本和很短的正则表达式,没有比较出明显的性能差异。

 

既然使用了,不妨也看看他怎么实现的,于是进入网站:

于是 svn co http://svn.apache.org/repos/asf/jakarta/oro/trunk/ oro

下了代码下来一看,汗一个,贴出来看下:

/* 
 * $Id: JavaCompiler.java 124053 2005-01-04 01:24:35Z dfs $
 *
 * Copyright 2000-2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.oro.text.java;

import java.util.regex.*;

import org.apache.oro.text.regex.*;

/**
 * {@link PatternCompiler} implementation that produces {@code JavaPattern}
 * instances. All overloads funnel into
 * {@link #compile(String, int)}.
 *
 * @version @version@
 * @since 2.1
 */

public final class JavaCompiler implements PatternCompiler {

  /**
   * Compiles a pattern with no options (equivalent to passing {@code 0}).
   *
   * @param pattern the regular expression to compile
   * @return the compiled pattern
   * @throws MalformedPatternException if the expression cannot be compiled
   */
  public org.apache.oro.text.regex.Pattern compile(String pattern)
    throws MalformedPatternException
  {
    return compile(pattern, 0);
  }


  /**
   * Compiles a pattern into a {@code JavaPattern}.
   * <p>
   * NOTE(review): {@code options} is accepted but not passed on to
   * {@code JavaPattern} here; confirm whether {@code JavaPattern} offers a
   * way to honour compile options.
   *
   * @param pattern the regular expression to compile
   * @param options compile options (currently not forwarded)
   * @return the compiled pattern
   * @throws MalformedPatternException carrying the message of whatever
   *         exception the {@code JavaPattern} constructor threw
   */
  public org.apache.oro.text.regex.Pattern compile(String pattern, int options)
    throws MalformedPatternException
  {
    try {
      return new JavaPattern(pattern);
    } catch(Exception e) {
      // We can't wrap the exception without making MalformedPatternException
      // dependent on J2SE 1.4.
      throw new MalformedPatternException(e.getMessage());
    }
  }


  /**
   * Compiles a pattern given as a character array, with no options.
   *
   * @param pattern the regular expression to compile
   * @return the compiled pattern
   * @throws MalformedPatternException if the expression cannot be compiled
   */
  public org.apache.oro.text.regex.Pattern compile(char[] pattern)
    throws MalformedPatternException 
  {
    return compile(new String(pattern));
  }


  /**
   * Compiles a pattern given as a character array.
   * <p>
   * Forwards {@code options} to {@link #compile(String, int)} instead of
   * dropping it, so both option-taking overloads behave the same way.
   *
   * @param pattern the regular expression to compile
   * @param options compile options, passed through to the String overload
   * @return the compiled pattern
   * @throws MalformedPatternException if the expression cannot be compiled
   */
  public org.apache.oro.text.regex.Pattern compile(char[] pattern, int options)
       throws MalformedPatternException
  {
    return compile(new String(pattern), options);
  }

}

 

看了以上代码发现,这个 JavaCompiler 底层用的还是 SUN 自带的 java.util.regex,所以本质上应该不会有太大的性能或者 BUG 问题。

不过他提供的工具类的确蛮好。至少可以少写很多代码。

这个类库还提供了awk的功能,总体来说还是不错的,推荐使用啦。

 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics