用Java实现Bloom Filter -

chen_yongkai

浏览: 61314 次
性别:
来自: 福州

最近访客更多访客>>

法萨芬

longxm

lingqiang522

xubukang

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (10)

社区版块

存档分类

用Java实现Bloom Filter

java bloom filter

正好在“问答”和“论坛”中看到关于Bloom Filter的帖子，学习研究了一把，自娱自乐就写了一种实现。不多说，直接上代码，代码尽量写得具备可读性，不多解释了。关于Bloom Filter可以参考http://www.google.com.hk/ggblog/googlechinablog/2007/07/bloom-filter_7469.html


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class BloomFilter<T extends Serializable> implements Serializable {
	/**
	 * 
	 */
	private static final long serialVersionUID = -4322599722288348992L;
	private static final int DEFAULT_CAPACITY = 1 << 16;
	private transient BitSet filter;
	private HashGenerator<T> hashGenerator;
	private int hashSize;
	private int nbits;

	public BloomFilter(HashGenerator<T> hashGenerator) {
		this(hashGenerator, hashGenerator.size(), DEFAULT_CAPACITY);
	}

	public BloomFilter(HashGenerator<T> hashGenerator, int capacity) {
		this(hashGenerator, hashGenerator.size(), capacity);
	}

	public BloomFilter(HashGenerator<T> hashGenerator, int hashSize, int capacity) {
		super();
		this.hashGenerator = hashGenerator;
		this.hashSize = hashSize;
		this.nbits = capacity * hashSize * 2;
		filter = new BitSet(nbits);
	}

	private void writeObject(ObjectOutputStream out) throws IOException {
		// 压缩
		ByteArrayOutputStream buf = new ByteArrayOutputStream();
		ObjectOutputStream objOut = new ObjectOutputStream(new GZIPOutputStream(buf));
		objOut.writeObject(filter);
		objOut.close();
		out.writeObject(buf.toByteArray());
		out.defaultWriteObject();
	}

	private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
		byte[] buf = (byte[]) in.readObject();
		ObjectInputStream objIn = new ObjectInputStream(new GZIPInputStream(
				new ByteArrayInputStream(buf)));
		filter = (BitSet) objIn.readObject();
		objIn.close();
		in.defaultReadObject();
	}

	public void add(T key) {
		for (int i = 0; i < hashSize; i++) {
			long hashCode = hashGenerator.getHashCode(key, i);
			int index = hashGenerator.getBitIndex(hashCode, nbits);
			filter.set(index);
		}
	}

	public boolean contains(T key) {
		for (int i = 0; i < hashSize; i++) {
			long hashCode = hashGenerator.getHashCode(key, i);
			int index = hashGenerator.getBitIndex(hashCode, nbits);
			if (!filter.get(index))
				return false;
		}
		return true;
	}
}

import java.io.Serializable;

public interface HashGenerator<T> extends Serializable {
	public int getBitIndex(long hashCode,int maxIndex);

	public long getHashCode(T key, int index);

	public int size();
}

import java.util.Random;

public abstract class AbstractHashGenerator<T> implements HashGenerator<T> {
	/**
	 * 
	 */
	private static final long serialVersionUID = 1918866698987940799L;
	private static final Random rand = new Random();
	private int size;

	public AbstractHashGenerator(int size) {
		super();
		this.size = size;
	}

	public int getBitIndex(long hashCode, int maxIndex) {
		rand.setSeed(hashCode);
		return rand.nextInt(maxIndex);
	}

	public int size() {
		return size;
	}
}



public class SimpleHashGenerator<T> extends AbstractHashGenerator<T> {

	/**
	 * 
	 */
	private static final long serialVersionUID = -6971076063651082178L;

	public SimpleHashGenerator(int size) {
		super(size);
	}

	public SimpleHashGenerator() {
		this(8);
	}

	private long getHashCode2(T key, int index) {
		int h = index * key.hashCode();
		h ^= (h >>> 20) ^ (h >>> 12);
		return h ^ (h >>> 7) ^ (h >>> 4);
	}

	private long getHashCode1(T key, int index) {
		int h = index * 31 + key.hashCode();
		h += ~(h << 9);
		h ^= (h >>> 14);
		h += (h << 4);
		h ^= (h >>> 10);
		return h;
	}

	public long getHashCode(T key, int index) {
		return getHashCode1(key, index);
		// return getHashCode2(key, index);
		// if ((index & 1) == 0) {
		// return getHashCode1(key, index);
		// } else {
		// return getHashCode2(key, index);
		// }
	}
}

小小测试：


import java.io.File;
import java.io.Serializable;

import bluechip.io.SerializeUtils;

public class BloomFilterTest {
	static class Key implements Serializable {
		/**
		 * 
		 */
		private static final long serialVersionUID = 7503732767154152820L;
		String name;
		String id;

		public Key(String name, String id) {
			super();
			this.name = name;
			this.id = id;
		}

		/*
		 * (non-Javadoc)
		 * 
		 * @see java.lang.Object#hashCode()
		 */
		@Override
		public int hashCode() {
			final int prime = 31;
			int result = 1;
			result = prime * result + ((id == null) ? 0 : id.hashCode());
			result = prime * result + ((name == null) ? 0 : name.hashCode());
			return result;
		}

		/*
		 * (non-Javadoc)
		 * 
		 * @see java.lang.Object#equals(java.lang.Object)
		 */
		@Override
		public boolean equals(Object obj) {
			if (this == obj)
				return true;
			if (obj == null)
				return false;
			if (getClass() != obj.getClass())
				return false;
			Key other = (Key) obj;
			if (id == null) {
				if (other.id != null)
					return false;
			} else if (!id.equals(other.id))
				return false;
			if (name == null) {
				if (other.name != null)
					return false;
			} else if (!name.equals(other.name))
				return false;
			return true;
		}

		/*
		 * (non-Javadoc)
		 * 
		 * @see java.lang.Object#toString()
		 */
		@Override
		public String toString() {
			return "Key [id=" + id + ", name=" + name + "]";
		}

	}

	/**
	 * @param args
	 */
	public static void main(String[] args) throws Exception {
		File file = new File("d:/filter.dat");
		int n = 1000000;
		BloomFilter<Key> bf = null;
		try {
			bf = SerializeUtils.readObject(file);
		} catch (Exception ex) {
			
			bf = new BloomFilter<Key>(new SimpleHashGenerator<Key>(), n);
			for (int i = 0; i < n; i++) {
				bf.add(new Key("Jim", String.valueOf(i)));
			}
			SerializeUtils.writeObject(bf, new File("d:/filter.dat"));
		}

		System.out.println("==================");
		for (int i = 0; i < n; i++) {
			Key k = new Key("aaa", String.valueOf(i));
			if (bf.contains(k)) {
				System.out.println(k);
			}
		}
		System.out.println("==================");
		for (int i = 0; i < n; i++) {
			Key k = new Key("Jim", String.valueOf(i));
			if (!bf.contains(k)) {
				System.out.println(k);
			}
		}
	}

}

分享到：

用Java实现搜索引擎布尔运算

2011-09-09 17:09
浏览 4069
评论(4)
分类:编程语言
查看更多

4 楼 wa114d 2012-08-30

牛人啊

能提供代码jar包吗

3 楼 happyITLife 2012-06-05

hj01kkk 写道

你的这个类在哪里定义的?bluechip.io.SerializeUtils,方便提供下不?

同问,不知道导入哪个jar

2 楼 chen_yongkai 2012-05-28

	@SuppressWarnings("unchecked")
	public static <T> T readObject(File src) throws IOException, ClassNotFoundException {
		ObjectInputStream ois = null;
		try {
			ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(src)));
			return (T) ois.readObject();
		} finally {
			IOUtils.close(ois);
		}
	}


	public static void writeObject(Object obj, File file) throws FileNotFoundException, IOException {
		if (obj == null)
			throw new NullPointerException("obj are null.");
		ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(
				new FileOutputStream(file)));
		try {
			oos.writeObject(obj);
		} finally {
			IOUtils.close(oos);
		}
	}

1 楼 hj01kkk 2012-05-28

你的这个类在哪里定义的?bluechip.io.SerializeUtils,方便提供下不?

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

用Java实现Bloom Filter

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

用Java实现Bloom Filter

评论

发表评论

相关推荐

最近访客更多访客>>