日韩2页,国产精品一区在线免费观看,欧日韩一区二区三区

布隆過濾器是可以用于判斷一個(gè)元素是不是在一個(gè)集合里，并且相比于其它的數(shù)據(jù)結(jié)構(gòu)，布隆過濾器在空間和時(shí)間方面都有巨大的優(yōu)勢。布隆過濾器存儲空間和插入/查詢時(shí)間都是常數(shù)。但是它也是擁有一定的缺點(diǎn)：布隆過濾器是有一定的誤識別率以及刪除困難的。本文中給出的布隆過濾器的實(shí)現(xiàn)，基本滿足了日常使用所需要的功能。

先簡單來說一下布隆過濾器。其實(shí)現(xiàn)方法就是：利用內(nèi)存中一個(gè)長度為m的位數(shù)組b并初始化里面的所有位都為0，如下面的表格所示：

然后我們根據(jù)h個(gè)不同的散列函數(shù)，對傳進(jìn)來的字符串進(jìn)行散列，并且每次的散列結(jié)果都不能大于位數(shù)組的長度。布隆過濾器的誤判率取決于你使用多少個(gè)不同的散列函數(shù)，下面給出的代碼中，給出了一些參考的誤判率（參考代碼中的枚舉類：misjudgmentrate）。現(xiàn)在我們先假定有4個(gè)不同散列函數(shù)，傳入一個(gè)字符串并進(jìn)行一次插入操作，這時(shí)會(huì)進(jìn)行4次散列，假設(shè)到了4個(gè)不同的下標(biāo)，這個(gè)時(shí)候我們就會(huì)去數(shù)組中，將這些下標(biāo)的位置置為1，數(shù)組變更為：

如果接下來我們再傳入同一個(gè)字符串時(shí)，因?yàn)?次的散列結(jié)果都是跟上一次一樣的，所以會(huì)得出跟上面一樣的結(jié)果，所有應(yīng)該置1的位都已經(jīng)置1了，這個(gè)時(shí)候我們就可以認(rèn)為這個(gè)字符串是已經(jīng)存在的了。因此不難發(fā)現(xiàn)，這是會(huì)存在一定的誤判率的，具體由你采用的散列函數(shù)質(zhì)量，以及散列函數(shù)的數(shù)量確定。

代碼如下：

				?

									import java.io.fileinputstream;

									import java.io.fileoutputstream;

									import java.io.objectinputstream;

									import java.io.objectoutputstream;

									import java.io.serializable;

									import java.util.bitset;

									import java.util.concurrent.atomic.atomicinteger;

									public class bloomfileter implements serializable {

									 private static final long serialversionuid = -5221305273707291280l;

									 private final int[] seeds;

									 private final int size;

									 private final bitset notebook;

									 private final misjudgmentrate rate;

									 private final atomicinteger usecount = new atomicinteger(0);

									 private final double autoclearrate;

									 /**

									 * 默認(rèn)中等程序的誤判率：misjudgmentrate.middle 以及不自動(dòng)清空數(shù)據(jù)（性能會(huì)有少許提升）

									 * 

									 * @param datacount

									 *      預(yù)期處理的數(shù)據(jù)規(guī)模，如預(yù)期用于處理1百萬數(shù)據(jù)的查重，這里則填寫1000000

									 */

									 public bloomfileter(int datacount) {

									 this(misjudgmentrate.middle, datacount, null);

									 }

									 /**

									 * 

									 * @param rate

									 *      一個(gè)枚舉類型的誤判率

									 * @param datacount

									 *      預(yù)期處理的數(shù)據(jù)規(guī)模，如預(yù)期用于處理1百萬數(shù)據(jù)的查重，這里則填寫1000000

									 * @param autoclearrate

									 *      自動(dòng)清空過濾器內(nèi)部信息的使用比率，傳null則表示不會(huì)自動(dòng)清理，

									 *      當(dāng)過濾器使用率達(dá)到100%時(shí)，則無論傳入什么數(shù)據(jù)，都會(huì)認(rèn)為在數(shù)據(jù)已經(jīng)存在了

									 *      當(dāng)希望過濾器使用率達(dá)到80%時(shí)自動(dòng)清空重新使用，則傳入0.8

									 */

									 public bloomfileter(misjudgmentrate rate, int datacount, double autoclearrate) {

									 long bitsize = rate.seeds.length * datacount;

									 if (bitsize < 0 || bitsize > integer.max_value) {

									  throw new runtimeexception("位數(shù)太大溢出了，請降低誤判率或者降低數(shù)據(jù)大小");

									 }

									 this.rate = rate;

									 seeds = rate.seeds;

									 size = (int) bitsize;

									 notebook = new bitset(size);

									 this.autoclearrate = autoclearrate;

									 }

									 public void add(string data) {

									 checkneedclear();

									 for (int i = 0; i < seeds.length; i++) {

									  int index = hash(data, seeds[i]);

									  settrue(index);

									 }

									 }

									 public boolean check(string data) {

									 for (int i = 0; i < seeds.length; i++) {

									  int index = hash(data, seeds[i]);

									  if (!notebook.get(index)) {

									  return false;

									  }

									 }

									 return true;

									 }

									 /**

									 * 如果不存在就進(jìn)行記錄并返回false，如果存在了就返回true

									 * 

									 * @param data

									 * @return

									 */

									 public boolean addifnotexist(string data) {

									 checkneedclear();

									 int[] indexs = new int[seeds.length];

									 // 先假定存在

									 boolean exist = true;

									 int index;

									 for (int i = 0; i < seeds.length; i++) {

									  indexs[i] = index = hash(data, seeds[i]);

									  if (exist) {

									  if (!notebook.get(index)) {

									   // 只要有一個(gè)不存在，就可以認(rèn)為整個(gè)字符串都是第一次出現(xiàn)的

									   exist = false;

									   // 補(bǔ)充之前的信息

									   for (int j = 0; j <= i; j++) {

									   settrue(indexs[j]);

									   }

									  }

									  } else {

									  settrue(index);

									  }

									 }

									 return exist;

									 }

									 private void checkneedclear() {

									 if (autoclearrate != null) {

									  if (getuserate() >= autoclearrate) {

									  synchronized (this) {

									   if (getuserate() >= autoclearrate) {

									   notebook.clear();

									   usecount.set(0);

									   }

									  }

									  }

									 }

									 }

									 public void settrue(int index) {

									 usecount.incrementandget();

									 notebook.set(index, true);

									 }

									 private int hash(string data, int seeds) {

									 char[] value = data.tochararray();

									 int hash = 0;

									 if (value.length > 0) {

									  for (int i = 0; i < value.length; i++) {

									  hash = i * hash + value[i];

									  }

									 }

									 hash = hash * seeds % size;

									 // 防止溢出變成負(fù)數(shù)

									 return math.abs(hash);

									 }

									 public double getuserate() {

									 return (double) usecount.intvalue() / (double) size;

									 }

									 public void savefiltertofile(string path) {

									 try (objectoutputstream oos = new objectoutputstream(new fileoutputstream(path))) {

									  oos.writeobject(this);

									 } catch (exception e) {

									  throw new runtimeexception(e);

									 }

									 }

									 public static bloomfileter readfilterfromfile(string path) {

									 try (objectinputstream ois = new objectinputstream(new fileinputstream(path))) {

									  return (bloomfileter) ois.readobject();

									 } catch (exception e) {

									  throw new runtimeexception(e);

									 }

									 }

									 /**

									 * 清空過濾器中的記錄信息

									 */

									 public void clear() {

									 usecount.set(0);

									 notebook.clear();

									 }

									 public misjudgmentrate getrate() {

									 return rate;

									 }

									 /**

									 * 分配的位數(shù)越多，誤判率越低但是越占內(nèi)存

									 * 

									 * 4個(gè)位誤判率大概是0.14689159766308

									 * 

									 * 8個(gè)位誤判率大概是0.02157714146322

									 * 

									 * 16個(gè)位誤判率大概是0.00046557303372

									 * 

									 * 32個(gè)位誤判率大概是0.00000021167340

									 * 

									 * @author lianghaohui

									 *

									 */

									 public enum misjudgmentrate {

									 // 這里要選取質(zhì)數(shù)，能很好的降低錯(cuò)誤率

									 /**

									  * 每個(gè)字符串分配4個(gè)位

									  */

									 very_small(new int[] { 2, 3, 5, 7 }),

									 /**

									  * 每個(gè)字符串分配8個(gè)位

									  */

									 small(new int[] { 2, 3, 5, 7, 11, 13, 17, 19 }), //

									 /**

									  * 每個(gè)字符串分配16個(gè)位

									  */

									 middle(new int[] { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53 }), //

									 /**

									  * 每個(gè)字符串分配32個(gè)位

									  */

									 high(new int[] { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97,

									  101, 103, 107, 109, 113, 127, 131 });

									 private int[] seeds;

									 private misjudgmentrate(int[] seeds) {

									  this.seeds = seeds;

									 }

									 public int[] getseeds() {

									  return seeds;

									 }

									 public void setseeds(int[] seeds) {

									  this.seeds = seeds;

									 }

									 }

									 public static void main(string[] args) {

									 bloomfileter fileter = new bloomfileter(7);

									 system.out.println(fileter.addifnotexist("1111111111111"));

									 system.out.println(fileter.addifnotexist("2222222222222222"));

									 system.out.println(fileter.addifnotexist("3333333333333333"));

									 system.out.println(fileter.addifnotexist("444444444444444"));

									 system.out.println(fileter.addifnotexist("5555555555555"));

									 system.out.println(fileter.addifnotexist("6666666666666"));

									 system.out.println(fileter.addifnotexist("1111111111111"));

									 fileter.savefiltertofile("c:\\users\\john\\desktop\\1111\\11.obj");

									 fileter = readfilterfromfile("c:\\users\\john\\desktop\\111\\11.obj");

									 system.out.println(fileter.getuserate());

									 system.out.println(fileter.addifnotexist("1111111111111"));

									 }

									}