diff --git a/extension/bloomfilter.js b/extension/bloomfilter.js new file mode 100644 index 0000000..323cb8e --- /dev/null +++ b/extension/bloomfilter.js @@ -0,0 +1,115 @@ +(function(exports) { + exports.BloomFilter = BloomFilter; + exports.fnv_1a = fnv_1a; + + var typedArrays = typeof ArrayBuffer !== "undefined"; + + // Creates a new bloom filter. If *m* is an array-like object, with a length + // property, then the bloom filter is loaded with data from the array, where + // each element is a 32-bit integer. Otherwise, *m* should specify the + // number of bits. Note that *m* is rounded up to the nearest multiple of + // 32. *k* specifies the number of hashing functions. + function BloomFilter(m, k) { + var a; + if (typeof m !== "number") a = m, m = a.length * 32; + + var n = Math.ceil(m / 32), + i = -1; + this.m = m = n * 32; + this.k = k; + + if (typedArrays) { + var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2), + array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array, + kbuffer = new ArrayBuffer(kbytes * k), + buckets = this.buckets = new Int32Array(n); + if (a) while (++i < n) buckets[i] = a[i]; + this._locations = new array(kbuffer); + } else { + var buckets = this.buckets = []; + if (a) while (++i < n) buckets[i] = a[i]; + else while (++i < n) buckets[i] = 0; + this._locations = []; + } + } + + // See http://willwhim.wpengine.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/ + BloomFilter.prototype.locations = function(v) { + var k = this.k, + m = this.m, + r = this._locations, + a = fnv_1a(v), + b = fnv_1a(v, 1576284489), // The seed value is chosen randomly + x = a % m; + for (var i = 0; i < k; ++i) { + r[i] = x < 0 ? (x + m) : x; + x = (x + b) % m; + } + return r; + }; + + BloomFilter.prototype.add = function(v) { + var l = this.locations(v + ""), + k = this.k, + buckets = this.buckets; + for (var i = 0; i < k; ++i) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32); + }; + + BloomFilter.prototype.test = function(v) { + var l = this.locations(v + ""), + k = this.k, + buckets = this.buckets; + for (var i = 0; i < k; ++i) { + var b = l[i]; + if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) { + return false; + } + } + return true; + }; + + // Estimated cardinality. + BloomFilter.prototype.size = function() { + var buckets = this.buckets, + bits = 0; + for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]); + return -this.m * Math.log(1 - bits / this.m) / this.k; + }; + + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + function popcnt(v) { + v -= (v >> 1) & 0x55555555; + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xf0f0f0f) * 0x1010101) >> 24; + } + + // Fowler/Noll/Vo hashing. + // Nonstandard variation: this function optionally takes a seed value that is incorporated + // into the offset basis. According to http://www.isthe.com/chongo/tech/comp/fnv/index.html + // "almost any offset_basis will serve so long as it is non-zero". + function fnv_1a(v, seed) { + var a = 2166136261 ^ (seed || 0); + for (var i = 0, n = v.length; i < n; ++i) { + var c = v.charCodeAt(i), + d = c & 0xff00; + if (d) a = fnv_multiply(a ^ d >> 8); + a = fnv_multiply(a ^ c & 0xff); + } + return fnv_mix(a); + } + + // a * 16777619 mod 2**32 + function fnv_multiply(a) { + return a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); + } + + // See https://web.archive.org/web/20131019013225/http://home.comcast.net/~bretm/hash/6.html + function fnv_mix(a) { + a += a << 13; + a ^= a >>> 7; + a += a << 3; + a ^= a >>> 17; + a += a << 5; + return a & 0xffffffff; + } +})(typeof exports !== "undefined" ? exports : this);