const LN2_SQUARED = Math.LN2 * Math.LN2

/**
 * Settings accepted by the `BloomFilter` constructor
 */
export interface BloomFilterOptions {
  /**
   * Seeds for the murmur3 hash, one per hash function. When present they are
   * used as supplied and `hashes` is ignored.
   */
  seeds?: number[]

  /**
   * How many random seeds to generate when `seeds` is omitted
   *
   * @default 8
   */
  hashes?: number

  /**
   * Size of the filter in bits, must be a positive integer
   *
   * @default 1024
   */
  bits?: number
}

export class BloomFilter {
  /**
   * Create a `BloomFilter` with the smallest `bits` and `hashes` value for the
   * specified item count and error rate.
   *
   * @throws {RangeError} if `itemcount` is negative or not finite, or if
   * `errorRate` is not strictly between 0 and 1
   */
  static create (itemcount: number, errorRate: number = 0.005): BloomFilter {
    return new BloomFilter(optimize(itemcount, errorRate))
  }

  public readonly seeds: number[]
  public readonly bits: number
  public buffer: Uint8Array

  /**
   * @throws {RangeError} if `options.bits` is not a positive integer
   */
  constructor (options: BloomFilterOptions = {}) {
    const bits = options.bits ?? 1024

    // `hash % bits` is NaN when bits is 0, which would make `add` a no-op and
    // `has` report false for items that were added, so reject it up front
    if (!Number.isSafeInteger(bits) || bits < 1) {
      throw new RangeError('bits must be a positive integer')
    }

    // copy caller-supplied seeds so later mutation of the passed array cannot
    // change which bits an already-added item maps to
    this.seeds = options.seeds != null ? [...options.seeds] : generateSeeds(options.hashes ?? 8)
    this.bits = bits
    this.buffer = alloc(Math.ceil(bits / 8))
  }

  /**
   * Add an item to the filter
   */
  add (item: Uint8Array | string): void {
    const bytes = typeof item === 'string' ? uint8ArrayFromString(item) : item

    for (const seed of this.seeds) {
      this.setbit(mur.x86.hash32(bytes, seed) % this.bits)
    }
  }

  /**
   * Test if the filter has an item. If it returns false it definitely does not
   * have the item. If it returns true, it probably has the item but there's
   * an `errorRate` chance it doesn't.
   */
  has (item: Uint8Array | string): boolean {
    const bytes = typeof item === 'string' ? uint8ArrayFromString(item) : item

    // `every` stops at the first seed whose bit is unset
    return this.seeds.every(seed => this.getbit(mur.x86.hash32(bytes, seed) % this.bits))
  }

  /**
   * Reset the filter
   */
  clear (): void {
    this.buffer.fill(0)
  }

  /**
   * Set the bit at the given index. Bit `n` lives in byte `floor(n / 8)` at
   * position `n % 8`, counted from the least significant bit. Indexes that
   * fall outside the buffer are ignored.
   */
  setbit (bit: number): void {
    this.buffer[Math.floor(bit / 8)] |= 0x1 << (bit % 8)
  }

  /**
   * Read the bit at the given index, using the same layout as `setbit`.
   * Indexes that fall outside the buffer read as unset.
   */
  getbit (bit: number): boolean {
    return (this.buffer[Math.floor(bit / 8)] & (0x1 << (bit % 8))) !== 0
  }
}

/**
 * Work out the filter size and hash count for the expected number of items
 * and the acceptable false-positive rate:
 *
 * bits = -n * ln(p) / ln(2)^2, hashes = (bits / n) * ln(2)
 *
 * Both results are rounded and clamped to at least 1 so the returned options
 * always describe a usable filter. An `itemcount` below 1 is treated as 1.
 */
function optimize (itemcount: number, errorRate: number = 0.005): { bits: number, hashes: number } {
  if (!Number.isFinite(itemcount) || itemcount < 0) {
    throw new RangeError('itemcount must be a non-negative finite number')
  }

  if (!(errorRate > 0 && errorRate < 1)) {
    throw new RangeError('errorRate must be greater than 0 and less than 1')
  }

  const items = Math.max(1, itemcount)
  const bits = Math.max(1, Math.round(-1 * items * Math.log(errorRate) / LN2_SQUARED))
  // loose error rates round to zero hashes, which would make `has` always true
  const hashes = Math.max(1, Math.round((bits / items) * Math.LN2))

  return { bits, hashes }
}

/**
 * Generate `count` distinct random 32-bit seeds
 */
function generateSeeds (count: number): number[] {
  // a Set discards duplicates, which are unlikely but possible, so keep
  // drawing until enough distinct values have been collected
  const seeds = new Set<number>()

  while (seeds.size < count) {
    seeds.add(new Uint8ArrayList(randomBytes(4)).getUint32(0, true))
  }

  return [...seeds]
}
/**
 * Returns 1 if any byte of the buffer is non-zero, otherwise 0
 */
function hasBitsSet (buffer: Uint8Array): number {
  return buffer.some(byte => byte !== 0) ? 1 : 0
}

describe('bloom-filter', () => {
  it('constructs a filter of the requested size', () => {
    const filter = new BloomFilter({ hashes: 4, bits: 32 })
    expect(filter.seeds).to.have.lengthOf(4)
    expect(filter.bits).to.equal(32)
    expect(filter.buffer).to.be.an.instanceOf(Uint8Array)
  })

  it('zeroes out its storage buffer', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 64 })

    for (const byte of filter.buffer) {
      expect(byte).to.equal(0)
    }
  })

  it('uses passed-in seeds if provided', () => {
    const filter = new BloomFilter({ bits: 256, seeds: [1, 2, 3, 4, 5] })
    expect(filter.seeds.length).to.equal(5)
    expect(filter.seeds[0]).to.equal(1)
    expect(filter.seeds[4]).to.equal(5)
  })

  describe('create()', () => {
    it('creates a filter with good defaults', () => {
      let filter = BloomFilter.create(95)
      expect(filter.bits).to.equal(1048)
      expect(filter.seeds.length).to.equal(8)

      filter = BloomFilter.create(148)
      expect(filter.bits).to.equal(1632)
      expect(filter.seeds.length).to.equal(8)

      filter = BloomFilter.create(10)
      expect(filter.bits).to.equal(110)
      expect(filter.seeds.length).to.equal(8)
    })

    it('create() lets you specify an error rate', () => {
      const strict = BloomFilter.create(20000)
      expect(strict.bits).to.equal(220555)

      // a looser error rate needs fewer bits for the same item count
      const loose = BloomFilter.create(20000, 0.2)
      expect(loose.bits).to.be.below(strict.bits)
    })
  })

  describe('setbit() and getbit()', () => {
    it('sets the specified bit', () => {
      const filter = new BloomFilter({ hashes: 3, bits: 16 })

      filter.setbit(0)
      expect(filter.getbit(0)).to.equal(true)

      filter.setbit(1)
      expect(filter.getbit(1)).to.equal(true)

      expect(filter.getbit(2)).to.equal(false)

      filter.setbit(10)
      expect(filter.getbit(10)).to.equal(true)
    })

    it('can set all bits', () => {
      const filter = new BloomFilter({ hashes: 3, bits: 16 })
      expect(filter.buffer.length).to.equal(2)

      for (let bit = 0; bit < 16; bit++) {
        filter.setbit(bit)
      }

      for (let i = 0; i < 2; i++) {
        expect(filter.buffer[i]).to.equal(255)
      }
    })

    it('slides over into the next buffer slice when setting bits', () => {
      const filter = new BloomFilter({ hashes: 3, bits: 64 })

      filter.setbit(8)
      expect(filter.buffer[1]).to.equal(1)

      filter.setbit(17)
      expect(filter.buffer[2]).to.equal(2)

      filter.setbit(34)
      expect(filter.buffer[4]).to.equal(4)
    })
  })

  describe('add()', () => {
    it('can store buffers', () => {
      const filter = new BloomFilter({ hashes: 4, bits: 128 })

      expect(hasBitsSet(filter.buffer)).to.equal(0)
      filter.add(uint8ArrayFromString('cat'))
      expect(hasBitsSet(filter.buffer)).to.equal(1)
    })

    it('can store strings', () => {
      const filter = new BloomFilter({ hashes: 4, bits: 128 })
      filter.add('cat')

      expect(hasBitsSet(filter.buffer)).to.equal(1)
    })

    it('can add a hundred random items', () => {
      const alpha = '0123456789abcdefghijklmnopqrstuvwxyz'

      function randomWord (length?: number): string {
        const size = length ?? Math.ceil(Math.random() * 20)
        let result = ''

        for (let i = 0; i < size; i++) {
          result += alpha[Math.floor(Math.random() * alpha.length)]
        }

        return result
      }

      const filter = BloomFilter.create(100)
      const words: string[] = []

      for (let i = 0; i < 100; i++) {
        const word = randomWord()
        words.push(word)
        filter.add(word)
      }

      // added items must always be reported as present
      for (const word of words) {
        expect(filter.has(word)).to.equal(true)
      }
    })
  })

  describe('has()', () => {
    it('returns true when called on a stored item', () => {
      const filter = new BloomFilter({ hashes: 3, bits: 16 })
      filter.add('cat')

      expect(hasBitsSet(filter.buffer)).to.equal(1)
      expect(filter.has('cat')).to.be.true()
    })

    it('returns false for items not in the set', () => {
      // fixed seeds keep this negative lookup reproducible from run to run,
      // randomly generated seeds could occasionally yield a false positive
      const filter = new BloomFilter({ seeds: [1, 2, 3, 4], bits: 50 })
      filter.add('cat')
      expect(filter.has('dog')).to.be.false()
    })

    it('responds appropriately for arrays of added items', () => {
      // fixed seeds keep the negative lookup below reproducible
      const filter = new BloomFilter({ seeds: [1, 2, 3], bits: 128 })
      filter.add('cat')
      filter.add('dog')
      filter.add('wallaby')

      expect(filter.has('cat')).to.equal(true)
      expect(filter.has('dog')).to.equal(true)
      expect(filter.has('wallaby')).to.equal(true)
      expect(filter.has('orange')).to.equal(false)
    })
  })

  describe('clear()', () => {
    it('clears the filter', () => {
      const filter = new BloomFilter({ hashes: 3, bits: 128 })
      filter.add('cat')
      filter.add('dog')
      filter.add('wallaby')
      expect(hasBitsSet(filter.buffer)).to.equal(1)

      filter.clear()
      expect(hasBitsSet(filter.buffer)).to.equal(0)
    })
  })
})