Skip to content

Commit

Permalink
feat: add bloom filter (#2507)
Browse files Browse the repository at this point in the history
Adds a bloom filter implementation to `@libp2p/utils` for use in
libp2p components.
  • Loading branch information
achingbrain authored Apr 26, 2024
1 parent 998fcaf commit e1923b0
Show file tree
Hide file tree
Showing 3 changed files with 350 additions and 3 deletions.
13 changes: 10 additions & 3 deletions packages/utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
"types": "./dist/src/array-equals.d.ts",
"import": "./dist/src/array-equals.js"
},
"./bloom-filter": {
"types": "./dist/src/bloom-filter.d.ts",
"import": "./dist/src/bloom-filter.js"
},
"./close-source": {
"types": "./dist/src/close-source.d.ts",
"import": "./dist/src/close-source.js"
Expand Down Expand Up @@ -128,20 +132,24 @@
},
"dependencies": {
"@chainsafe/is-ip": "^2.0.2",
"@libp2p/crypto": "^4.1.0",
"@libp2p/interface": "^1.3.0",
"@libp2p/logger": "^4.0.11",
"@multiformats/multiaddr": "^12.2.1",
"@multiformats/multiaddr-matcher": "^1.2.0",
"@types/murmurhash3js-revisited": "^3.0.3",
"delay": "^6.0.0",
"get-iterator": "^2.0.1",
"is-loopback-addr": "^2.0.2",
"it-pushable": "^3.2.3",
"it-stream-types": "^2.0.1",
"murmurhash3js-revisited": "^3.0.0",
"netmask": "^2.0.2",
"p-defer": "^4.0.1",
"race-event": "^1.2.0",
"race-signal": "^1.0.2",
"uint8arraylist": "^2.4.8"
"uint8arraylist": "^2.4.8",
"uint8arrays": "^5.0.3"
},
"devDependencies": {
"@libp2p/peer-id-factory": "^4.1.0",
Expand All @@ -153,8 +161,7 @@
"it-pair": "^2.0.6",
"it-pipe": "^3.0.1",
"sinon": "^17.0.1",
"sinon-ts": "^2.0.0",
"uint8arrays": "^5.0.3"
"sinon-ts": "^2.0.0"
},
"sideEffects": false
}
141 changes: 141 additions & 0 deletions packages/utils/src/bloom-filter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// ported from xxbloom - https://github.com/ceejbot/xxbloom/blob/master/LICENSE
import { randomBytes } from '@libp2p/crypto'
import mur from 'murmurhash3js-revisited'
import { Uint8ArrayList } from 'uint8arraylist'
import { alloc } from 'uint8arrays/alloc'
import { fromString as uint8ArrayFromString } from 'uint8arrays/from-string'

const LN2_SQUARED = Math.LN2 * Math.LN2

/**
 * Options accepted by the `BloomFilter` constructor. Every field is optional.
 */
export interface BloomFilterOptions {
/**
 * Seeds for the hash function - the filter computes one hash per seed.
 * When supplied these are used as-is and `hashes` is ignored.
 */
seeds?: number[]
/**
 * How many random seeds to generate when `seeds` is not supplied.
 * Defaults to 8.
 */
hashes?: number
/**
 * The number of bits in the filter. Defaults to 1024.
 */
bits?: number
}

/**
 * A bloom filter backed by a `Uint8Array` bit field. Items are hashed once per
 * seed with murmur3 (x86, 32 bit) and each hash is mapped onto one bit of the
 * field.
 */
export class BloomFilter {
  /**
   * Create a `BloomFilter` with the smallest `bits` and `hashes` value for the
   * specified item count and error rate.
   *
   * @param itemcount - how many items the filter is expected to hold
   * @param errorRate - the acceptable false-positive probability
   */
  static create (itemcount: number, errorRate: number = 0.005): BloomFilter {
    const opts = optimize(itemcount, errorRate)
    return new BloomFilter(opts)
  }

  /** Hash seeds - one hash is computed per seed for every item */
  public readonly seeds: number[]
  /** The number of bits hashes are mapped onto */
  public readonly bits: number
  /** The bit field - bit `n` is stored in byte `floor(n / 8)` at position `n % 8`, counting from the least significant bit */
  public buffer: Uint8Array

  /**
   * @param options - see `BloomFilterOptions`; passed-in `seeds` take
   * precedence over `hashes`
   */
  constructor (options: BloomFilterOptions = {}) {
    this.seeds = options.seeds ?? generateSeeds(options.hashes ?? 8)
    this.bits = options.bits ?? 1024
    // one byte holds eight bits, round up so a partial byte is still allocated
    this.buffer = alloc(Math.ceil(this.bits / 8))
  }

  /**
   * Add an item to the filter
   */
  add (item: Uint8Array | string): void {
    const bytes = typeof item === 'string' ? uint8ArrayFromString(item) : item

    for (const seed of this.seeds) {
      this.setbit(mur.x86.hash32(bytes, seed) % this.bits)
    }
  }

  /**
   * Test if the filter has an item. If it returns false it definitely does not
   * have the item. If it returns true, it probably has the item but there's
   * an `errorRate` chance it doesn't.
   */
  has (item: Uint8Array | string): boolean {
    const bytes = typeof item === 'string' ? uint8ArrayFromString(item) : item

    for (const seed of this.seeds) {
      // a single unset bit proves the item was never added
      if (!this.getbit(mur.x86.hash32(bytes, seed) % this.bits)) {
        return false
      }
    }

    return true
  }

  /**
   * Reset the filter
   */
  clear (): void {
    this.buffer.fill(0)
  }

  /**
   * Set the bit at the passed index.
   *
   * The byte and the offset within it are computed arithmetically so the cost
   * does not grow with the size of the filter.
   */
  setbit (bit: number): void {
    this.buffer[Math.floor(bit / 8)] |= (0x1 << (bit % 8))
  }

  /**
   * Returns true if the bit at the passed index is set
   */
  getbit (bit: number): boolean {
    return (this.buffer[Math.floor(bit / 8)] & (0x1 << (bit % 8))) !== 0
  }
}

/**
 * Calculate the smallest number of bits and hash functions that hold
 * `itemcount` items with a false-positive probability of `errorRate`.
 *
 * Both values are clamped to a minimum of 1: a filter with no hash functions
 * would report every item as present, and one with no bits cannot store
 * anything.
 *
 * @param itemcount - how many items the filter is expected to hold
 * @param errorRate - the acceptable false-positive probability
 */
function optimize (itemcount: number, errorRate: number = 0.005): { bits: number, hashes: number } {
  const bits = Math.max(1, Math.round(-1 * itemcount * Math.log(errorRate) / LN2_SQUARED))
  const ratio = (bits / itemcount) * Math.LN2
  // the ratio is not finite when itemcount is 0 or bits is infinite - fall
  // back to a single hash instead of passing NaN/Infinity on as a seed count
  const hashes = Number.isFinite(ratio) ? Math.max(1, Math.round(ratio)) : 1

  return { bits, hashes }
}

/**
 * Generate `count` distinct random seeds, each read as a little-endian
 * unsigned 32 bit integer from four random bytes.
 */
function generateSeeds (count: number): number[] {
  const seeds: number[] = []

  while (seeds.length < count) {
    const candidate = new Uint8ArrayList(randomBytes(4)).getUint32(0, true)

    // Drawing the same value twice is unlikely but possible - only keep a
    // candidate that has not been seen yet, otherwise draw again.
    if (!seeds.includes(candidate)) {
      seeds.push(candidate)
    }
  }

  return seeds
}
199 changes: 199 additions & 0 deletions packages/utils/test/bloom-filter.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
// ported from xxbloom - https://github.com/ceejbot/xxbloom/blob/master/LICENSE
import { expect } from 'aegir/chai'
import { fromString as uint8ArrayFromString } from 'uint8arrays/from-string'
import { BloomFilter } from '../src/bloom-filter.js'

/**
 * Returns 1 if any byte of the passed buffer is non-zero, otherwise 0
 */
function hasBitsSet (buffer: Uint8Array): number {
  return buffer.some(byte => byte !== 0) ? 1 : 0
}

describe('bloom-filter', () => {
it('constructs a filter of the requested size', () => {
  const { seeds, bits, buffer } = new BloomFilter({ hashes: 4, bits: 32 })

  expect(seeds).to.have.lengthOf(4)
  expect(bits).to.equal(32)
  expect(buffer).to.be.an.instanceOf(Uint8Array)
})

it('zeroes out its storage buffer', () => {
  const { buffer } = new BloomFilter({ hashes: 3, bits: 64 })

  // every byte of a freshly created filter must be empty
  for (const byte of buffer) {
    expect(byte).to.equal(0)
  }
})

it('uses passed-in seeds if provided', () => {
  const { seeds } = new BloomFilter({ bits: 256, seeds: [1, 2, 3, 4, 5] })

  expect(seeds.length).to.equal(5)
  expect(seeds[0]).to.equal(1)
  expect(seeds[4]).to.equal(5)
})

// The static factory is called `create()` - the titles are named after it so
// a failing test points at a method that exists.
describe('create()', () => {
  it('creates a filter with good defaults', () => {
    // sizes calculated for the default error rate
    let filter = BloomFilter.create(95)
    expect(filter.bits).to.equal(1048)
    expect(filter.seeds.length).to.equal(8)

    filter = BloomFilter.create(148)
    expect(filter.bits).to.equal(1632)
    expect(filter.seeds.length).to.equal(8)

    filter = BloomFilter.create(10)
    expect(filter.bits).to.equal(110)
    expect(filter.seeds.length).to.equal(8)
  })

  it('create() lets you specify an error rate', () => {
    let filter = BloomFilter.create(20000)
    expect(filter.bits).to.equal(220555)
    const previous = filter.bits

    // tolerating more false positives needs fewer bits
    filter = BloomFilter.create(20000, 0.2)
    expect(filter.bits).to.be.below(previous)
  })
})

describe('setbit() and getbit()', () => {
  it('sets the specified bit', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 16 })

    filter.setbit(0)
    expect(filter.getbit(0)).to.equal(true)

    filter.setbit(1)
    expect(filter.getbit(1)).to.equal(true)

    // bit 2 was never set
    expect(filter.getbit(2)).to.equal(false)

    filter.setbit(10)
    expect(filter.getbit(10)).to.equal(true)
  })

  it('can set all bits', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 16 })
    expect(filter.buffer.length).to.equal(2)

    for (let bit = 0; bit < 16; bit++) {
      filter.setbit(bit)
    }

    // both bytes should now be completely full
    for (const index of [0, 1]) {
      expect(filter.buffer[index]).to.equal(255)
    }
  })

  it('slides over into the next buffer slice when setting bits', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 64 })

    // [bit to set, byte it lands in, expected value of that byte]
    const cases = [
      [8, 1, 1],
      [17, 2, 2],
      [34, 4, 4]
    ]

    for (const [bit, byteIndex, expected] of cases) {
      filter.setbit(bit)
      expect(filter.buffer[byteIndex]).to.equal(expected)
    }
  })
})

describe('add()', () => {
  it('can store buffers', () => {
    const filter = new BloomFilter({ hashes: 4, bits: 128 })
    expect(hasBitsSet(filter.buffer)).to.equal(0)

    filter.add(uint8ArrayFromString('cat'))
    expect(hasBitsSet(filter.buffer)).to.equal(1)
  })

  it('can store strings', () => {
    const filter = new BloomFilter({ hashes: 4, bits: 128 })

    filter.add('cat')
    expect(hasBitsSet(filter.buffer)).to.equal(1)
  })

  it('can add a hundred random items', () => {
    const alpha = '0123456789abcdefghijklmnopqrstuvwxyz'

    // builds a word of up to 20 random characters unless a length is given
    function randomWord (length?: number): string {
      const size = length ?? Math.ceil(Math.random() * 20)

      return Array.from({ length: size }, () => alpha[Math.floor(Math.random() * alpha.length)]).join('')
    }

    const filter = BloomFilter.create(100)
    const words: string[] = []

    while (words.length < 100) {
      const word = randomWord()
      words.push(word)
      filter.add(word)
    }

    // a bloom filter never yields false negatives
    for (const word of words) {
      expect(filter.has(word)).to.equal(true)
    }
  })
})

describe('has()', () => {
  it('returns true when called on a stored item', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 16 })
    filter.add('cat')

    expect(hasBitsSet(filter.buffer)).to.equal(1)
    expect(filter.has('cat')).to.be.true()
  })

  // NOTE: no seeds are passed so the filter generates random ones - the
  // negative assertions below can fail on a (rare) false positive.
  it('returns false for items not in the set (mostly)', () => {
    const filter = new BloomFilter({ hashes: 4, bits: 50 })
    filter.add('cat')

    expect(filter.has('dog')).to.be.false()
  })

  it('responds appropriately for arrays of added items', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 128 })
    const stored = ['cat', 'dog', 'wallaby']

    for (const word of stored) {
      filter.add(word)
    }

    for (const word of stored) {
      expect(filter.has(word)).to.equal(true)
    }

    expect(filter.has('orange')).to.equal(false)
  })
})

describe('clear()', () => {
  it('clears the filter', () => {
    const filter = new BloomFilter({ hashes: 3, bits: 128 })

    for (const word of ['cat', 'dog', 'wallaby']) {
      filter.add(word)
    }

    // adding items must have set at least one bit
    expect(hasBitsSet(filter.buffer)).to.equal(1)

    filter.clear()

    // and clearing must have unset all of them
    expect(hasBitsSet(filter.buffer)).to.equal(0)
  })
})
})

0 comments on commit e1923b0

Please sign in to comment.