2018-11-15 13:23:26 -08:00
|
|
|
//! Simple Bloom Filter
|
|
|
|
use bv::BitVec;
|
2019-01-17 18:22:21 -08:00
|
|
|
use fnv::FnvHasher;
|
2018-11-15 13:23:26 -08:00
|
|
|
use rand::{self, Rng};
|
2019-02-18 22:26:22 -08:00
|
|
|
use serde::{Deserialize, Serialize};
|
2018-11-15 13:23:26 -08:00
|
|
|
use std::cmp;
|
2019-01-17 18:22:21 -08:00
|
|
|
use std::hash::Hasher;
|
2018-11-15 13:23:26 -08:00
|
|
|
use std::marker::PhantomData;
|
|
|
|
|
2019-01-15 13:56:54 -08:00
|
|
|
/// Source of the per-key hashes a Bloom filter probes with.
///
/// `hash_at_index` must return the same value every time it is called with the
/// same `self` and the same `hash_index`. Implementations should make the
/// hashes produced for different `hash_index` values as distinct as they
/// reasonably can.
pub trait BloomHashIndex {
    /// Returns the hash of `self` selected by `hash_index`.
    fn hash_at_index(&self, hash_index: u64) -> u64;
}
|
|
|
|
|
2018-11-15 13:23:26 -08:00
|
|
|
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)]
|
|
|
|
pub struct Bloom<T: BloomHashIndex> {
|
|
|
|
pub keys: Vec<u64>,
|
2019-05-30 21:31:35 -07:00
|
|
|
pub bits: BitVec<u64>,
|
|
|
|
num_bits_set: u64,
|
2018-11-15 13:23:26 -08:00
|
|
|
_phantom: PhantomData<T>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<T: BloomHashIndex> Bloom<T> {
|
2019-01-15 13:56:54 -08:00
|
|
|
pub fn new(num_bits: usize, keys: Vec<u64>) -> Self {
|
|
|
|
let bits = BitVec::new_fill(false, num_bits as u64);
|
|
|
|
Bloom {
|
|
|
|
keys,
|
|
|
|
bits,
|
2019-05-30 21:31:35 -07:00
|
|
|
num_bits_set: 0,
|
2019-02-09 09:20:43 -08:00
|
|
|
_phantom: PhantomData::default(),
|
2019-01-15 13:56:54 -08:00
|
|
|
}
|
|
|
|
}
|
2018-11-15 13:23:26 -08:00
|
|
|
/// create filter optimal for num size given the `false_rate`
|
|
|
|
/// the keys are randomized for picking data out of a collision resistant hash of size
|
|
|
|
/// `keysize` bytes
|
|
|
|
/// https://hur.st/bloomfilter/
|
|
|
|
pub fn random(num: usize, false_rate: f64, max_bits: usize) -> Self {
|
|
|
|
let min_num_bits = ((num as f64 * false_rate.log(2f64))
|
2018-12-07 19:01:28 -08:00
|
|
|
/ (1f64 / 2f64.powf(2f64.log(2f64))).log(2f64))
|
|
|
|
.ceil() as usize;
|
2018-11-15 13:23:26 -08:00
|
|
|
let num_bits = cmp::max(1, cmp::min(min_num_bits, max_bits));
|
|
|
|
let num_keys = ((num_bits as f64 / num as f64) * 2f64.log(2f64)).round() as usize;
|
|
|
|
let keys: Vec<u64> = (0..num_keys).map(|_| rand::thread_rng().gen()).collect();
|
2019-01-15 13:56:54 -08:00
|
|
|
Self::new(num_bits, keys)
|
2018-11-15 13:23:26 -08:00
|
|
|
}
|
|
|
|
fn pos(&self, key: &T, k: u64) -> u64 {
|
2019-01-15 13:56:54 -08:00
|
|
|
key.hash_at_index(k) % self.bits.len()
|
|
|
|
}
|
|
|
|
pub fn clear(&mut self) {
|
2019-01-31 06:53:52 -08:00
|
|
|
self.bits = BitVec::new_fill(false, self.bits.len());
|
2019-05-30 21:31:35 -07:00
|
|
|
self.num_bits_set = 0;
|
2018-11-15 13:23:26 -08:00
|
|
|
}
|
|
|
|
pub fn add(&mut self, key: &T) {
|
|
|
|
for k in &self.keys {
|
|
|
|
let pos = self.pos(key, *k);
|
2019-05-30 21:31:35 -07:00
|
|
|
if !self.bits.get(pos) {
|
|
|
|
self.num_bits_set += 1;
|
|
|
|
self.bits.set(pos, true);
|
|
|
|
}
|
2018-11-15 13:23:26 -08:00
|
|
|
}
|
|
|
|
}
|
2019-01-31 06:53:52 -08:00
|
|
|
pub fn contains(&self, key: &T) -> bool {
|
2018-11-15 13:23:26 -08:00
|
|
|
for k in &self.keys {
|
|
|
|
let pos = self.pos(key, *k);
|
|
|
|
if !self.bits.get(pos) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-01-15 13:56:54 -08:00
|
|
|
fn slice_hash(slice: &[u8], hash_index: u64) -> u64 {
|
2019-01-17 18:22:21 -08:00
|
|
|
let mut hasher = FnvHasher::with_key(hash_index);
|
|
|
|
hasher.write(slice);
|
|
|
|
hasher.finish()
|
2019-01-15 13:56:54 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<T: AsRef<[u8]>> BloomHashIndex for T {
|
|
|
|
fn hash_at_index(&self, hash_index: u64) -> u64 {
|
|
|
|
slice_hash(self.as_ref(), hash_index)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-15 13:23:26 -08:00
|
|
|
#[cfg(test)]
mod test {
    use super::*;
    use solana_sdk::hash::{hash, Hash};

    /// Checks the key and bit counts `Bloom::random` picks for an empty, a
    /// comfortably sized, and a bit-capped filter.
    #[test]
    fn test_bloom_filter() {
        // empty: no expected items
        let empty: Bloom<Hash> = Bloom::random(0, 0.1, 100);
        assert_eq!(empty.keys.len(), 0);
        assert_eq!(empty.bits.len(), 1);

        // normal: the computed size fits under `max_bits`
        let normal: Bloom<Hash> = Bloom::random(10, 0.1, 100);
        assert_eq!(normal.keys.len(), 3);
        assert_eq!(normal.bits.len(), 34);

        // saturated: the computed size is capped at `max_bits`
        let saturated: Bloom<Hash> = Bloom::random(100, 0.1, 100);
        assert_eq!(saturated.keys.len(), 1);
        assert_eq!(saturated.bits.len(), 100);
    }

    /// An item is reported absent before it is added and present afterwards.
    #[test]
    fn test_add_contains() {
        let mut bloom: Bloom<Hash> = Bloom::random(100, 0.1, 100);
        // known keys to avoid false positives in the test
        bloom.keys = vec![0, 1, 2, 3];

        let inputs: [&[u8]; 2] = [b"hello", b"world"];
        for input in inputs.iter() {
            let key = hash(input);
            assert!(!bloom.contains(&key));
            bloom.add(&key);
            assert!(bloom.contains(&key));
        }
    }

    /// Two independently built filters should not end up with the same keys.
    #[test]
    fn test_random() {
        let mut first: Bloom<Hash> = Bloom::random(10, 0.1, 100);
        let mut second: Bloom<Hash> = Bloom::random(10, 0.1, 100);
        first.keys.sort();
        second.keys.sort();
        assert_ne!(first.keys, second.keys);
    }
}
|