solana/runtime/src/bloom.rs

//! Simple Bloom Filter
use bv::BitVec;
use fnv::FnvHasher;
use rand::{self, Rng};
use serde::{Deserialize, Serialize};
use std::cmp;
use std::hash::Hasher;
use std::marker::PhantomData;

/// Produces the hash values a [`Bloom`] filter uses to pick bit positions for an item.
///
/// The filter calls `hash_at_index` once per entry in its `keys` vector, passing
/// that entry as `hash_index`. Implementations must be stable: the same `self`
/// and `hash_index` must always yield the same value, otherwise an item that was
/// added could not be found again.
/// Best effort should be made for different `hash_index` values to give
/// different, independent-looking hashes.
pub trait BloomHashIndex {
    /// Returns the hash of `self` seeded/selected by `hash_index`.
    fn hash_at_index(&self, hash_index: u64) -> u64;
}

/// Bloom filter over items of type `T`.
///
/// Membership answers may be false positives but never false negatives: every
/// bit set by `add` for an item is checked again by `contains`.
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)]
pub struct Bloom<T: BloomHashIndex> {
    /// One hash seed per probe; each entry is passed to
    /// `BloomHashIndex::hash_at_index` as the `hash_index`.
    pub keys: Vec<u64>,
    /// The filter's bit array; its length is the modulus used to map a hash to
    /// a bit position.
    pub bits: BitVec<u64>,
    /// Number of bits currently set in `bits`; bumped when `add` flips a bit
    /// from 0 to 1 and reset to 0 by `clear`.
    num_bits_set: u64,
    /// Ties the filter to its item type without storing a `T`.
    _phantom: PhantomData<T>,
}

impl<T: BloomHashIndex> Bloom<T> {
    /// Creates an empty filter with `num_bits` cleared bits that probes once
    /// per entry of `keys`.
    ///
    /// Note: bit positions are computed as `hash % bits.len()`, so a filter
    /// built with `num_bits == 0` and a non-empty `keys` panics (remainder by
    /// zero) on the first `add` or `contains`.
    pub fn new(num_bits: usize, keys: Vec<u64>) -> Self {
        Bloom {
            keys,
            bits: BitVec::new_fill(false, num_bits as u64),
            num_bits_set: 0,
            _phantom: PhantomData,
        }
    }

    /// Creates an empty filter sized for `num` items at roughly the requested
    /// `false_rate`, with freshly randomized hash keys.
    ///
    /// The bit count is `ceil(-num * log2(false_rate))`, capped at `max_bits`
    /// and never less than 1. The key count is `round(num_bits / num)`, but at
    /// least 1 so that a heavily capped filter still tests something; when
    /// `num == 0` no keys are generated.
    ///
    /// Formulas adapted from https://hur.st/bloomfilter/
    ///
    /// NOTE(review): the expressions below use base-2 logarithms where the
    /// referenced calculator appears to use natural logarithms, which yields
    /// fewer bits/keys than the textbook optimum. The unit tests pin the
    /// current values, so the sizing is intentionally left unchanged here.
    pub fn random(num: usize, false_rate: f64, max_bits: usize) -> Self {
        let min_num_bits = ((num as f64 * false_rate.log(2f64))
            / (1f64 / 2f64.powf(2f64.log(2f64))).log(2f64))
        .ceil() as usize;
        let num_bits = cmp::max(1, cmp::min(min_num_bits, max_bits));

        let num_keys = if num == 0 {
            // `num_bits / 0` is infinite and float-to-int `as` casts saturate,
            // so without this guard we would try to allocate `usize::MAX` keys.
            0
        } else {
            let ideal = ((num_bits as f64 / num as f64) * 2f64.log(2f64)).round() as usize;
            // Zero keys would make `contains` report every item as present.
            cmp::max(1, ideal)
        };

        // Fetch the thread-local generator once instead of once per key.
        let mut rng = rand::thread_rng();
        let keys: Vec<u64> = (0..num_keys).map(|_| rng.gen()).collect();
        Self::new(num_bits, keys)
    }

    /// Maps `key`, hashed with seed `k`, to a bit position in `bits`.
    fn pos(&self, key: &T, k: u64) -> u64 {
        key.hash_at_index(k) % self.bits.len()
    }

    /// Resets every bit to zero while keeping the filter's size and keys.
    pub fn clear(&mut self) {
        let num_bits = self.bits.len();
        self.bits = BitVec::new_fill(false, num_bits);
        self.num_bits_set = 0;
    }

    /// Records `key` in the filter by setting one bit per hash key.
    pub fn add(&mut self, key: &T) {
        for seed in &self.keys {
            let index = self.pos(key, *seed);
            // Only count bits that actually flip so `num_bits_set` stays exact.
            if !self.bits.get(index) {
                self.num_bits_set += 1;
                self.bits.set(index, true);
            }
        }
    }

    /// Returns `false` if `key` was definitely never added, and `true` if it
    /// may have been (every probed bit is set). A filter with no keys reports
    /// `true` for every item.
    pub fn contains(&self, key: &T) -> bool {
        self.keys
            .iter()
            .all(|seed| self.bits.get(self.pos(key, *seed)))
    }
}

/// Hashes `slice` with an FNV hasher whose initial state is `hash_index`, so
/// each `hash_index` selects a different hash function over the same bytes.
fn slice_hash(slice: &[u8], hash_index: u64) -> u64 {
    let mut fnv = FnvHasher::with_key(hash_index);
    fnv.write(slice);
    fnv.finish()
}

/// Anything that can be viewed as a byte slice is hashable by the filter: the
/// bytes are fed to `slice_hash` with `hash_index` as the seed.
impl<T> BloomHashIndex for T
where
    T: AsRef<[u8]>,
{
    fn hash_at_index(&self, hash_index: u64) -> u64 {
        let bytes: &[u8] = self.as_ref();
        slice_hash(bytes, hash_index)
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use solana_sdk::hash::{hash, Hash};

    /// Checks the key/bit sizing chosen by `Bloom::random` for a 10% false
    /// rate and a 100-bit cap.
    #[test]
    fn test_bloom_filter() {
        // (item count, expected number of keys, expected number of bits)
        let cases: [(usize, usize, u64); 3] = [
            (0, 0, 1),     // empty
            (10, 3, 34),   // normal
            (100, 1, 100), // saturated
        ];
        for &(num, expected_keys, expected_bits) in &cases {
            let filter: Bloom<Hash> = Bloom::random(num, 0.1, 100);
            assert_eq!(filter.keys.len(), expected_keys);
            assert_eq!(filter.bits.len(), expected_bits);
        }
    }

    /// An item is absent before `add` and present afterwards.
    #[test]
    fn test_add_contains() {
        let mut filter: Bloom<Hash> = Bloom::random(100, 0.1, 100);
        // Fixed keys keep the test deterministic and free of false positives.
        filter.keys = vec![0, 1, 2, 3];

        let words: [&[u8]; 2] = [b"hello", b"world"];
        for word in words.iter() {
            let digest = hash(*word);
            assert!(!filter.contains(&digest));
            filter.add(&digest);
            assert!(filter.contains(&digest));
        }
    }

    /// Two independently built filters should not end up with the same keys.
    #[test]
    fn test_random() {
        let sorted_keys = || {
            let mut filter: Bloom<Hash> = Bloom::random(10, 0.1, 100);
            filter.keys.sort();
            filter.keys
        };
        let first = sorted_keys();
        let second = sorted_keys();
        assert_ne!(first, second);
    }
}
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`//! Simple Bloom Filter`
			`use bv::BitVec;`
add bloom benchmarking, perf improvement from Fnv ~= 8X (#2477) * add bloom benchmarking, perf improvement from Fnv ~= 8X * have a look at bits.set() * ignore new benches to pacify CI (solana_upload_perf?) 2019-01-17 18:22:21 -08:00			`use fnv::FnvHasher;`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`use rand::{self, Rng};`
Move Bank to its own crate Also: * counters.rs to solana_metrics * genesis_block.rs to solana_sdk 2019-02-18 22:26:22 -08:00			`use serde::{Deserialize, Serialize};`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`use std::cmp;`
add bloom benchmarking, perf improvement from Fnv ~= 8X (#2477) * add bloom benchmarking, perf improvement from Fnv ~= 8X * have a look at bits.set() * ignore new benches to pacify CI (solana_upload_perf?) 2019-01-17 18:22:21 -08:00			`use std::hash::Hasher;`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`use std::marker::PhantomData;`

bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			/// Generate a stable hash of `self` for each `hash_index`
			`/// Best effort can be made for uniqueness of each hash.`
			`pub trait BloomHashIndex {`
			`fn hash_at_index(&self, hash_index: u64) -> u64;`
			`}`

Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)]`
			`pub struct Bloom<T: BloomHashIndex> {`
			`pub keys: Vec<u64>,`
Create bank snapshots (#4244) * Revert "Revert "Create bank snapshots (#3671)" (#4243)" This reverts commit 81fa69d3471977259d62b88cd5b83b32e9e38219. * keep saved and unsaved copies of status cache * fix format check * bench for status cache serialize * misc cleanup * remove appendvec storage on purge * fix accounts restore * cleanup * Pass snapshot path as args * Fix clippy 2019-05-30 21:31:35 -07:00			`pub bits: BitVec<u64>,`
			`num_bits_set: u64,`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`_phantom: PhantomData<T>,`
			`}`

			`impl<T: BloomHashIndex> Bloom<T> {`
bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`pub fn new(num_bits: usize, keys: Vec<u64>) -> Self {`
			`let bits = BitVec::new_fill(false, num_bits as u64);`
			`Bloom {`
			`keys,`
			`bits,`
Create bank snapshots (#4244) * Revert "Revert "Create bank snapshots (#3671)" (#4243)" This reverts commit 81fa69d3471977259d62b88cd5b83b32e9e38219. * keep saved and unsaved copies of status cache * fix format check * bench for status cache serialize * misc cleanup * remove appendvec storage on purge * fix accounts restore * cleanup * Pass snapshot path as args * Fix clippy 2019-05-30 21:31:35 -07:00			`num_bits_set: 0,`
Purge Default::default() 2019-02-09 09:20:43 -08:00			`_phantom: PhantomData::default(),`
bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`}`
			`}`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			/// create filter optimal for num size given the `false_rate`
			`/// the keys are randomized for picking data out of a collision resistant hash of size`
			/// `keysize` bytes
			`/// https://hur.st/bloomfilter/`
			`pub fn random(num: usize, false_rate: f64, max_bits: usize) -> Self {`
			`let min_num_bits = ((num as f64 * false_rate.log(2f64))`
Upgrade to Rust 1.31.0 (#2052) * Upgrade to Rust 1.31.0 * Upgrade nightly * Fix all clippy warnings * Revert relaxed version check and update 2018-12-07 19:01:28 -08:00			`/ (1f64 / 2f64.powf(2f64.log(2f64))).log(2f64))`
			`.ceil() as usize;`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`let num_bits = cmp::max(1, cmp::min(min_num_bits, max_bits));`
			`let num_keys = ((num_bits as f64 / num as f64) * 2f64.log(2f64)).round() as usize;`
			`let keys: Vec<u64> = (0..num_keys).map(\|_\| rand::thread_rng().gen()).collect();`
bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`Self::new(num_bits, keys)`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`}`
			`fn pos(&self, key: &T, k: u64) -> u64 {`
bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`key.hash_at_index(k) % self.bits.len()`
			`}`
			`pub fn clear(&mut self) {`
StatusDeque split into separate objects with their own root checkpoint strategy (#2613) Split up StatusDeque into different modules * LastIdQueue tracks last_ids * StatusCache keeps track of signature statuses * StatusCache stores success as a bit in a bloom filter * Overhead for 1m Ok transactions is 4mb in memory * Less concurrency between the objects, last_id and status_cache are read and written to at different points in the pipeline * Each object has its own strategy for merging into the root checkpoint 2019-01-31 06:53:52 -08:00			`self.bits = BitVec::new_fill(false, self.bits.len());`
Create bank snapshots (#4244) * Revert "Revert "Create bank snapshots (#3671)" (#4243)" This reverts commit 81fa69d3471977259d62b88cd5b83b32e9e38219. * keep saved and unsaved copies of status cache * fix format check * bench for status cache serialize * misc cleanup * remove appendvec storage on purge * fix accounts restore * cleanup * Pass snapshot path as args * Fix clippy 2019-05-30 21:31:35 -07:00			`self.num_bits_set = 0;`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`}`
			`pub fn add(&mut self, key: &T) {`
			`for k in &self.keys {`
			`let pos = self.pos(key, *k);`
Create bank snapshots (#4244) * Revert "Revert "Create bank snapshots (#3671)" (#4243)" This reverts commit 81fa69d3471977259d62b88cd5b83b32e9e38219. * keep saved and unsaved copies of status cache * fix format check * bench for status cache serialize * misc cleanup * remove appendvec storage on purge * fix accounts restore * cleanup * Pass snapshot path as args * Fix clippy 2019-05-30 21:31:35 -07:00			`if !self.bits.get(pos) {`
			`self.num_bits_set += 1;`
			`self.bits.set(pos, true);`
			`}`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`}`
			`}`
StatusDeque split into separate objects with their own root checkpoint strategy (#2613) Split up StatusDeque into different modules * LastIdQueue tracks last_ids * StatusCache keeps track of signature statuses * StatusCache stores success as a bit in a bloom filter * Overhead for 1m Ok transactions is 4mb in memory * Less concurrency between the objects, last_id and status_cache are read and written to at different points in the pipeline * Each object has its own strategy for merging into the root checkpoint 2019-01-31 06:53:52 -08:00			`pub fn contains(&self, key: &T) -> bool {`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`for k in &self.keys {`
			`let pos = self.pos(key, *k);`
			`if !self.bits.get(pos) {`
			`return false;`
			`}`
			`}`
			`true`
			`}`
			`}`

bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`fn slice_hash(slice: &[u8], hash_index: u64) -> u64 {`
add bloom benchmarking, perf improvement from Fnv ~= 8X (#2477) * add bloom benchmarking, perf improvement from Fnv ~= 8X * have a look at bits.set() * ignore new benches to pacify CI (solana_upload_perf?) 2019-01-17 18:22:21 -08:00			`let mut hasher = FnvHasher::with_key(hash_index);`
			`hasher.write(slice);`
			`hasher.finish()`
bloom for forking (#2431) * bloom for forking * clippy fixes * remove bloom_hash_index 2019-01-15 13:56:54 -08:00			`}`

			`impl<T: AsRef<[u8]>> BloomHashIndex for T {`
			`fn hash_at_index(&self, hash_index: u64) -> u64 {`
			`slice_hash(self.as_ref(), hash_index)`
			`}`
			`}`

Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00			`#[cfg(test)]`
			`mod test {`
			`use super::*;`
Move drone into its own crate 2018-11-16 08:04:46 -08:00			`use solana_sdk::hash::{hash, Hash};`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00
			`#[test]`
			`fn test_bloom_filter() {`
			`//empty`
			`let bloom: Bloom<Hash> = Bloom::random(0, 0.1, 100);`
			`assert_eq!(bloom.keys.len(), 0);`
			`assert_eq!(bloom.bits.len(), 1);`

			`//normal`
			`let bloom: Bloom<Hash> = Bloom::random(10, 0.1, 100);`
			`assert_eq!(bloom.keys.len(), 3);`
			`assert_eq!(bloom.bits.len(), 34);`

			`//saturated`
			`let bloom: Bloom<Hash> = Bloom::random(100, 0.1, 100);`
			`assert_eq!(bloom.keys.len(), 1);`
			`assert_eq!(bloom.bits.len(), 100);`
			`}`
			`#[test]`
			`fn test_add_contains() {`
			`let mut bloom: Bloom<Hash> = Bloom::random(100, 0.1, 100);`
Use known keys in the unit test to avoid random false positives. 2018-11-17 20:48:32 -08:00			`//known keys to avoid false positives in the test`
			`bloom.keys = vec![0, 1, 2, 3];`
Add scalable gossip library (#1546) * Cluster Replicated Data Store Separate the data storage and merge strategy from the network IO boundary. Implement an eager push overlay for transporting recent messages. Simulation shows fast convergence with 20k nodes. 2018-11-15 13:23:26 -08:00
			`let key = hash(b"hello");`
			`assert!(!bloom.contains(&key));`
			`bloom.add(&key);`
			`assert!(bloom.contains(&key));`

			`let key = hash(b"world");`
			`assert!(!bloom.contains(&key));`
			`bloom.add(&key);`
			`assert!(bloom.contains(&key));`
			`}`
			`#[test]`
			`fn test_random() {`
			`let mut b1: Bloom<Hash> = Bloom::random(10, 0.1, 100);`
			`let mut b2: Bloom<Hash> = Bloom::random(10, 0.1, 100);`
			`b1.keys.sort();`
			`b2.keys.sort();`
			`assert_ne!(b1.keys, b2.keys);`
			`}`
			`}`