1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
//! Implementation of a bloom filter.

use sha2::{Digest, Sha256};

/// Simple implementation of a Bloom Filter. Which is guaranteed to return 1 if an element
/// is in the set, but returns 1 with probability p (settable) if an item is not in the
/// set. Does not reveal what is in the set.
#[derive(Debug, PartialEq, PartialOrd)]
pub struct BloomFilter {
    bits: Vec<bool>,
    nhashes: usize,
}

impl BloomFilter {
    /// Create a new BloomFilter with `size` entries, using `nhashes` hash functions.
    pub fn new(size: usize, nhashes: usize) -> Self {
        BloomFilter {
            bits: vec![false; size],
            nhashes,
        }
    }

    /// Compute required expansion for false positive probability `p`.
    ///
    /// That is - if you plan to insert `n` items into the BloomFilter, and want a false
    /// positive probability of `p`, then you should set the BloomFilter size to
    /// `compute_expansion(p) * n`.
    pub fn compute_expansion(p: f64) -> f64 {
        -1.44 * p.log2()
    }

    /// Compute required number of hash functions for false positive probability `p`.
    pub fn compute_nhashes(p: f64) -> usize {
        (-p.log2()).ceil() as usize
    }

    /// Create a new BloomFilter with false positive probability `p` which can support up
    /// to `n` insertions.
    pub fn with_false_positive_prob(p: f64, n: usize) -> Self {
        Self::new(
            (Self::compute_expansion(p) * n as f64).ceil() as usize,
            Self::compute_nhashes(p),
        )
    }

    /// Get the number of bins in this BloomFilter.
    pub fn len(&self) -> usize {
        self.bits.len()
    }

    /// Get the number of hash functions in this BloomFilter.
    pub fn nhashes(&self) -> usize {
        self.nhashes
    }

    /// Get bloom filter bins.
    pub fn bins(&self) -> &[bool] {
        &self.bits
    }

    /// Get bloom filter bins packed in bytes.
    pub fn as_bytes(&self) -> Vec<u8> {
        crate::utils::pack_bits(self.bins())
    }

    /// Create bloom filter from bytes.
    pub fn from_bytes(bytes: &[u8], size: usize, nhashes: usize) -> Self {
        let bits = crate::utils::unpack_bits(bytes, size);
        BloomFilter { bits, nhashes }
    }

    /// Compute the bin that this value would go to in a BloomFilter.
    ///
    /// Result must be modded by the actual size of the bloom filter to avoid out of
    /// bounds errors.
    pub fn bin<V: AsRef<[u8]>>(value: &V, hash_index: usize) -> usize {
        // TODO: This code probably needs to use fixed-size integer types in order to be portable to
        // 32-bit architectures.
        debug_assert_eq!(std::mem::size_of::<usize>(), 8);
        let mut h = Sha256::new();
        h.update((hash_index as u64).to_le_bytes());
        h.update(value);
        let hbytes = h.finalize();
        u64::from_le_bytes(
            <[u8; 8]>::try_from(&hbytes[0..8]).expect("We're getting 8 bytes specifically"),
        ) as usize
    }

    /// Insert an item into the BloomFilter.
    pub fn insert<V: AsRef<[u8]>>(&mut self, value: &V) {
        for hash_index in 0..self.nhashes {
            let i = Self::bin(value, hash_index) % self.len();
            self.bits[i] = true;
        }
    }

    /// Check whether an item exists in the BloomFilter.
    pub fn contains<V: AsRef<[u8]>>(&mut self, value: &V) -> bool {
        (0..self.nhashes).all(|hash_index| {
            let i = Self::bin(value, hash_index) % self.len();
            self.bits[i]
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{AesRng, Block};
    use rand::Rng;

    #[test]
    fn test_bloom_filter_membership() {
        let mut rng = AesRng::new();
        let n = 1000;
        let nhashes = 3;
        let mut filter = BloomFilter::new(n, nhashes);
        for _ in 0..128 {
            let x = rng.gen::<Block>();
            filter.insert(&x);
            assert!(filter.contains(&x));
        }
        assert_eq!(
            filter,
            BloomFilter::from_bytes(&filter.as_bytes(), n, nhashes)
        );
    }
}