1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
use super::AesRng;
use vectoreyes::{
    array_utils::{ArrayAdjacentPairs, ArrayUnrolledExt, ArrayUnrolledOps, UnrollableArraySize},
    Aes128EncryptOnly, AesBlockCipher, SimdBase, SimdBase32, SimdBase64, U32x4, U32x8, U64x4,
};

/// Sample `u32`s uniformly from `[0, bound)`.
#[derive(Debug, Clone, Copy)]
pub struct UniformIntegersUnderBound {
    threshold: u32,
    bound: u32,
}

impl UniformIntegersUnderBound {
    /// Create the distribution.
    ///
    /// # Performance
    /// This function performs very well as long as the bound, $`b`$ is such that $`t(b)`$ is small,
    /// where:
    /// ```math
    /// t(b) = (2^{32} - b + 1) \mod b
    /// ```
    ///
    /// In particular, the probability that a random vector of $`N`$ elements is accepted is:
    /// ```math
    /// \left(1 - \frac{t(b)}{2^{32}}\right)^N
    /// ```
    /// Thus, if $`t(b)`$ is quite small, then it is efficient for us to reject $`N`$ elements at a
    /// time, as opposed to rejecting individual elements.
    /// # Repeated Invocations
    /// It is inefficient to repeatedly call `new` with a fresh `bound`. However, an alternative
    /// algorithm has not yet been implemented.
    /// # Timing Side-Channel
    /// `bound` should be a _public_ value, since it may be leaked in the timing of sampling.
    /// # Panics
    /// Panics if `bound` is `0`.
    // TODO: implement Lemire's "Nearly Divisionless" algorithm, too.
    #[inline]
    pub fn new(bound: u32) -> Self {
        assert_ne!(bound, 0);
        let threshold = 0_u32.wrapping_sub(bound) % bound;
        UniformIntegersUnderBound { threshold, bound }
    }

    /// The exclusive bound of the integers produced by this generator.
    pub fn bound(&self) -> u32 {
        self.bound
    }

    /// Produce `Aes128EncryptOnly::BLOCK_COUNT_HINT * 4` uniformly distributed `u32`s (under the
    /// given bound).
    #[inline(always)]
    pub fn sample(&self, rng: &mut AesRng) -> [U32x8; Aes128EncryptOnly::BLOCK_COUNT_HINT / 2] {
        debug_assert_eq!(Aes128EncryptOnly::BLOCK_COUNT_HINT % 2, 0);
        const N: usize = Aes128EncryptOnly::BLOCK_COUNT_HINT;
        const HALF_N: usize = Aes128EncryptOnly::BLOCK_COUNT_HINT / 2;
        self.lemire_body::<N, HALF_N>(rng)
    }

    /// Produce 20 uniformly distributed `u32`s under the given bound.
    ///
    /// Random numbers are returned in `out[0][..]`, `out[1][..]`, and the even-indexed entries of
    /// `out[2]` (i.e. `out[2][0]`, `out[2][2]`, `out[2][4]`, `out[2][6]`).
    ///
    /// # Alternatives
    /// Consider using [Self::sample] instead. It may be faster on some platforms.
    #[inline(always)]
    pub fn sample_20(&self, rng: &mut AesRng) -> [U32x8; 3] {
        const N: usize = 5;
        const HALF_N: usize = 3;
        self.lemire_body::<N, HALF_N>(rng)
    }

    // NOTE: these techniques can also be extended to random number generation with a non-constant
    // upper-bound. However, if the bound is not known in advance, you might need to make different
    // choices. This is important when, for example, speeding up a Fisher-Yates shuffle.
    // See https://www.pcg-random.org/posts/bounded-rands.html
    // See https://lemire.me/blog/2019/06/06/nearly-divisionless-random-integer-generation-on-various-systems/
    // See https://github.com/colmmacc/s2n/blob/7ad9240c8b9ade0cc3a403a732ba9f1289934abd/utils/s2n_random.c#L187
    // See https://twitter.com/colmmacc/status/1311092454909599744
    // See https://github.com/apple/swift/pull/25286
    // This is an implementation of "Debiased Integer Multiplication — Lemire's Method"
    // See https://www.pcg-random.org/posts/bounded-rands.html
    #[inline(always)]
    fn lemire_body<const N: usize, const HALF_N: usize>(&self, rng: &mut AesRng) -> [U32x8; HALF_N]
    where
        ArrayUnrolledOps: UnrollableArraySize<N> + UnrollableArraySize<HALF_N>,
        [U32x8; N]: ArrayAdjacentPairs<T = U32x8, AdjacentPairs = [(U32x8, U32x8); HALF_N]>,
    {
        let range = U64x4::broadcast(self.bound as u64);
        let t = U32x8::broadcast(self.threshold);
        loop {
            let rand_bits = rng.random_bits_custom_size::<N>();
            let x = rand_bits.array_map(
                #[inline(always)]
                |x| U64x4::from(U32x4::from(x)),
            );
            let m = x.array_map(
                #[inline(always)]
                |x| U32x8::from(x.mul_lo(range)),
            );
            let m_interleaved = m.array_map(
                #[inline(always)]
                |m| m.shuffle::<3, 1, 2, 0>(),
            );
            let m_lo = m_interleaved
                .pair_adjacent_maybe_odd(U32x8::broadcast(u32::MAX))
                .array_map(
                    #[inline(always)]
                    |(a, b)| a.unpack_lo(b),
                );
            let l = m_lo;
            // We reject and try again if any element in l is less than self.threshold.
            // In other words, we reject if the min(every element in l, self.threshold) != threshold
            if l.array_fold(
                t,
                #[inline(always)]
                |a, b| a.min(b),
            ) != t
            {
                continue;
            }
            let m_hi = m_interleaved
                .pair_adjacent_maybe_odd(U32x8::ZERO)
                .array_map(
                    #[inline(always)]
                    |(a, b)| a.unpack_hi(b),
                );
            break m_hi;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::Block;
    use proptest::prelude::*;
    use rand_core::SeedableRng;

    // The 500_000 bound is chosen somewhat arbitrarily, to make sure that the thresholds aren't
    // too big.

    proptest! {
        #[test]
        fn lemire_within_bounds(
            seed in any::<[u8; 16]>(),
            bound in 1..=500_000_u32,
        ) {
            let mut rng = AesRng::from_seed(Block::from(seed));
            let dist = UniformIntegersUnderBound::new(bound);
            for x in dist.sample(&mut rng).iter().copied() {
                for y in x.as_array().iter().copied() {
                    prop_assert!(y < bound);
                }
            }
        }
    }

    proptest! {
        #[test]
        fn lemire_20_within_bounds(
            seed in any::<[u8; 16]>(),
            bound in 1..=500_000_u32,
        ) {
            let mut rng = AesRng::from_seed(Block::from(seed));
            let dist = UniformIntegersUnderBound::new(bound);
            let out = dist.sample_20(&mut rng);
            for (i,x) in out.iter().copied().enumerate() {
                for (j,y) in x.as_array().iter().copied().enumerate() {
                    if i == 2 && (j % 2) == 1 {
                        prop_assert_eq!(y, 0);
                    }
                    prop_assert!(y < bound);
                }
            }
        }
    }
}