random: use rejection sampling for uniform bounded random integers

Until very recent commits, many bounded random integers were
calculated using `get_random_u32() % max_plus_one`, which not only
incurs the price of a division -- indicating performance mostly was not
a real issue -- but also does not result in a uniformly distributed
output if max_plus_one is not a power of two. Recent commits moved to
using `prandom_u32_max(max_plus_one)`, which replaces the division with
a faster multiplication, but still does not solve the issue with
non-uniform output.

For some users, maybe this isn't a problem, and for others, maybe it is,
but for the majority of users, probably the question has never been
posed and analyzed, and nobody thought much about it, probably assuming
random is random is random. In other words, the unthinking expectation
of most users is likely that the resultant numbers are uniform.

So we implement here an efficient way of generating uniform bounded
random integers. Through use of compile-time evaluation, and avoiding
divisions as much as possible, this commit introduces no measurable
overhead. At least for hot-path uses tested, any potential difference
was lost in the noise. On both clang and gcc, the generated code is
pretty small.

The new function, get_random_u32_below(), lives in random.h, rather than
prandom.h, and has a "get_random_xxx" function name, because it is
suitable for all uses, including cryptography.

In order to be efficient, we implement a kernel-specific variant of
Daniel Lemire's algorithm from "Fast Random Integer Generation in an
Interval", linked below. The kernel's variant takes advantage of
constant folding to avoid divisions entirely in the vast majority of
cases, works on both 32-bit and 64-bit architectures, and requests a
minimal number of bytes from the RNG.

Link: https://arxiv.org/pdf/1805.10941.pdf
Cc: stable@vger.kernel.org # to ease future backports that use this api
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Jason A. Donenfeld 3 years ago e9a688bc 6ce62593

+64 -16

3 changed files

expand all

drivers

char

random.c

include

linux

prandom.h

random.h

+22

drivers/char/random.c

··· 160 160 * u8 get_random_u8() 161 161 * u16 get_random_u16() 162 162 * u32 get_random_u32() 163 + * u32 get_random_u32_below(u32 ceil) 163 164 * u64 get_random_u64() 164 165 * unsigned long get_random_long() 165 166 * ··· 510 509 DEFINE_BATCHED_ENTROPY(u16) 511 510 DEFINE_BATCHED_ENTROPY(u32) 512 511 DEFINE_BATCHED_ENTROPY(u64) 512 + 513 + u32 __get_random_u32_below(u32 ceil) 514 + { 515 + /* 516 + * This is the slow path for variable ceil. It is still fast, most of 517 + * the time, by doing traditional reciprocal multiplication and 518 + * opportunistically comparing the lower half to ceil itself, before 519 + * falling back to computing a larger bound, and then rejecting samples 520 + * whose lower half would indicate a range indivisible by ceil. The use 521 + * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable 522 + * in 32-bits. 523 + */ 524 + u64 mult = (u64)ceil * get_random_u32(); 525 + if (unlikely((u32)mult < ceil)) { 526 + u32 bound = -ceil % ceil; 527 + while (unlikely((u32)mult < bound)) 528 + mult = (u64)ceil * get_random_u32(); 529 + } 530 + return mult >> 32; 531 + } 532 + EXPORT_SYMBOL(__get_random_u32_below); 513 533 514 534 #ifdef CONFIG_SMP 515 535 /*

+2 -16

include/linux/prandom.h

··· 23 23 #define prandom_init_once(pcpu_state) \ 24 24 DO_ONCE(prandom_seed_full_state, (pcpu_state)) 25 25 26 - /** 27 - * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) 28 - * @ep_ro: right open interval endpoint 29 - * 30 - * Returns a pseudo-random number that is in interval [0, ep_ro). This is 31 - * useful when requesting a random index of an array containing ep_ro elements, 32 - * for example. The result is somewhat biased when ep_ro is not a power of 2, 33 - * so do not use this for cryptographic purposes. 34 - * 35 - * Returns: pseudo-random number in interval [0, ep_ro) 36 - */ 26 + /* Deprecated: use get_random_u32_below() instead. */ 37 27 static inline u32 prandom_u32_max(u32 ep_ro) 38 28 { 39 - if (__builtin_constant_p(ep_ro <= 1U << 8) && ep_ro <= 1U << 8) 40 - return (get_random_u8() * ep_ro) >> 8; 41 - if (__builtin_constant_p(ep_ro <= 1U << 16) && ep_ro <= 1U << 16) 42 - return (get_random_u16() * ep_ro) >> 16; 43 - return ((u64)get_random_u32() * ep_ro) >> 32; 29 + return get_random_u32_below(ep_ro); 44 30 } 45 31 46 32 /*

+40

include/linux/random.h

··· 51 51 #endif 52 52 } 53 53 54 + u32 __get_random_u32_below(u32 ceil); 55 + 56 + /* 57 + * Returns a random integer in the interval [0, ceil), with uniform 58 + * distribution, suitable for all uses. Fastest when ceil is a constant, but 59 + * still fast for variable ceil as well. 60 + */ 61 + static inline u32 get_random_u32_below(u32 ceil) 62 + { 63 + if (!__builtin_constant_p(ceil)) 64 + return __get_random_u32_below(ceil); 65 + 66 + /* 67 + * For the fast path, below, all operations on ceil are precomputed by 68 + * the compiler, so this incurs no overhead for checking pow2, doing 69 + * divisions, or branching based on integer size. The resultant 70 + * algorithm does traditional reciprocal multiplication (typically 71 + * optimized by the compiler into shifts and adds), rejecting samples 72 + * whose lower half would indicate a range indivisible by ceil. 73 + */ 74 + BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0"); 75 + if (ceil <= 1) 76 + return 0; 77 + for (;;) { 78 + if (ceil <= 1U << 8) { 79 + u32 mult = ceil * get_random_u8(); 80 + if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) 81 + return mult >> 8; 82 + } else if (ceil <= 1U << 16) { 83 + u32 mult = ceil * get_random_u16(); 84 + if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) 85 + return mult >> 16; 86 + } else { 87 + u64 mult = (u64)ceil * get_random_u32(); 88 + if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) 89 + return mult >> 32; 90 + } 91 + } 92 + } 93 + 54 94 /* 55 95 * On 64-bit architectures, protect against non-terminated C string overflows 56 96 * by zeroing out the first byte of the canary; this leaves 56 bits of entropy.

Configure Feed

Configure Feed