On Tue, 17 May 2022, Raimo Niskanen wrote: > Why reinvent the wheel? > > Here is a pretty good walkthrough of established methods: > > https://www.pcg-random.org/posts/bounded-rands.html > > It sounds to me as if your suggested methor essentially is > "Bitmask with Rejection -- Apple's Method" with the added twist > to keep the unused bits of the base generator's word and append > new, which might be an optimization for small ranges, but might > be just overhead for large ones. > > In that post one can see that there might be other suggested smaller > optimizations that can be applied to the OpenBSD method, and that > the bitmask method is effective in many cases but not a clear winner.
Oh nice, I wasn't aware that Apple had an improved algorithm. I always thought the best avenue for a speedup was to consider only the bits that could satisfy the constraint, but it never occurred to me how to actually make use of this :) The toy benchmark below compares the existing implementation with reimplementations of both their version as well as something more close to Apple's actual method (which reuses the high bits). However, I don't see a speedup for either of the alternate approaches. [djm@djm ~]$ cc -O2 -Werror -Wall -Wextra -o rb rb.c && doas nice -n -20 ./rb openbsd; elapsed = 8.327954 bitmask; elapsed = 13.306228 bitmask+reuse; elapsed = 11.567389 Maybe my benchmark is crap or maybe I need dlg@ to omgoptimize it for me... ---- #include <stdlib.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> #ifndef __has_builtin #define __has_builtin(x) 0 #endif #if __has_builtin(__builtin_clz) #define arc4random_clz(x) __builtin_clz(x) #else #warning lacks __builtin_clz() /* Count most-significant zero bits, like __builtin_clz() */ static int arc4random_clz(unsigned int x) { int ret = 0; unsigned int n; const int lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; while (x != 0) { n = (x >> 28) & 0xf; if (n != 0) return ret + lut[n]; x <<= 4; ret += 4; } return 32; } #endif /* Current OpenBSD implementation */ static uint32_t arc4random_uniform1(uint32_t upper_bound) { uint32_t r, min; if (upper_bound < 2) return 0; /* 2**32 % x == (2**32 - x) % x */ min = -upper_bound % upper_bound; /* * This could theoretically loop forever but each retry has * p > 0.5 (worst case, usually far better) of selecting a * number inside the range we need, so it should rarely need * to re-roll. */ for (;;) { r = arc4random(); if (r >= min) break; } return r % upper_bound; } /* * Like "Bitmask with Rejection" implementation from * https://www.pcg-random.org/posts/bounded-rands.html */ static uint32_t arc4random_uniform2(uint32_t upper_bound) { uint32_t mask, r; if (upper_bound < 2) return 0; mask = (uint32_t)-1 >> arc4random_clz((upper_bound - 1) | 1);; for (;;) { r = arc4random(); if ((r & mask) < upper_bound) return r & mask; } /* NOTREACHED */ } /* * Like Apple's * https://opensource.apple.com/source/Libc/Libc-1158.50.2/gen/FreeBSD/arc4random.c */ static uint32_t arc4random_uniform3(uint32_t upper_bound) { int zeroes, bits, remain = 32; uint32_t mask, r; if (upper_bound < 2) return 0; zeroes = arc4random_clz((upper_bound - 1) | 1); bits = 32 - zeroes; mask = (uint32_t)-1 >> zeroes; for (;;) { r = arc4random(); if ((r & mask) < upper_bound) return r & mask; for (remain = zeroes; remain >= bits; remain -= bits) { r >>= bits; if ((r & mask) < upper_bound) return r & mask; } } /* NOTREACHED */ } #include <sys/time.h> static uint32_t fixture(const char *s, uint32_t (* const rnd)(uint32_t)) { const uint32_t trials = 50 * 1000 * 1000; const uint32_t max = 0x80000000; const uint32_t mul = 7; unsigned int v, i, r; struct timeval start, finish, delta; gettimeofday(&start, NULL); for (v = 3, r = 0; v < max; v *= mul) { /* printf("%u\n", v); */ for (i = 0; i < trials; i++) r |= rnd(v); } gettimeofday(&finish, NULL); timersub(&finish, &start, &delta); printf("%s; elapsed = %lld.%06lld\n", s, (long long)delta.tv_sec, (long long)delta.tv_usec); return r; } int main(void) { fixture("openbsd", arc4random_uniform1); fixture("bitmask", arc4random_uniform2); fixture("bitmask+reuse", arc4random_uniform3); }