On Wed, May 18, 2022 at 05:08:02PM +1000, Damien Miller wrote: > On Wed, 18 May 2022, Otto Moerbeek wrote: > > > instrumenting the code to count the number of arc4random calls I see thsi: > > > > openbsd; elapsed = 2.835819; calls = 12340949 > > bitmask; elapsed = 4.335576; calls = 17836216 > > bitmask+reuse; elapsed = 3.710277; calls = 15245337 > > > > (this is a different number of trials on a slow machine). > > > > So the promise of less calls is not fulfilled. Sounds like a bug. > > Well, I don't know whether the simple bitmasking approach will really > result in fewer calls. I *think* our current approach has a higher > probability per loop of suceeding (at least for small upper_bounds) than > mask-and-test, but my brain is too addled with a cold to calculate it :/
I *assumed* the speedups seen in the web page were because of less rnd calls. That was a mistake. > What Theo said is 100% right - the cost is dominated by that of the > underlying RNG. If anyone wants a faster arc4random_uniform() then the > first place to look it at arc4random(). One more reason to keep the current implementation of arc4random_uniform(). > > It's entirely possible that the speedups measured in that article > are because they are using a omgoptimised (non-crypto) RNG and the > improvements they saw were due solely to reduction the overhead inside > the uniformity code even if it came at the cost of more RNG queries. Right. > Anyway, here's a tweaked up version of the test harness that fakes out > arc4random() with a deterministic RNG that counts how often it is called > in case anyone wants to play with it further. -Otto > > ---- > > > #include <stdlib.h> > #include <stdio.h> > > #include <stdint.h> > #include <stdlib.h> > > static uint32_t nqueries; > > /* deterministic, query-counting arc4random() replacement for benchmarking */ > static uint32_t > fake_random(void) > { > static uint32_t ready, remain, msb; > uint32_t ret; > > if (!ready) { > srand_deterministic(31337); > ready = 1; > } > if (remain == 0) { > msb = (uint32_t)rand(); > remain = 31; /* XXX makes assumption re RAND_MAX */ > } > ret = (uint32_t)rand() | (msb << 31); > msb >>= 1; > remain--; > nqueries++; > return ret; > } > > #define arc4random() fake_random() > > #ifndef __has_builtin > #define __has_builtin(x) 0 > #endif > #if __has_builtin(__builtin_clz) > #define arc4random_clz(x) __builtin_clz(x) > #else > #warning lacks __builtin_clz() > /* Count most-significant zero bits, like __builtin_clz() */ > static int > arc4random_clz(unsigned int x) > { > int ret = 0; > unsigned int n; > const int lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; > > while (x != 0) { > n = (x >> 28) & 0xf; > if (n != 0) > return ret + lut[n]; > x <<= 4; > ret += 4; > } > return 32; > } > #endif > > /* Current OpenBSD implementation */ > static uint32_t > arc4random_uniform1(uint32_t upper_bound) > { > uint32_t r, min; > > if (upper_bound < 2) > return 0; > > /* 2**32 % x == (2**32 - x) % x */ > min = -upper_bound % upper_bound; > > /* > * This could theoretically loop forever but each retry has > * p > 0.5 (worst case, usually far better) of selecting a > * number inside the range we need, so it should rarely need > * to re-roll. > */ > for (;;) { > r = arc4random(); > if (r >= min) > break; > } > > return r % upper_bound; > } > > /* > * Like "Bitmask with Rejection" implementation from > * https://www.pcg-random.org/posts/bounded-rands.html > */ > static uint32_t > arc4random_uniform2(uint32_t upper_bound) > { > uint32_t mask, r; > > if (upper_bound < 2) > return 0; > > mask = 0xffffffff >> arc4random_clz((upper_bound - 1) | 1);; > for (;;) { > r = arc4random() & mask; > if (r < upper_bound) > return r; > } > /* NOTREACHED */ > } > > /* > * Like Apple's > * > https://opensource.apple.com/source/Libc/Libc-1158.50.2/gen/FreeBSD/arc4random.c > */ > static uint32_t > arc4random_uniform3(uint32_t upper_bound) > { > int zeroes, bits, remain; > uint32_t mask, r, rm; > > if (upper_bound < 2) > return 0; > > zeroes = arc4random_clz((upper_bound - 1) | 1); > bits = 32 - zeroes; > mask = (uint32_t)-1 >> zeroes; > > for (;;) { > r = arc4random(); > rm = r & mask; > if (rm < upper_bound) > return rm; > for (remain = zeroes; remain >= bits; remain -= bits) { > r >>= bits; > rm = r & mask; > if (rm < upper_bound) > return rm; > } > } > /* NOTREACHED */ > } > > #include <sys/time.h> > > static uint32_t > fixture(const char *s, uint32_t (* const rnd)(uint32_t)) > { > const uint32_t trials = 20 * 1000 * 1000; > const uint32_t max = 0x80000000; > const uint32_t mul = 7; > unsigned int v, i, r; > struct timeval start, finish, delta; > > nqueries = 0; > gettimeofday(&start, NULL); > for (v = 3, r = 0; v < max; v *= mul) { > /* printf("%u\n", v); */ > for (i = 0; i < trials; i++) > r |= rnd(v); > } > gettimeofday(&finish, NULL); > timersub(&finish, &start, &delta); > printf("%s; elapsed = %lld.%06lld, queries = %u\n", s, > (long long)delta.tv_sec, (long long)delta.tv_usec, nqueries); > return r; > } > > int main(void) { > fixture("openbsd", arc4random_uniform1); > fixture("bitmask", arc4random_uniform2); > fixture("bitmask+reuse", arc4random_uniform3); > } >