On Wed, 18 May 2022, Otto Moerbeek wrote: > instrumenting the code to count the number of arc4random calls I see thsi: > > openbsd; elapsed = 2.835819; calls = 12340949 > bitmask; elapsed = 4.335576; calls = 17836216 > bitmask+reuse; elapsed = 3.710277; calls = 15245337 > > (this is a different number of trials on a slow machine). > > So the promise of less calls is not fulfilled. Sounds like a bug.
Well, I don't know whether the simple bitmasking approach will really result in fewer calls. I *think* our current approach has a higher probability per loop of suceeding (at least for small upper_bounds) than mask-and-test, but my brain is too addled with a cold to calculate it :/ What Theo said is 100% right - the cost is dominated by that of the underlying RNG. If anyone wants a faster arc4random_uniform() then the first place to look it at arc4random(). It's entirely possible that the speedups measured in that article are because they are using a omgoptimised (non-crypto) RNG and the improvements they saw were due solely to reduction the overhead inside the uniformity code even if it came at the cost of more RNG queries. Anyway, here's a tweaked up version of the test harness that fakes out arc4random() with a deterministic RNG that counts how often it is called in case anyone wants to play with it further. ---- #include <stdlib.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> static uint32_t nqueries; /* deterministic, query-counting arc4random() replacement for benchmarking */ static uint32_t fake_random(void) { static uint32_t ready, remain, msb; uint32_t ret; if (!ready) { srand_deterministic(31337); ready = 1; } if (remain == 0) { msb = (uint32_t)rand(); remain = 31; /* XXX makes assumption re RAND_MAX */ } ret = (uint32_t)rand() | (msb << 31); msb >>= 1; remain--; nqueries++; return ret; } #define arc4random() fake_random() #ifndef __has_builtin #define __has_builtin(x) 0 #endif #if __has_builtin(__builtin_clz) #define arc4random_clz(x) __builtin_clz(x) #else #warning lacks __builtin_clz() /* Count most-significant zero bits, like __builtin_clz() */ static int arc4random_clz(unsigned int x) { int ret = 0; unsigned int n; const int lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; while (x != 0) { n = (x >> 28) & 0xf; if (n != 0) return ret + lut[n]; x <<= 4; ret += 4; } return 32; } #endif /* Current OpenBSD implementation */ static uint32_t arc4random_uniform1(uint32_t upper_bound) { uint32_t r, min; if (upper_bound < 2) return 0; /* 2**32 % x == (2**32 - x) % x */ min = -upper_bound % upper_bound; /* * This could theoretically loop forever but each retry has * p > 0.5 (worst case, usually far better) of selecting a * number inside the range we need, so it should rarely need * to re-roll. */ for (;;) { r = arc4random(); if (r >= min) break; } return r % upper_bound; } /* * Like "Bitmask with Rejection" implementation from * https://www.pcg-random.org/posts/bounded-rands.html */ static uint32_t arc4random_uniform2(uint32_t upper_bound) { uint32_t mask, r; if (upper_bound < 2) return 0; mask = 0xffffffff >> arc4random_clz((upper_bound - 1) | 1);; for (;;) { r = arc4random() & mask; if (r < upper_bound) return r; } /* NOTREACHED */ } /* * Like Apple's * https://opensource.apple.com/source/Libc/Libc-1158.50.2/gen/FreeBSD/arc4random.c */ static uint32_t arc4random_uniform3(uint32_t upper_bound) { int zeroes, bits, remain; uint32_t mask, r, rm; if (upper_bound < 2) return 0; zeroes = arc4random_clz((upper_bound - 1) | 1); bits = 32 - zeroes; mask = (uint32_t)-1 >> zeroes; for (;;) { r = arc4random(); rm = r & mask; if (rm < upper_bound) return rm; for (remain = zeroes; remain >= bits; remain -= bits) { r >>= bits; rm = r & mask; if (rm < upper_bound) return rm; } } /* NOTREACHED */ } #include <sys/time.h> static uint32_t fixture(const char *s, uint32_t (* const rnd)(uint32_t)) { const uint32_t trials = 20 * 1000 * 1000; const uint32_t max = 0x80000000; const uint32_t mul = 7; unsigned int v, i, r; struct timeval start, finish, delta; nqueries = 0; gettimeofday(&start, NULL); for (v = 3, r = 0; v < max; v *= mul) { /* printf("%u\n", v); */ for (i = 0; i < trials; i++) r |= rnd(v); } gettimeofday(&finish, NULL); timersub(&finish, &start, &delta); printf("%s; elapsed = %lld.%06lld, queries = %u\n", s, (long long)delta.tv_sec, (long long)delta.tv_usec, nqueries); return r; } int main(void) { fixture("openbsd", arc4random_uniform1); fixture("bitmask", arc4random_uniform2); fixture("bitmask+reuse", arc4random_uniform3); }