On Wed, 18 May 2022, Otto Moerbeek wrote:

> instrumenting the code to count the number of arc4random calls I see thsi:
> 
> openbsd; elapsed = 2.835819; calls = 12340949
> bitmask; elapsed = 4.335576; calls = 17836216
> bitmask+reuse; elapsed = 3.710277; calls = 15245337
> 
> (this is a different number of trials on a slow machine).
> 
> So the promise of less calls is not fulfilled. Sounds like a bug.

Well, I don't know whether the simple bitmasking approach will really
result in fewer calls. I *think* our current approach has a higher
probability per loop of suceeding (at least for small upper_bounds) than
mask-and-test, but my brain is too addled with a cold to calculate it :/

What Theo said is 100% right - the cost is dominated by that of the
underlying RNG. If anyone wants a faster arc4random_uniform() then the
first place to look it at arc4random().

It's entirely possible that the speedups measured in that article
are because they are using a omgoptimised (non-crypto) RNG and the
improvements they saw were due solely to reduction the overhead inside
the uniformity code even if it came at the cost of more RNG queries.

Anyway, here's a tweaked up version of the test harness that fakes out
arc4random() with a deterministic RNG that counts how often it is called
in case anyone wants to play with it further.

----


#include <stdlib.h>
#include <stdio.h>

#include <stdint.h>
#include <stdlib.h>

static uint32_t nqueries;

/* deterministic, query-counting arc4random() replacement for benchmarking */
static uint32_t
fake_random(void)
{
        static uint32_t ready, remain, msb;
        uint32_t ret;

        if (!ready) {
                srand_deterministic(31337);
                ready = 1;
        }
        if (remain == 0) {
                msb = (uint32_t)rand();
                remain = 31; /* XXX makes assumption re RAND_MAX */
        }
        ret = (uint32_t)rand() | (msb << 31);
        msb >>= 1;
        remain--;
        nqueries++;
        return ret;     
}

#define arc4random() fake_random()

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if __has_builtin(__builtin_clz)
#define arc4random_clz(x) __builtin_clz(x)
#else
#warning lacks __builtin_clz()
/* Count most-significant zero bits, like __builtin_clz() */
static int
arc4random_clz(unsigned int x)
{
        int ret = 0;
        unsigned int n;
        const int lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };

        while (x != 0) {
                n = (x >> 28) & 0xf;
                if (n != 0)
                        return ret + lut[n];
                x <<= 4;
                ret += 4;
        }
        return 32;
}
#endif

/* Current OpenBSD implementation */
static uint32_t
arc4random_uniform1(uint32_t upper_bound)
{
        uint32_t r, min;

        if (upper_bound < 2)
                return 0;

        /* 2**32 % x == (2**32 - x) % x */
        min = -upper_bound % upper_bound;

        /*
         * This could theoretically loop forever but each retry has
         * p > 0.5 (worst case, usually far better) of selecting a
         * number inside the range we need, so it should rarely need
         * to re-roll.
         */
        for (;;) {
                r = arc4random();
                if (r >= min)
                        break;
        }

        return r % upper_bound;
}

/*
 * Like "Bitmask with Rejection" implementation from
 * https://www.pcg-random.org/posts/bounded-rands.html
 */
static uint32_t
arc4random_uniform2(uint32_t upper_bound)
{
        uint32_t mask, r;

        if (upper_bound < 2)
                return 0;

        mask = 0xffffffff >> arc4random_clz((upper_bound - 1) | 1);;
        for (;;) {
                r = arc4random() & mask;
                if (r < upper_bound)
                        return r;
        }
        /* NOTREACHED */
}

/*
 * Like Apple's
 * 
https://opensource.apple.com/source/Libc/Libc-1158.50.2/gen/FreeBSD/arc4random.c
 */
static uint32_t
arc4random_uniform3(uint32_t upper_bound)
{
        int zeroes, bits, remain;
        uint32_t mask, r, rm;

        if (upper_bound < 2)
                return 0;

        zeroes = arc4random_clz((upper_bound - 1) | 1);
        bits = 32 - zeroes;
        mask = (uint32_t)-1 >> zeroes;

        for (;;) {
                r = arc4random();
                rm = r & mask;
                if (rm < upper_bound)
                        return rm;
                for (remain = zeroes; remain >= bits; remain -= bits) {
                        r >>= bits;
                        rm = r & mask;
                        if (rm < upper_bound)
                                return rm;
                }
        }
        /* NOTREACHED */
}

#include <sys/time.h>

static uint32_t
fixture(const char *s, uint32_t (* const rnd)(uint32_t))
{
        const uint32_t trials = 20 * 1000 * 1000;
        const uint32_t max = 0x80000000;
        const uint32_t mul = 7;
        unsigned int v, i, r;
        struct timeval start, finish, delta;

        nqueries = 0;
        gettimeofday(&start, NULL);
        for (v = 3, r = 0; v < max; v *= mul) {
                /* printf("%u\n", v); */
                for (i = 0; i < trials; i++)
                        r |= rnd(v);
        }
        gettimeofday(&finish, NULL);
        timersub(&finish, &start, &delta);
        printf("%s; elapsed = %lld.%06lld, queries = %u\n", s,
            (long long)delta.tv_sec, (long long)delta.tv_usec, nqueries);
        return r;
}

int main(void) {
        fixture("openbsd", arc4random_uniform1);
        fixture("bitmask", arc4random_uniform2);
        fixture("bitmask+reuse", arc4random_uniform3);
}

Reply via email to