On Tue, 17 May 2022, Raimo Niskanen wrote:

> Why reinvent the wheel?
> 
> Here is a pretty good walkthrough of established methods:
> 
>     https://www.pcg-random.org/posts/bounded-rands.html
> 
> It sounds to me as if your suggested methor essentially is
> "Bitmask with Rejection -- Apple's Method" with the added twist
> to keep the unused bits of the base generator's word and append
> new, which might be an optimization for small ranges, but might
> be just overhead for large ones.
> 
> In that post one can see that there might be other suggested smaller
> optimizations that can be applied to the OpenBSD method, and that
> the bitmask method is effective in many cases but not a clear winner.

Oh nice, I wasn't aware that Apple had an improved algorithm. I always
thought the best avenue for a speedup was to consider only the bits that
could satisfy the constraint, but it never occurred to me how to actually
make use of this :)

The toy benchmark below compares the existing implementation with
reimplementations of both their version as well as something more close
to Apple's actual method (which reuses the high bits).

However, I don't see a speedup for either of the alternate approaches.

[djm@djm ~]$ cc -O2 -Werror -Wall -Wextra -o rb rb.c && doas nice -n -20 ./rb 
openbsd; elapsed = 8.327954
bitmask; elapsed = 13.306228
bitmask+reuse; elapsed = 11.567389

Maybe my benchmark is crap or maybe I need dlg@ to omgoptimize it for me...

----

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
#if __has_builtin(__builtin_clz)
#define arc4random_clz(x) __builtin_clz(x)
#else
#warning lacks __builtin_clz()
/* Count most-significant zero bits, like __builtin_clz() */
static int
arc4random_clz(unsigned int x)
{
        int ret = 0;
        unsigned int n;
        const int lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };

        while (x != 0) {
                n = (x >> 28) & 0xf;
                if (n != 0)
                        return ret + lut[n];
                x <<= 4;
                ret += 4;
        }
        return 32;
}
#endif

/* Current OpenBSD implementation */
static uint32_t
arc4random_uniform1(uint32_t upper_bound)
{
        uint32_t r, min;

        if (upper_bound < 2)
                return 0;

        /* 2**32 % x == (2**32 - x) % x */
        min = -upper_bound % upper_bound;

        /*
         * This could theoretically loop forever but each retry has
         * p > 0.5 (worst case, usually far better) of selecting a
         * number inside the range we need, so it should rarely need
         * to re-roll.
         */
        for (;;) {
                r = arc4random();
                if (r >= min)
                        break;
        }

        return r % upper_bound;
}

/*
 * Like "Bitmask with Rejection" implementation from
 * https://www.pcg-random.org/posts/bounded-rands.html
 */
static uint32_t
arc4random_uniform2(uint32_t upper_bound)
{
        uint32_t mask, r;

        if (upper_bound < 2)
                return 0;

        mask = (uint32_t)-1 >> arc4random_clz((upper_bound - 1) | 1);;
        for (;;) {
                r = arc4random();
                if ((r & mask) < upper_bound)
                        return r & mask;
        }
        /* NOTREACHED */
}

/*
 * Like Apple's
 * 
https://opensource.apple.com/source/Libc/Libc-1158.50.2/gen/FreeBSD/arc4random.c
 */
static uint32_t
arc4random_uniform3(uint32_t upper_bound)
{
        int zeroes, bits, remain = 32;
        uint32_t mask, r;

        if (upper_bound < 2)
                return 0;

        zeroes = arc4random_clz((upper_bound - 1) | 1);
        bits = 32 - zeroes;
        mask = (uint32_t)-1 >> zeroes;

        for (;;) {
                r = arc4random();
                if ((r & mask) < upper_bound)
                        return r & mask;
                for (remain = zeroes; remain >= bits; remain -= bits) {
                        r >>= bits;
                        if ((r & mask) < upper_bound)
                                return r & mask;
                }
        }
        /* NOTREACHED */
}


#include <sys/time.h>

static uint32_t
fixture(const char *s, uint32_t (* const rnd)(uint32_t))
{
        const uint32_t trials = 50 * 1000 * 1000;
        const uint32_t max = 0x80000000;
        const uint32_t mul = 7;
        unsigned int v, i, r;
        struct timeval start, finish, delta;

        gettimeofday(&start, NULL);
        for (v = 3, r = 0; v < max; v *= mul) {
                /* printf("%u\n", v); */
                for (i = 0; i < trials; i++)
                        r |= rnd(v);
        }
        gettimeofday(&finish, NULL);
        timersub(&finish, &start, &delta);
        printf("%s; elapsed = %lld.%06lld\n", s,
            (long long)delta.tv_sec, (long long)delta.tv_usec);
        return r;
}

int main(void) {
        fixture("openbsd", arc4random_uniform1);
        fixture("bitmask", arc4random_uniform2);
        fixture("bitmask+reuse", arc4random_uniform3);
}

Reply via email to