Hi, committed a slightly modified patch as r239356. Changes from the submitted patch attached. To my surprise, it turned out that my fallback code using __gthread_key_{create,delete} and __ghtread_{get,set}_specific was faster than my TLS code, so I removed the TLS configure magic and the TLS code and just left the __gthread stuff in.
On Wed, Aug 10, 2016 at 2:11 PM, Janne Blomqvist <blomqvist.ja...@gmail.com> wrote: > Hi, > > thanks for the Ok. However, moments before committing I got cold feet > and started digging around; it unfortunately seems that TLS > (_Thread_local) is not supported on all targets. So I'll have to > copy-paste some configure magic from libgomp/libjava/etc., and provide > workarounds for such systems. :( > > On Tue, Aug 9, 2016 at 8:01 AM, Jerry DeLisle <jvdeli...@charter.net> wrote: >> On 08/08/2016 04:01 AM, Janne Blomqvist wrote: >>> >>> PING**2 >>> >> >> OK, thanks for patch. >> >> Jerry >> >>> On Sun, Jul 24, 2016 at 4:45 PM, Janne Blomqvist >>> <blomqvist.ja...@gmail.com> wrote: >>>> >>>> Hi, >>>> >>>> the attached patch replaces the current random_number / random_seed >>>> implementations with an implementation that better supports threads. >>>> It's an improved version of the RFC patch I posted earlier at >>>> https://gcc.gnu.org/ml/gcc-patches/2015-12/msg02110.html . Please see >>>> that earlier message for a longer-winded explanation of what's wrong >>>> with the current implementation and how the patch addresses this. >>>> >>>> In short, with the patch the random number generator state is now >>>> per-thread and stored in a per-thread (TLS) variable, enabling a >>>> lockless fast-path. This provides up to 2 orders of magnitude better >>>> performance on a synthetic benchmark using 4 threads, and provides a >>>> more deterministic result as the order that threads are scheduled does >>>> not affect the random number streams for each thread. >>>> >>>> Compared to the RFC patch, a number of minor and not-so-minor bugs >>>> have been fixed, so the patch now passes the testsuite (with a few >>>> modifications to the suite, part of the patch). Also, for REAL kinds >>>> 4, 8, 10 the generated streams are identical (except precision, of >>>> course) (like the current implementation), enabling precision >>>> comparisons, as requested by Steve Kargl. However, this does not >>>> extend to REAL(16) as that would have necessitated doubling the size >>>> of the state, along with potential issues of slower escape from a >>>> low-entropy state, for a feature that I believe is not used by >>>> particularly many users in the end. So if one wants to do precision >>>> comparisons with REAL(16) one must employ a wrapper routine. >>>> >>>> Regtested on x86_64-pc-linux-gnu, Ok for trunk? >>>> >>>> frontend ChangeLog: >>>> >>>> 2016-07-27 Janne Blomqvist <j...@gcc.gnu.org> >>>> >>>> * check.c (gfc_check_random_seed): Use new seed size in check. >>>> * intrinsic.texi (RANDOM_NUMBER): Updated documentation. >>>> (RANDOM_SEED): Likewise. >>>> >>>> >>>> testsuite: >>>> >>>> 2016-07-27 Janne Blomqvist <j...@gcc.gnu.org> >>>> >>>> * gfortran.dg/random_7.f90: Take into account that the last seed >>>> value is the special p value. >>>> * gfortran.dg/random_seed_1.f90: Seed size is now constant. >>>> >>>> >>>> libgfortran: >>>> 2016-07-27 Janne Blomqvist <j...@gcc.gnu.org> >>>> >>>> * intrinsics/random.c: Replace KISS with xorshift1024* with >>>> per-thread (TLS) state. >>>> * runtime/main.c (init): Don't call random_seed_i4. >>>> >>>> >>>> -- >>>> Janne Blomqvist >>> >>> >>> >>> >> > > > > -- > Janne Blomqvist -- Janne Blomqvist
diff --git a/libgfortran/intrinsics/random.c b/libgfortran/intrinsics/random.c index 516b640..6084ebe 100644 --- a/libgfortran/intrinsics/random.c +++ b/libgfortran/intrinsics/random.c @@ -205,20 +205,39 @@ static uint64_t master_state[] = { }; -/* The actual random number state, as a TLS variable. */ -static _Thread_local xorshift1024star_state rand_state; +static __gthread_key_t rand_state_key; + +static xorshift1024star_state* +get_rand_state (void) +{ + /* For single threaded apps. */ + static xorshift1024star_state rand_state; + + if (__gthread_active_p ()) + { + void* p = __gthread_getspecific (rand_state_key); + if (!p) + { + p = xcalloc (1, sizeof (xorshift1024star_state)); + __gthread_setspecific (rand_state_key, p); + } + return p; + } + else + return &rand_state; +} static uint64_t -xorshift1024star (void) +xorshift1024star (xorshift1024star_state* rs) { - int p = rand_state.p; - const uint64_t s0 = rand_state.s[p]; - uint64_t s1 = rand_state.s[p = (p + 1) & 15]; + int p = rs->p; + const uint64_t s0 = rs->s[p]; + uint64_t s1 = rs->s[p = (p + 1) & 15]; s1 ^= s1 << 31; - rand_state.s[p] = s1 ^ s0 ^ (s1 >> 11) ^ (s0 >> 30); - rand_state.p = p; - return rand_state.s[p] * UINT64_C(1181783497276652981); + rs->s[p] = s1 ^ s0 ^ (s1 >> 11) ^ (s0 >> 30); + rs->p = p; + return rs->s[p] * UINT64_C(1181783497276652981); } @@ -227,7 +246,7 @@ xorshift1024star (void) non-overlapping subsequences for parallel computations. */ static void -jump (void) +jump (xorshift1024star_state* rs) { static const uint64_t JUMP[] = { 0x84242f96eca9c41dULL, 0xa3c65b8776f96855ULL, 0x5b34a39f070b5837ULL, @@ -244,11 +263,11 @@ jump (void) { if (JUMP[i] & 1ULL << b) for(int j = 0; j < 16; j++) - t[j] ^= rand_state.s[(j + rand_state.p) & 15]; - xorshift1024star (); + t[j] ^= rs->s[(j + rs->p) & 15]; + xorshift1024star (rs); } for(int j = 0; j < 16; j++) - rand_state.s[(j + rand_state.p) & 15] = t[j]; + rs->s[(j + rs->p) & 15] = t[j]; } @@ -256,17 +275,17 @@ jump (void) using the master state and the number of times we must jump. */ static void -init_rand_state (const bool locked) +init_rand_state (xorshift1024star_state* rs, const bool locked) { if (!locked) __gthread_mutex_lock (&random_lock); - memcpy (rand_state.s, master_state, sizeof (master_state)); + memcpy (&rs->s, master_state, sizeof (master_state)); unsigned n = njumps++; if (!locked) __gthread_mutex_unlock (&random_lock); for (unsigned i = 0; i < n; i++) - jump (); - rand_state.init = true; + jump (rs); + rs->init = true; } @@ -345,9 +364,11 @@ getosrandom (void *buf, size_t buflen) void random_r4 (GFC_REAL_4 *x) { - if (unlikely (!rand_state.init)) - init_rand_state (false); - uint64_t r = xorshift1024star (); + xorshift1024star_state* rs = get_rand_state(); + + if (unlikely (!rs->init)) + init_rand_state (rs, false); + uint64_t r = xorshift1024star (rs); /* Take the higher bits, ensuring that a stream of real(4), real(8), and real(10) will be identical (except for precision). */ uint32_t high = (uint32_t) (r >> 32); @@ -362,10 +383,11 @@ void random_r8 (GFC_REAL_8 *x) { GFC_UINTEGER_8 r; + xorshift1024star_state* rs = get_rand_state(); - if (unlikely (!rand_state.init)) - init_rand_state (false); - r = xorshift1024star (); + if (unlikely (!rs->init)) + init_rand_state (rs, false); + r = xorshift1024star (rs); rnumber_8 (x, r); } iexport(random_r8); @@ -379,10 +401,11 @@ void random_r10 (GFC_REAL_10 *x) { GFC_UINTEGER_8 r; + xorshift1024star_state* rs = get_rand_state(); - if (unlikely (!rand_state.init)) - init_rand_state (false); - r = xorshift1024star (); + if (unlikely (!rs->init)) + init_rand_state (rs, false); + r = xorshift1024star (rs); rnumber_10 (x, r); } iexport(random_r10); @@ -398,11 +421,12 @@ void random_r16 (GFC_REAL_16 *x) { GFC_UINTEGER_8 r1, r2; + xorshift1024star_state* rs = get_rand_state(); - if (unlikely (!rand_state.init)) - init_rand_state (false); - r1 = xorshift1024star (); - r2 = xorshift1024star (); + if (unlikely (!rs->init)) + init_rand_state (rs, false); + r1 = xorshift1024star (rs); + r2 = xorshift1024star (rs); rnumber_16 (x, r1, r2); } iexport(random_r16); @@ -422,8 +446,10 @@ arandom_r4 (gfc_array_r4 *x) index_type stride0; index_type dim; GFC_REAL_4 *dest; + xorshift1024star_state* rs = get_rand_state(); int n; + dest = x->base_addr; dim = GFC_DESCRIPTOR_RANK (x); @@ -439,13 +465,13 @@ arandom_r4 (gfc_array_r4 *x) stride0 = stride[0]; - if (unlikely (!rand_state.init)) - init_rand_state (false); + if (unlikely (!rs->init)) + init_rand_state (rs, false); while (dest) { /* random_r4 (dest); */ - uint64_t r = xorshift1024star (); + uint64_t r = xorshift1024star (rs); uint32_t high = (uint32_t) (r >> 32); rnumber_4 (dest, high); @@ -489,6 +515,7 @@ arandom_r8 (gfc_array_r8 *x) index_type stride0; index_type dim; GFC_REAL_8 *dest; + xorshift1024star_state* rs = get_rand_state(); int n; dest = x->base_addr; @@ -506,13 +533,13 @@ arandom_r8 (gfc_array_r8 *x) stride0 = stride[0]; - if (unlikely (!rand_state.init)) - init_rand_state (false); + if (unlikely (!rs->init)) + init_rand_state (rs, false); while (dest) { /* random_r8 (dest); */ - uint64_t r = xorshift1024star (); + uint64_t r = xorshift1024star (rs); rnumber_8 (dest, r); /* Advance to the next element. */ @@ -557,6 +584,7 @@ arandom_r10 (gfc_array_r10 *x) index_type stride0; index_type dim; GFC_REAL_10 *dest; + xorshift1024star_state* rs = get_rand_state(); int n; dest = x->base_addr; @@ -574,13 +602,13 @@ arandom_r10 (gfc_array_r10 *x) stride0 = stride[0]; - if (unlikely (!rand_state.init)) - init_rand_state (false); + if (unlikely (!rs->init)) + init_rand_state (rs, false); while (dest) { /* random_r10 (dest); */ - uint64_t r = xorshift1024star (); + uint64_t r = xorshift1024star (rs); rnumber_10 (dest, r); /* Advance to the next element. */ @@ -627,6 +655,7 @@ arandom_r16 (gfc_array_r16 *x) index_type stride0; index_type dim; GFC_REAL_16 *dest; + xorshift1024star_state* rs = get_rand_state(); int n; dest = x->base_addr; @@ -644,14 +673,14 @@ arandom_r16 (gfc_array_r16 *x) stride0 = stride[0]; - if (unlikely (!rand_state.init)) - init_rand_state (false); + if (unlikely (!rs->init)) + init_rand_state (rs, false); while (dest) { /* random_r16 (dest); */ - uint64_t r1 = xorshift1024star (); - uint64_t r2 = xorshift1024star (); + uint64_t r1 = xorshift1024star (rs); + uint64_t r2 = xorshift1024star (rs); rnumber_16 (dest, r1, r2); /* Advance to the next element. */ @@ -724,6 +753,8 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put, gfc_array_i4 *get) if (size != NULL) *size = SZ + 1; + xorshift1024star_state* rs = get_rand_state(); + /* Return the seed to GET data. */ if (get != NULL) { @@ -735,11 +766,11 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put, gfc_array_i4 *get) if (GFC_DESCRIPTOR_EXTENT(get,0) < (index_type) SZ + 1) runtime_error ("Array size of GET is too small."); - if (!rand_state.init) - init_rand_state (false); + if (!rs->init) + init_rand_state (rs, false); /* Unscramble the seed. */ - unscramble_seed (seed, (unsigned char *) rand_state.s, sizeof seed); + unscramble_seed (seed, (unsigned char *) rs->s, sizeof seed); /* Then copy it back to the user variable. */ for (size_t i = 0; i < SZ ; i++) @@ -748,7 +779,7 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put, gfc_array_i4 *get) sizeof(GFC_UINTEGER_4)); /* Finally copy the value of p after the seed. */ - get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rand_state.p; + get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rs->p; } else @@ -761,7 +792,7 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put, gfc_array_i4 *get) { getosrandom (master_state, sizeof (master_state)); njumps = 0; - init_rand_state (true); + init_rand_state (rs, true); } if (put != NULL) @@ -784,9 +815,9 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put, gfc_array_i4 *get) provide seeds with quality only in the lower or upper part. */ scramble_seed ((unsigned char *) master_state, seed, sizeof seed); njumps = 0; - init_rand_state (true); + init_rand_state (rs, true); - rand_state.p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] % 16; + rs->p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] & 15; } @@ -809,6 +840,8 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put, gfc_array_i8 *get) if (size != NULL) *size = SZ + 1; + xorshift1024star_state* rs = get_rand_state(); + /* Return the seed to GET data. */ if (get != NULL) { @@ -820,15 +853,15 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put, gfc_array_i8 *get) if (GFC_DESCRIPTOR_EXTENT(get,0) < (index_type) SZ + 1) runtime_error ("Array size of GET is too small."); - if (!rand_state.init) - init_rand_state (false); + if (!rs->init) + init_rand_state (rs, false); /* This code now should do correct strides. */ for (size_t i = 0; i < SZ; i++) - memcpy (&(get->base_addr[i * GFC_DESCRIPTOR_STRIDE(get,0)]), &rand_state.s[i], + memcpy (&(get->base_addr[i * GFC_DESCRIPTOR_STRIDE(get,0)]), &rs->s[i], sizeof (GFC_UINTEGER_8)); - get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rand_state.p; + get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rs->p; } else @@ -841,7 +874,7 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put, gfc_array_i8 *get) { getosrandom (master_state, sizeof (master_state)); njumps = 0; - init_rand_state (true); + init_rand_state (rs, true); } if (put != NULL) @@ -860,22 +893,34 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put, gfc_array_i8 *get) sizeof (GFC_UINTEGER_8)); njumps = 0; - init_rand_state (true); - rand_state.p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] % 16; + init_rand_state (rs, true); + rs->p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] & 15; } - __gthread_mutex_unlock (&random_lock); } } iexport(random_seed_i8); -#ifndef __GTHREAD_MUTEX_INIT +#if !defined __GTHREAD_MUTEX_INIT || defined __GTHREADS static void __attribute__((constructor)) -init (void) +constructor_random (void) { +#ifndef __GTHREAD_MUTEX_INIT __GTHREAD_MUTEX_INIT_FUNCTION (&random_lock); +#endif + if (__gthread_active_p ()) + __gthread_key_create (&rand_state_key, &free); +} +#endif + +#ifdef __GTHREADS +static void __attribute__((destructor)) +destructor_random (void) +{ + if (__gthread_active_p ()) + __gthread_key_delete (rand_state_key); } #endif