Hi,
committed a slightly modified patch as r239356. Changes from the
submitted patch attached. To my surprise, it turned out that my
fallback code using __gthread_key_{create,delete} and
__ghtread_{get,set}_specific was faster than my TLS code, so I removed
the TLS configure magic and the TLS code and just left the __gthread
stuff in.
On Wed, Aug 10, 2016 at 2:11 PM, Janne Blomqvist
<[email protected]> wrote:
> Hi,
>
> thanks for the Ok. However, moments before committing I got cold feet
> and started digging around; it unfortunately seems that TLS
> (_Thread_local) is not supported on all targets. So I'll have to
> copy-paste some configure magic from libgomp/libjava/etc., and provide
> workarounds for such systems. :(
>
> On Tue, Aug 9, 2016 at 8:01 AM, Jerry DeLisle <[email protected]> wrote:
>> On 08/08/2016 04:01 AM, Janne Blomqvist wrote:
>>>
>>> PING**2
>>>
>>
>> OK, thanks for patch.
>>
>> Jerry
>>
>>> On Sun, Jul 24, 2016 at 4:45 PM, Janne Blomqvist
>>> <[email protected]> wrote:
>>>>
>>>> Hi,
>>>>
>>>> the attached patch replaces the current random_number / random_seed
>>>> implementations with an implementation that better supports threads.
>>>> It's an improved version of the RFC patch I posted earlier at
>>>> https://gcc.gnu.org/ml/gcc-patches/2015-12/msg02110.html . Please see
>>>> that earlier message for a longer-winded explanation of what's wrong
>>>> with the current implementation and how the patch addresses this.
>>>>
>>>> In short, with the patch the random number generator state is now
>>>> per-thread and stored in a per-thread (TLS) variable, enabling a
>>>> lockless fast-path. This provides up to 2 orders of magnitude better
>>>> performance on a synthetic benchmark using 4 threads, and provides a
>>>> more deterministic result as the order that threads are scheduled does
>>>> not affect the random number streams for each thread.
>>>>
>>>> Compared to the RFC patch, a number of minor and not-so-minor bugs
>>>> have been fixed, so the patch now passes the testsuite (with a few
>>>> modifications to the suite, part of the patch). Also, for REAL kinds
>>>> 4, 8, 10 the generated streams are identical (except precision, of
>>>> course) (like the current implementation), enabling precision
>>>> comparisons, as requested by Steve Kargl. However, this does not
>>>> extend to REAL(16) as that would have necessitated doubling the size
>>>> of the state, along with potential issues of slower escape from a
>>>> low-entropy state, for a feature that I believe is not used by
>>>> particularly many users in the end. So if one wants to do precision
>>>> comparisons with REAL(16) one must employ a wrapper routine.
>>>>
>>>> Regtested on x86_64-pc-linux-gnu, Ok for trunk?
>>>>
>>>> frontend ChangeLog:
>>>>
>>>> 2016-07-27 Janne Blomqvist <[email protected]>
>>>>
>>>> * check.c (gfc_check_random_seed): Use new seed size in check.
>>>> * intrinsic.texi (RANDOM_NUMBER): Updated documentation.
>>>> (RANDOM_SEED): Likewise.
>>>>
>>>>
>>>> testsuite:
>>>>
>>>> 2016-07-27 Janne Blomqvist <[email protected]>
>>>>
>>>> * gfortran.dg/random_7.f90: Take into account that the last seed
>>>> value is the special p value.
>>>> * gfortran.dg/random_seed_1.f90: Seed size is now constant.
>>>>
>>>>
>>>> libgfortran:
>>>> 2016-07-27 Janne Blomqvist <[email protected]>
>>>>
>>>> * intrinsics/random.c: Replace KISS with xorshift1024* with
>>>> per-thread (TLS) state.
>>>> * runtime/main.c (init): Don't call random_seed_i4.
>>>>
>>>>
>>>> --
>>>> Janne Blomqvist
>>>
>>>
>>>
>>>
>>
>
>
>
> --
> Janne Blomqvist
--
Janne Blomqvist
diff --git a/libgfortran/intrinsics/random.c b/libgfortran/intrinsics/random.c
index 516b640..6084ebe 100644
--- a/libgfortran/intrinsics/random.c
+++ b/libgfortran/intrinsics/random.c
@@ -205,20 +205,39 @@ static uint64_t master_state[] = {
};
-/* The actual random number state, as a TLS variable. */
-static _Thread_local xorshift1024star_state rand_state;
+static __gthread_key_t rand_state_key;
+
+static xorshift1024star_state*
+get_rand_state (void)
+{
+ /* For single threaded apps. */
+ static xorshift1024star_state rand_state;
+
+ if (__gthread_active_p ())
+ {
+ void* p = __gthread_getspecific (rand_state_key);
+ if (!p)
+ {
+ p = xcalloc (1, sizeof (xorshift1024star_state));
+ __gthread_setspecific (rand_state_key, p);
+ }
+ return p;
+ }
+ else
+ return &rand_state;
+}
static uint64_t
-xorshift1024star (void)
+xorshift1024star (xorshift1024star_state* rs)
{
- int p = rand_state.p;
- const uint64_t s0 = rand_state.s[p];
- uint64_t s1 = rand_state.s[p = (p + 1) & 15];
+ int p = rs->p;
+ const uint64_t s0 = rs->s[p];
+ uint64_t s1 = rs->s[p = (p + 1) & 15];
s1 ^= s1 << 31;
- rand_state.s[p] = s1 ^ s0 ^ (s1 >> 11) ^ (s0 >> 30);
- rand_state.p = p;
- return rand_state.s[p] * UINT64_C(1181783497276652981);
+ rs->s[p] = s1 ^ s0 ^ (s1 >> 11) ^ (s0 >> 30);
+ rs->p = p;
+ return rs->s[p] * UINT64_C(1181783497276652981);
}
@@ -227,7 +246,7 @@ xorshift1024star (void)
non-overlapping subsequences for parallel computations. */
static void
-jump (void)
+jump (xorshift1024star_state* rs)
{
static const uint64_t JUMP[] = {
0x84242f96eca9c41dULL, 0xa3c65b8776f96855ULL, 0x5b34a39f070b5837ULL,
@@ -244,11 +263,11 @@ jump (void)
{
if (JUMP[i] & 1ULL << b)
for(int j = 0; j < 16; j++)
- t[j] ^= rand_state.s[(j + rand_state.p) & 15];
- xorshift1024star ();
+ t[j] ^= rs->s[(j + rs->p) & 15];
+ xorshift1024star (rs);
}
for(int j = 0; j < 16; j++)
- rand_state.s[(j + rand_state.p) & 15] = t[j];
+ rs->s[(j + rs->p) & 15] = t[j];
}
@@ -256,17 +275,17 @@ jump (void)
using the master state and the number of times we must jump. */
static void
-init_rand_state (const bool locked)
+init_rand_state (xorshift1024star_state* rs, const bool locked)
{
if (!locked)
__gthread_mutex_lock (&random_lock);
- memcpy (rand_state.s, master_state, sizeof (master_state));
+ memcpy (&rs->s, master_state, sizeof (master_state));
unsigned n = njumps++;
if (!locked)
__gthread_mutex_unlock (&random_lock);
for (unsigned i = 0; i < n; i++)
- jump ();
- rand_state.init = true;
+ jump (rs);
+ rs->init = true;
}
@@ -345,9 +364,11 @@ getosrandom (void *buf, size_t buflen)
void
random_r4 (GFC_REAL_4 *x)
{
- if (unlikely (!rand_state.init))
- init_rand_state (false);
- uint64_t r = xorshift1024star ();
+ xorshift1024star_state* rs = get_rand_state();
+
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
+ uint64_t r = xorshift1024star (rs);
/* Take the higher bits, ensuring that a stream of real(4), real(8),
and real(10) will be identical (except for precision). */
uint32_t high = (uint32_t) (r >> 32);
@@ -362,10 +383,11 @@ void
random_r8 (GFC_REAL_8 *x)
{
GFC_UINTEGER_8 r;
+ xorshift1024star_state* rs = get_rand_state();
- if (unlikely (!rand_state.init))
- init_rand_state (false);
- r = xorshift1024star ();
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
+ r = xorshift1024star (rs);
rnumber_8 (x, r);
}
iexport(random_r8);
@@ -379,10 +401,11 @@ void
random_r10 (GFC_REAL_10 *x)
{
GFC_UINTEGER_8 r;
+ xorshift1024star_state* rs = get_rand_state();
- if (unlikely (!rand_state.init))
- init_rand_state (false);
- r = xorshift1024star ();
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
+ r = xorshift1024star (rs);
rnumber_10 (x, r);
}
iexport(random_r10);
@@ -398,11 +421,12 @@ void
random_r16 (GFC_REAL_16 *x)
{
GFC_UINTEGER_8 r1, r2;
+ xorshift1024star_state* rs = get_rand_state();
- if (unlikely (!rand_state.init))
- init_rand_state (false);
- r1 = xorshift1024star ();
- r2 = xorshift1024star ();
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
+ r1 = xorshift1024star (rs);
+ r2 = xorshift1024star (rs);
rnumber_16 (x, r1, r2);
}
iexport(random_r16);
@@ -422,8 +446,10 @@ arandom_r4 (gfc_array_r4 *x)
index_type stride0;
index_type dim;
GFC_REAL_4 *dest;
+ xorshift1024star_state* rs = get_rand_state();
int n;
+
dest = x->base_addr;
dim = GFC_DESCRIPTOR_RANK (x);
@@ -439,13 +465,13 @@ arandom_r4 (gfc_array_r4 *x)
stride0 = stride[0];
- if (unlikely (!rand_state.init))
- init_rand_state (false);
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
while (dest)
{
/* random_r4 (dest); */
- uint64_t r = xorshift1024star ();
+ uint64_t r = xorshift1024star (rs);
uint32_t high = (uint32_t) (r >> 32);
rnumber_4 (dest, high);
@@ -489,6 +515,7 @@ arandom_r8 (gfc_array_r8 *x)
index_type stride0;
index_type dim;
GFC_REAL_8 *dest;
+ xorshift1024star_state* rs = get_rand_state();
int n;
dest = x->base_addr;
@@ -506,13 +533,13 @@ arandom_r8 (gfc_array_r8 *x)
stride0 = stride[0];
- if (unlikely (!rand_state.init))
- init_rand_state (false);
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
while (dest)
{
/* random_r8 (dest); */
- uint64_t r = xorshift1024star ();
+ uint64_t r = xorshift1024star (rs);
rnumber_8 (dest, r);
/* Advance to the next element. */
@@ -557,6 +584,7 @@ arandom_r10 (gfc_array_r10 *x)
index_type stride0;
index_type dim;
GFC_REAL_10 *dest;
+ xorshift1024star_state* rs = get_rand_state();
int n;
dest = x->base_addr;
@@ -574,13 +602,13 @@ arandom_r10 (gfc_array_r10 *x)
stride0 = stride[0];
- if (unlikely (!rand_state.init))
- init_rand_state (false);
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
while (dest)
{
/* random_r10 (dest); */
- uint64_t r = xorshift1024star ();
+ uint64_t r = xorshift1024star (rs);
rnumber_10 (dest, r);
/* Advance to the next element. */
@@ -627,6 +655,7 @@ arandom_r16 (gfc_array_r16 *x)
index_type stride0;
index_type dim;
GFC_REAL_16 *dest;
+ xorshift1024star_state* rs = get_rand_state();
int n;
dest = x->base_addr;
@@ -644,14 +673,14 @@ arandom_r16 (gfc_array_r16 *x)
stride0 = stride[0];
- if (unlikely (!rand_state.init))
- init_rand_state (false);
+ if (unlikely (!rs->init))
+ init_rand_state (rs, false);
while (dest)
{
/* random_r16 (dest); */
- uint64_t r1 = xorshift1024star ();
- uint64_t r2 = xorshift1024star ();
+ uint64_t r1 = xorshift1024star (rs);
+ uint64_t r2 = xorshift1024star (rs);
rnumber_16 (dest, r1, r2);
/* Advance to the next element. */
@@ -724,6 +753,8 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put,
gfc_array_i4 *get)
if (size != NULL)
*size = SZ + 1;
+ xorshift1024star_state* rs = get_rand_state();
+
/* Return the seed to GET data. */
if (get != NULL)
{
@@ -735,11 +766,11 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put,
gfc_array_i4 *get)
if (GFC_DESCRIPTOR_EXTENT(get,0) < (index_type) SZ + 1)
runtime_error ("Array size of GET is too small.");
- if (!rand_state.init)
- init_rand_state (false);
+ if (!rs->init)
+ init_rand_state (rs, false);
/* Unscramble the seed. */
- unscramble_seed (seed, (unsigned char *) rand_state.s, sizeof seed);
+ unscramble_seed (seed, (unsigned char *) rs->s, sizeof seed);
/* Then copy it back to the user variable. */
for (size_t i = 0; i < SZ ; i++)
@@ -748,7 +779,7 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put,
gfc_array_i4 *get)
sizeof(GFC_UINTEGER_4));
/* Finally copy the value of p after the seed. */
- get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rand_state.p;
+ get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rs->p;
}
else
@@ -761,7 +792,7 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put,
gfc_array_i4 *get)
{
getosrandom (master_state, sizeof (master_state));
njumps = 0;
- init_rand_state (true);
+ init_rand_state (rs, true);
}
if (put != NULL)
@@ -784,9 +815,9 @@ random_seed_i4 (GFC_INTEGER_4 *size, gfc_array_i4 *put,
gfc_array_i4 *get)
provide seeds with quality only in the lower or upper part. */
scramble_seed ((unsigned char *) master_state, seed, sizeof seed);
njumps = 0;
- init_rand_state (true);
+ init_rand_state (rs, true);
- rand_state.p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] % 16;
+ rs->p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] & 15;
}
@@ -809,6 +840,8 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put,
gfc_array_i8 *get)
if (size != NULL)
*size = SZ + 1;
+ xorshift1024star_state* rs = get_rand_state();
+
/* Return the seed to GET data. */
if (get != NULL)
{
@@ -820,15 +853,15 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put,
gfc_array_i8 *get)
if (GFC_DESCRIPTOR_EXTENT(get,0) < (index_type) SZ + 1)
runtime_error ("Array size of GET is too small.");
- if (!rand_state.init)
- init_rand_state (false);
+ if (!rs->init)
+ init_rand_state (rs, false);
/* This code now should do correct strides. */
for (size_t i = 0; i < SZ; i++)
- memcpy (&(get->base_addr[i * GFC_DESCRIPTOR_STRIDE(get,0)]),
&rand_state.s[i],
+ memcpy (&(get->base_addr[i * GFC_DESCRIPTOR_STRIDE(get,0)]), &rs->s[i],
sizeof (GFC_UINTEGER_8));
- get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rand_state.p;
+ get->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(get, 0)] = rs->p;
}
else
@@ -841,7 +874,7 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put,
gfc_array_i8 *get)
{
getosrandom (master_state, sizeof (master_state));
njumps = 0;
- init_rand_state (true);
+ init_rand_state (rs, true);
}
if (put != NULL)
@@ -860,22 +893,34 @@ random_seed_i8 (GFC_INTEGER_8 *size, gfc_array_i8 *put,
gfc_array_i8 *get)
sizeof (GFC_UINTEGER_8));
njumps = 0;
- init_rand_state (true);
- rand_state.p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] % 16;
+ init_rand_state (rs, true);
+ rs->p = put->base_addr[SZ * GFC_DESCRIPTOR_STRIDE(put, 0)] & 15;
}
-
__gthread_mutex_unlock (&random_lock);
}
}
iexport(random_seed_i8);
-#ifndef __GTHREAD_MUTEX_INIT
+#if !defined __GTHREAD_MUTEX_INIT || defined __GTHREADS
static void __attribute__((constructor))
-init (void)
+constructor_random (void)
{
+#ifndef __GTHREAD_MUTEX_INIT
__GTHREAD_MUTEX_INIT_FUNCTION (&random_lock);
+#endif
+ if (__gthread_active_p ())
+ __gthread_key_create (&rand_state_key, &free);
+}
+#endif
+
+#ifdef __GTHREADS
+static void __attribute__((destructor))
+destructor_random (void)
+{
+ if (__gthread_active_p ())
+ __gthread_key_delete (rand_state_key);
}
#endif