Dnia 2011-02-13, nie o godzinie 19:12 -0500, Andreas Kloeckner pisze: > On Mon, 14 Feb 2011 00:51:13 +0100, Tomasz Rybak <[email protected]> wrote: > > After discussion with Martin Laprise I have come with the following code > > (see attachment). It uses all available MPs, but I think it needs > > some code to decide whether to use entire GPU (in case generated > > vector is long) or only few blocks (otherwise). > > > > I can fix attached code to better suit PyCUDA style so you can push > > it to git, and only then try to add code managing number of used blocks. > > Please work your changes into the branch I created. The changes there > concerned (much) more than style.
I have noticed - I like your solution. BTW - you misspelled the names of the float2 and double2 CURAND functions; I have fixed them in the attached patch. Also, those functions (float2, double2) are available for the XORWOW generator, not for Sobol32 - unless I misunderstood the purpose of the variable has_box_muller. I have changed the value of this variable to True in XORWOW, and to False in Sobol32, according to the CURAND documentation (2.2.2.7 for curand_normal2 and 2.2.2.8 for curand_normal2_double). > > > > - The Sobol' direction vectors need to come from a very specific set to > > > make sense, see curandGetDirectionVectors32 in the CURAND docs. We > > > should probably call/wrap this function to get those vectors. Further, > > > each generator should use a different vector, rather than the same > > > one. > > > > > > - The Sobol' initialization needs to be worked out. In particular, I > > > would like both generators to do something sensible if they're > > > initialized without arguments. > > > > Agree on both points. > > Ok, sounds good. See attached patch. I have added the field self.block_count, which equals the number of MPs and is the number of blocks that are run for generating random numbers. Should I try to play with it and use fewer blocks for shorter sequences, or just leave it as is? I would prefer leaving it as is ;-) ; for smaller generated sequences kernels execute quickly, so the potential performance gains might not be worth the sophisticated code. I have managed to use the maximum number of threads on Tesla - during initialisation I am just calling 2*blocks, each initialising only half of the generators that are used for one block. The test case worked on ION, and the sample program worked on Martin Laprise's machine, so I believe this is a good solution. After you pull the changes into git I will start working on seed_getters. > > Btw, please do ask Nv to clarify the CURAND license. OK. 
-- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 3673fca..cb76408 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -247,10 +247,11 @@ def rand(shape, dtype=np.float32, stream=None):
# {{{ Base class
gen_template = """
-__global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, const int n)
+__global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, const int n)
{
- const int tidx = threadIdx.x;
- for (int idx = tidx; idx < n; idx += blockDim.x)
+ const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
+ const int delta = blockDim.x*gridDim.x;
+ for (int idx = tidx; idx < n; idx += delta)
d[idx] = curand%(suffix)s(&s[tidx]);
}
"""
@@ -266,14 +267,14 @@ extern "C"
__global__ void skip_ahead(%(state_type)s *s, const int n, const int skip)
{
- const int idx = threadIdx.x;
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
skipahead(skip, &s[idx]);
}
__global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip)
{
- const int idx = threadIdx.x;
+ const int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < n)
skipahead(skip[idx], &s[idx]);
}
@@ -296,8 +297,8 @@ class _RandomNumberGeneratorBase(object):
("uniform_double", "double", "_uniform_double"),
("normal_float", "float", "_normal"),
("normal_double", "double", "_normal_double"),
- ("normal_float2", "float2", "normal_float2"),
- ("normal_double2", "double2", "normal_double2"),
+ ("normal_float2", "float2", "_normal2"),
+ ("normal_double2", "double2", "_normal2_double"),
]
def __init__(self, state_type, additional_source):
@@ -309,20 +310,19 @@ class _RandomNumberGeneratorBase(object):
# pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
dev = drv.Context.get_device()
- if dev.compute_capability() >= (2, 0):
- block_size = dev.get_attribute(
- drv.device_attribute.MAX_THREADS_PER_BLOCK)
- block_dimension = dev.get_attribute(
- drv.device_attribute.MAX_BLOCK_DIM_X)
- self.generator_count = min(block_size, block_dimension)
- else:
- self.generator_count = 256
+ block_size = dev.get_attribute(
+ drv.device_attribute.MAX_THREADS_PER_BLOCK)
+ block_dimension = dev.get_attribute(
+ drv.device_attribute.MAX_BLOCK_DIM_X)
+ self.generator_count = min(block_size, block_dimension)
+ self.block_count = dev.get_attribute(
+ pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT)
from pycuda.characterize import sizeof
data_type_size = sizeof(state_type, "#include <curand_kernel.h>")
self.state = drv.mem_alloc(
- self.generator_count * data_type_size)
+ self.block_count * self.generator_count * data_type_size)
from pycuda.characterize import has_double_support
@@ -373,8 +373,8 @@ class _RandomNumberGeneratorBase(object):
else:
raise NotImplementedError
- func.prepared_async_call((1, 1), stream, self.state,
- data.gpudata, data.size)
+ func.prepared_async_call((self.block_count, 1), stream,
+ self.state, data.gpudata, data.size)
def fill_normal(self, data, stream=None):
if data.dtype == np.float32:
@@ -391,8 +391,8 @@ class _RandomNumberGeneratorBase(object):
func = self.generators[func_name]
- func.prepared_async_call((1, 1), stream, self.state,
- data.gpudata, data_size)
+ func.prepared_async_call((self.block_count, 1), stream,
+ self.state, data.gpudata, data_size)
def gen_uniform(self, shape, dtype, stream=None):
result = array.empty(shape, dtype)
@@ -405,12 +405,12 @@ class _RandomNumberGeneratorBase(object):
return result
def call_skip_ahead(self, i, stream=None):
- self.skip_ahead.prepared_async_call((1, 1), stream, self.state,
- self.generator_count, i)
+ self.skip_ahead.prepared_async_call((self.block_count, 1), stream,
+ self.state, self.generator_count, i)
def call_skip_ahead_array(self, i, stream=None):
- self.skip_ahead_array.prepared_async_call((1, 1), stream, self.state,
- self.generator_count, i.gpudata)
+ self.skip_ahead_array.prepared_async_call((self.block_count, 1),
+ stream, self.state, self.generator_count, i.gpudata)
# }}}
@@ -421,15 +421,16 @@ extern "C" {
__global__ void prepare_with_seeds(curandState *s, const int n,
const int *seed, const int offset)
{
- if (threadIdx.x < n)
- curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(seed[id], threadIdx.x, offset, &s[id]);
}
}
"""
class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
- has_box_muller = False
+ has_box_muller = True
def __init__(self, seed_getter=None, offset=0):
"""
@@ -467,8 +468,13 @@ class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
if has_stack:
drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
try:
- p.prepared_call((1, 1), self.state, self.generator_count,
- seed.gpudata, offset)
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generator_count, seed.gpudata, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generator_count // 2, seed.gpudata, offset)
except drv.LaunchError:
raise ValueError("Initialisation failed. Decrease number of threads.")
@@ -476,8 +482,6 @@ class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
if has_stack:
drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
-
-
# }}}
# {{{ Sobol32 RNG
@@ -487,8 +491,9 @@ extern "C" {
__global__ void prepare(curandStateSobol32 *s, const int n, unsigned int *v,
const unsigned int o)
{
- if (threadIdx.x < n)
- curand_init(v, o, &s[threadIdx.x]);
+ const int id = blockIdx.x*blockDim.x+threadIdx.x;
+ if (id < n)
+ curand_init(v, o, &s[id]);
}
}
"""
@@ -500,7 +505,7 @@ class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
and normal probability function of type int, float, and double.
"""
- has_box_muller = True
+ has_box_muller = False
def __init__(self, dir_vector, offset):
super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32',
@@ -521,8 +526,14 @@ class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
if has_stack:
drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
try:
- p.prepared_call((1, 1), self.state, self.generator_count, vector,
- offset)
+
+ dev = drv.Context.get_device()
+ if dev.compute_capability() >= (2, 0):
+ p.prepared_call((self.block_count, 1), self.state,
+ self.block_count * self.generator_count, vector, offset)
+ else:
+ p.prepared_call((2 * self.block_count, 1), self.state,
+ self.block_count * self.generator_count // 2, vector, offset)
except drv.LaunchError:
raise ValueError("Initialisation failed. Decrease number of threads.")
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
