Dnia 2011-02-13, nie o godzinie 19:12 -0500, Andreas Kloeckner pisze:
> On Mon, 14 Feb 2011 00:51:13 +0100, Tomasz Rybak <[email protected]> wrote:
> > After discussion with Martin Laprise I have come with the following code
> > (see attachment). It uses all available MPs, but I think it needs
> > some code to decide whether to use entire GPU (in case generated
> > vector is long) or only few blocks (otherwise).
> > 
> > I can fix attached code to better suit PyCUDA style so you can push
> > it to git, and only then try to add code managing number of used blocks.
> 
> Please work your changes into the branch I created. The changes there
> concerned (much) more than style.

I have noticed - I like your solution.
BTW - you misspelled the names of the float2 and double2 CURAND functions;
I have fixed them in the attached patch.
Also, those functions (float2, double2) are available for the XORWOW
generator, not for Sobol32 - unless I misunderstood the purpose of the
variable has_box_muller. I have changed the value of this variable to True
in XORWOW, and to False in Sobol32, according to the CURAND documentation
(2.2.2.7 for curand_normal2 and 2.2.2.8 for curand_normal2_double).

> 
> > > - The Sobol' direction vectors need to come from a very specific set to
> > >   make sense, see curandGetDirectionVectors32 in the CURAND docs. We
> > >   should probably call/wrap this function to get those vectors. Further,
> > >   each generator should use a different vector, rather than the same
> > >   one.
> > > 
> > > - The Sobol' initialization needs to be worked out. In particular, I
> > >   would like both generators to do something sensible if they're
> > >   initialized without arguments.
> > 
> > Agree on both points. 
> 
> Ok, sounds good.

See attached patch.

I have added the field self.block_count, which equals the number of MPs
and is the number of blocks that are launched when generating random numbers.
Should I try to play with it and use fewer blocks for shorter sequences,
or just leave it as is? I would prefer leaving it as is ;-) ; for
smaller generated sequences the kernels execute quickly, so the potential
performance gains might not be worth the sophisticated code.

I have managed to use the maximum number of threads on Tesla - during
initialisation I simply launch 2*block_count blocks, each initialising
only half of the generators that are used for one block. The test case
worked on ION, and the sample program worked on Martin Laprise's machine,
so I believe this is a good solution.

After you pull changes into git I will start working on seed_getters.

> 
> Btw, please do ask Nv to clarify the CURAND license.

OK.



-- 
Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860
Fingerprint A481 824E 7DD3 9C0E C40A  488E C654 FB33 2AD5 9860
http://member.acm.org/~tomaszrybak
diff --git a/pycuda/curandom.py b/pycuda/curandom.py
index 3673fca..cb76408 100644
--- a/pycuda/curandom.py
+++ b/pycuda/curandom.py
@@ -247,10 +247,11 @@ def rand(shape, dtype=np.float32, stream=None):
 # {{{ Base class
 
 gen_template = """
-__global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, const int n) 
+__global__ void %(name)s(%(state_type)s *s, %(out_type)s *d, const int n)
 {
-  const int tidx = threadIdx.x;
-  for (int idx = tidx; idx < n; idx += blockDim.x)
+  const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
+  const int delta = blockDim.x*gridDim.x;
+  for (int idx = tidx; idx < n; idx += delta)
     d[idx] = curand%(suffix)s(&s[tidx]);
 }
 """
@@ -266,14 +267,14 @@ extern "C"
 
 __global__ void skip_ahead(%(state_type)s *s, const int n, const int skip) 
 {
-  const int idx = threadIdx.x;
+  const int idx = blockIdx.x*blockDim.x+threadIdx.x;
   if (idx < n)
     skipahead(skip, &s[idx]);
 }
 
 __global__ void skip_ahead_array(%(state_type)s *s, const int n, const int *skip) 
 {
-  const int idx = threadIdx.x;
+  const int idx = blockIdx.x*blockDim.x+threadIdx.x;
   if (idx < n)
       skipahead(skip[idx], &s[idx]);
 }
@@ -296,8 +297,8 @@ class _RandomNumberGeneratorBase(object):
         ("uniform_double", "double", "_uniform_double"),
         ("normal_float", "float", "_normal"),
         ("normal_double", "double", "_normal_double"),
-        ("normal_float2", "float2", "normal_float2"),
-        ("normal_double2", "double2", "normal_double2"),
+        ("normal_float2", "float2", "_normal2"),
+        ("normal_double2", "double2", "_normal2_double"),
         ]
 
     def __init__(self, state_type, additional_source):
@@ -309,20 +310,19 @@ class _RandomNumberGeneratorBase(object):
         # pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources
 
         dev = drv.Context.get_device()
-        if dev.compute_capability() >= (2, 0):
-            block_size = dev.get_attribute(
-                drv.device_attribute.MAX_THREADS_PER_BLOCK)
-            block_dimension =  dev.get_attribute(
-                drv.device_attribute.MAX_BLOCK_DIM_X)
-            self.generator_count = min(block_size, block_dimension)
-        else:
-            self.generator_count = 256
+        block_size = dev.get_attribute(
+            drv.device_attribute.MAX_THREADS_PER_BLOCK)
+        block_dimension =  dev.get_attribute(
+            drv.device_attribute.MAX_BLOCK_DIM_X)
+        self.generator_count = min(block_size, block_dimension)
+        self.block_count = dev.get_attribute(
+            pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT)
 
         from pycuda.characterize import sizeof
         data_type_size = sizeof(state_type, "#include <curand_kernel.h>")
 
         self.state = drv.mem_alloc(
-            self.generator_count * data_type_size)
+            self.block_count * self.generator_count * data_type_size)
 
         from pycuda.characterize import has_double_support
 
@@ -373,8 +373,8 @@ class _RandomNumberGeneratorBase(object):
         else:
             raise NotImplementedError
 
-        func.prepared_async_call((1, 1), stream, self.state,
-            data.gpudata, data.size)
+        func.prepared_async_call((self.block_count, 1), stream,
+            self.state, data.gpudata, data.size)
 
     def fill_normal(self, data, stream=None):
         if data.dtype == np.float32:
@@ -391,8 +391,8 @@ class _RandomNumberGeneratorBase(object):
 
         func = self.generators[func_name]
 
-        func.prepared_async_call((1, 1), stream, self.state,
-            data.gpudata, data_size)
+        func.prepared_async_call((self.block_count, 1), stream,
+            self.state, data.gpudata, data_size)
 
     def gen_uniform(self, shape, dtype, stream=None):
         result = array.empty(shape, dtype)
@@ -405,12 +405,12 @@ class _RandomNumberGeneratorBase(object):
         return result
 
     def call_skip_ahead(self, i, stream=None):
-        self.skip_ahead.prepared_async_call((1, 1), stream, self.state,
-            self.generator_count, i)
+        self.skip_ahead.prepared_async_call((self.block_count, 1), stream,
+            self.state, self.generator_count, i)
 
     def call_skip_ahead_array(self, i, stream=None):
-        self.skip_ahead_array.prepared_async_call((1, 1), stream, self.state,
-            self.generator_count, i.gpudata)
+        self.skip_ahead_array.prepared_async_call((self.block_count, 1),
+            stream, self.state, self.generator_count, i.gpudata)
 
 # }}}
 
@@ -421,15 +421,16 @@ extern "C" {
 __global__ void prepare_with_seeds(curandState *s, const int n,
   const int *seed, const int offset) 
 {
-  if (threadIdx.x < n)
-    curand_init(seed[threadIdx.x], threadIdx.x, offset, &s[threadIdx.x]);
+  const int id = blockIdx.x*blockDim.x+threadIdx.x;
+  if (id < n)
+    curand_init(seed[id], threadIdx.x, offset, &s[id]);
 }
 
 }
 """
 
 class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
-    has_box_muller = False
+    has_box_muller = True
 
     def __init__(self, seed_getter=None, offset=0):
         """
@@ -467,8 +468,13 @@ class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
             if has_stack:
                 drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
             try:
-                p.prepared_call((1, 1), self.state, self.generator_count,
-                    seed.gpudata, offset)
+                dev = drv.Context.get_device()
+                if dev.compute_capability() >= (2, 0):
+                    p.prepared_call((self.block_count, 1), self.state,
+                        self.block_count * self.generator_count, seed.gpudata, offset)
+                else:
+                    p.prepared_call((2 * self.block_count, 1), self.state,
+                        self.block_count * self.generator_count // 2, seed.gpudata, offset)
             except drv.LaunchError:
                 raise ValueError("Initialisation failed. Decrease number of threads.")
 
@@ -476,8 +482,6 @@ class XORWOWRandomNumberGenerator(_RandomNumberGeneratorBase):
             if has_stack:
                 drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size)
 
-
-
 # }}}
 
 # {{{ Sobol32 RNG
@@ -487,8 +491,9 @@ extern "C" {
 __global__ void prepare(curandStateSobol32 *s, const int n, unsigned int *v,
     const unsigned int o) 
 {
-  if (threadIdx.x < n)
-    curand_init(v, o, &s[threadIdx.x]);
+  const int id = blockIdx.x*blockDim.x+threadIdx.x;
+  if (id < n)
+    curand_init(v, o, &s[id]);
 }
 }
 """
@@ -500,7 +505,7 @@ class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
     and normal probability function of type int, float, and double.
     """
 
-    has_box_muller = True
+    has_box_muller = False
 
     def __init__(self, dir_vector, offset):
         super(Sobol32RandomNumberGenerator, self).__init__('curandStateSobol32',
@@ -521,8 +526,14 @@ class Sobol32RandomNumberGenerator(_RandomNumberGeneratorBase):
             if has_stack:
                 drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k
             try:
-                p.prepared_call((1, 1), self.state, self.generator_count, vector,
-                    offset)
+
+                dev = drv.Context.get_device()
+                if dev.compute_capability() >= (2, 0):
+                    p.prepared_call((self.block_count, 1), self.state,
+                        self.block_count * self.generator_count, vector, offset)
+                else:
+                    p.prepared_call((2 * self.block_count, 1), self.state,
+                        self.block_count * self.generator_count // 2, vector, offset)
             except drv.LaunchError:
                 raise ValueError("Initialisation failed. Decrease number of threads.")
 

Attachment: signature.asc
Description: This is a digitally signed message part

_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to