Dnia 2011-02-13, nie o godzinie 15:13 -0500, Andreas Kloeckner pisze: > Hi Tomasz, > > Thanks for submitting the CURAND wrapper for inclusion in PyCUDA! > > On Thu, 03 Feb 2011 12:01:55 +0100, Tomasz Rybak <[email protected]> wrote: > > Sorry if you have received my previous mail (from 2011-02-01 22:52) > > but I have not got any reply - so here I am sending it again. > > > > I have seen your email from 2010-12-26. I was in Germany, so I was > > able to send reply on 2011-01-11. I have fixed code and removed > > repetitions. CURAND wrapper already gained some interest in PyCUDA > > mailing list - Martin Laprise used it and pointed that it can be > > optimised in mail from 2011-01-18. > > > > I am attaching patch (diff to master > > f2ae507a357a3bf7949beff03963995b655fa547) > > I've put (a fair bit!) of work into your wrapper. You can find the > results of my work here: > > http://git.tiker.net/pycuda.git/shortlog/refs/heads/curand-wrapper-v2-from-tomasz > > (i.e. branch curand-wrapper-v2-from-tomasz of the main repo) > > I'm generally happy with the state of the code, but a few issues remain > that I would like you to sort out before I merge to master: > > - All computations are submitted as just a single block (grid=(1,1)) > That cannot be a good use of the GPU. We should default to creating at > least (2-3)x(number of SMs) blocks of generators, so we can fill up the > GPU.
After discussion with Martin Laprise I have come up with the following code (see attachment). It uses all available MPs, but I think it needs some code to decide whether to use the entire GPU (in case the generated vector is long) or only a few blocks (otherwise). I can fix the attached code to better suit PyCUDA style so you can push it to git, and only then try to add code managing the number of used blocks. > > - The Sobol' direction vectors need to come from a very specific set to > make sense, see curandGetDirectionVectors32 in the CURAND docs. We > should probably call/wrap this function to get those vectors. Further, > each generator should use a different vector, rather than the same > one. > > - The Sobol' initialization needs to be worked out. In particular, I > would like both generators to do something sensible if they're > initialized without arguments. Agree on both points. Can other members of the list give some comments on how they would like to use it, or how they see the API for creating the RNG? Regards. -- Tomasz Rybak <[email protected]> GPG/PGP key ID: 2AD5 9860 Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860 http://member.acm.org/~tomaszrybak
#! /usr/bin/python
import numpy
# CURAND generator state must live in device (global) memory - there is not
# enough memory (neither shared nor private) on the chip for one for each
# thread ;-)
# Size in bytes of a device-side curandState, i.e. sizeof(curandState) —
# hard-coded for CUDA 3.2; TODO confirm against the installed CURAND headers.
curand_state_size = 40
# Size in bytes of a device-side curandStateSobol32,
# i.e. sizeof(curandStateSobol32) — same caveat as above.
curand_state_sobol_size = 136
# Generic CUDA C kernels shared by all generator classes.  The %(data_type)s
# placeholder is substituted with the concrete CURAND state struct name
# (e.g. 'curandState' or 'curandStateSobol32') before compilation.
# Each generation kernel uses one generator state per thread (indexed by
# tidx) and strides over the output array in steps of the total thread
# count, so arbitrarily long outputs can be filled by a fixed thread count.
random_source = """
// Uses C++ features (templates); do not surround with extern C
#include <curand_kernel.h>
extern "C" {
__global__ void uniform_int(%(data_type)s *s, int *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand(&s[tidx]);
}
}
__global__ void uniform_float(%(data_type)s *s, float *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_uniform(&s[tidx]);
}
}
__global__ void uniform_double(%(data_type)s *s, double *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_uniform_double(&s[tidx]);
}
}
__global__ void normal_float(%(data_type)s *s, float *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_normal(&s[tidx]);
}
}
__global__ void normal_double(%(data_type)s *s, double *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_normal_double(&s[tidx]);
}
}
__global__ void skip_ahead(%(data_type)s *s, const int n, const int skip) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
if (tidx < n) {
skipahead(skip, &s[tidx]);
}
}
__global__ void skip_ahead_array(%(data_type)s *s, const int n, const int *skip) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
if (tidx < n) {
skipahead(skip[tidx], &s[tidx]);
}
}
}
"""
class RandomNumberGenerator(object):
    """
    Class surrounding CURAND kernels from CUDA 3.2.
    It allows for generating random numbers with uniform
    and normal probability function of various types.
    """
    def __init__(self, data_type, data_type_size, additional_source):
        """
        Compile the CURAND kernels and allocate device memory for the
        generator states.

        data_type -- name of the CURAND state struct (e.g. 'curandState')
        data_type_size -- sizeof() of that struct in bytes
        additional_source -- extra CUDA C source appended to the generic
            kernels (e.g. per-generator-type initialisation kernels)

        Raises EnvironmentError if the CUDA driver is older than 3.2.
        """
        import pycuda.compiler
        import pycuda.driver
        if pycuda.driver.get_version() < (3, 2, 0):
            raise EnvironmentError("Need at least CUDA 3.2")
        dev = pycuda.driver.Context.get_device()
        block_size = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
        block_dimension = dev.get_attribute(
            pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
        self.generator_count = min(block_size, block_dimension)
        if dev.compute_capability() < (2, 0):
            # Max 256 threads on ION during preparing.  Need to limit number
            # of threads on pre-Fermi hardware, otherwise execution fails:
            # pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of
            # resources
            self.generator_count = 256
        self.block_count = dev.get_attribute(
            pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT)
        # TODO: cudaThreadSetLimit(cudaLimitStackSize, 16k) on Fermi
        # One generator state per thread, one block per multiprocessor.
        self.state = pycuda.driver.mem_alloc(data_type_size *
            self.generator_count * self.block_count)
        source = str(random_source + additional_source) % {
            'data_type': data_type,
            }
        self.module = pycuda.compiler.SourceModule(source, no_extern_c=True,
            keep=True)
        # All generation kernels share the same "PPi" signature
        # (state pointer, output pointer, element count).
        for name in ("uniform_int", "uniform_float", "uniform_double",
                "normal_float", "normal_double"):
            func = self.module.get_function(name)
            func.prepare("PPi", block=(self.generator_count, 1, 1))
            setattr(self, name, func)
        self.skip_ahead = self.module.get_function("skip_ahead")
        self.skip_ahead.prepare("Pii", block=(self.generator_count, 1, 1))
        self.skip_ahead_array = self.module.get_function("skip_ahead_array")
        self.skip_ahead_array.prepare("PiP",
            block=(self.generator_count, 1, 1))
    def __del__(self):
        # Guard against partially-constructed instances: if __init__ raised
        # before mem_alloc, self.state does not exist.
        state = getattr(self, "state", None)
        if state is not None:
            state.free()
    def fill_uniform_int(self, data, input_size, stream=None):
        """Fill `input_size` ints at device pointer `data` uniformly."""
        self.uniform_int.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_uniform_float(self, data, input_size, stream=None):
        """Fill `input_size` floats at device pointer `data` uniformly."""
        self.uniform_float.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_uniform_double(self, data, input_size, stream=None):
        """Fill `input_size` doubles at device pointer `data` uniformly."""
        self.uniform_double.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_normal_float(self, data, input_size, stream=None):
        """Fill `input_size` floats at device pointer `data` normally."""
        self.normal_float.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_normal_double(self, data, input_size, stream=None):
        """Fill `input_size` doubles at device pointer `data` normally."""
        self.normal_double.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_uniform(self, data, stream=None):
        """Fill GPUArray `data` with uniform random numbers of its dtype."""
        if data.dtype == numpy.float32:
            self.fill_uniform_float(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fill_uniform_double(data.gpudata, data.size, stream)
        # `numpy.int` was an alias for the builtin `int` and has been
        # removed from NumPy; compare against the builtin directly.
        elif data.dtype in [int, numpy.int32, numpy.uint32]:
            self.fill_uniform_int(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def fill_normal(self, data, stream=None):
        """Fill GPUArray `data` with normal random numbers of its dtype."""
        # BUG FIX: original used isinstance(data.dtype, numpy.float32),
        # which is always False (dtype objects are numpy.dtype instances,
        # not scalar types), so fill_normal always raised
        # NotImplementedError.  Compare dtypes for equality instead, the
        # same way fill_uniform does.
        if data.dtype == numpy.float32:
            self.fill_normal_float(data.gpudata, data.size, stream)
        elif data.dtype == numpy.float64:
            self.fill_normal_double(data.gpudata, data.size, stream)
        else:
            raise NotImplementedError
    def call_skip_ahead(self, i, stream=None):
        """Advance every generator's sequence by `i` samples."""
        self.skip_ahead.prepared_async_call((self.block_count, 1), stream,
            self.state, self.generator_count, i)
    def call_skip_ahead_array(self, i, stream=None):
        """Advance each generator by its own amount from GPUArray `i`."""
        self.skip_ahead_array.prepared_async_call((self.block_count, 1),
            stream, self.state, self.generator_count, i.gpudata)
    def __call__(self, shape, dtype=numpy.float32, stream=None):
        """Return a fresh GPUArray of `shape`/`dtype` with uniform samples."""
        import pycuda.gpuarray
        result = pycuda.gpuarray.GPUArray(shape, dtype)
        self.fill_uniform(result, stream)
        return result
# Extra kernels for the XORWOW pseudo-random generator.  This source is
# concatenated after random_source (which already pulls in
# <curand_kernel.h>), hence no include here.  prepare_with_seed /
# prepare_with_seeds initialise one curandState per thread from a shared
# or per-thread seed; the float2/double2 kernels produce pairs of
# normally distributed values per call.
pseudo_random_source = """
extern "C" {
__global__ void prepare_with_seed(curandState *s, const int n, const int seed, const int offset) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
if (tidx < n) {
curand_init(seed, tidx, offset, &s[tidx]);
}
}
__global__ void prepare_with_seeds(curandState *s, const int n, const int *seed, const int offset) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
if (tidx < n) {
curand_init(seed[tidx], tidx, offset, &s[tidx]);
}
}
__global__ void normal_float2(curandState *s, float2 *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_normal2(&s[tidx]);
}
}
__global__ void normal_double2(curandState *s, double2 *d, const int n) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
const int delta = blockDim.x*gridDim.x;
for (int idx = tidx; idx < n; idx += delta) {
d[idx] = curand_normal2_double(&s[tidx]);
}
}
}
"""
class PseudoRandomNumberGenerator(RandomNumberGenerator):
    """
    Class surrounding CURAND kernels from CUDA 3.2.
    It allows for generating pseudo-random numbers with uniform
    and normal probability function of type int, float, double,
    float2, double2.
    """
    def __init__(self, offset, seed=None):
        """
        offset -- CURAND sequence offset passed to curand_init
        seed -- None (a random seed is drawn), a single integer shared by
            all generators, or a list/tuple/ndarray/GPUArray of int32
            seeds, one per generator

        Raises ValueError if kernel launch fails or `seed` has an
        unsupported type/dtype.
        """
        import pycuda._driver
        import pycuda.gpuarray
        super(PseudoRandomNumberGenerator, self).__init__('curandState',
            curand_state_size, pseudo_random_source)
        self.normal_float2 = self.module.get_function("normal_float2")
        self.normal_float2.prepare("PPi", block=(self.generator_count, 1, 1))
        self.normal_double2 = self.module.get_function("normal_double2")
        self.normal_double2.prepare("PPi", block=(self.generator_count, 1, 1))
        # Initialise generator states.  (Removed an unused `import sys`.)
        if seed is None:
            import random
            seed = random.randint(0, (1 << 31) - 1)
        # Also accept numpy integer scalars, which the original
        # isinstance(seed, int) check rejected.
        if isinstance(seed, (int, numpy.integer)):
            # One scalar seed shared by every generator; curand_init gives
            # each generator a distinct subsequence via its thread index.
            p = self.module.get_function("prepare_with_seed")
            p.prepare("Piii", block=(self.generator_count, 1, 1))
            try:
                p.prepared_call((self.block_count, 1), self.state,
                    self.generator_count * self.block_count, int(seed),
                    offset)
            except pycuda._driver.LaunchError:
                raise ValueError("Initialisation failed. Decrease number of threads.")
        else:
            # Per-generator seeds: normalise host sequences to an int32
            # GPUArray.  Converting on the host (seed.astype) avoids the
            # extra device-side copy the original to_gpu(...).astype made.
            if isinstance(seed, (list, tuple)):
                seed = numpy.array(seed, dtype=numpy.int32)
            if isinstance(seed, numpy.ndarray):
                seed = pycuda.gpuarray.to_gpu(seed.astype(numpy.int32))
            if (isinstance(seed, pycuda.gpuarray.GPUArray) and
                    seed.dtype == numpy.int32):
                p = self.module.get_function("prepare_with_seeds")
                p.prepare("PiPi", block=(self.generator_count, 1, 1))
                try:
                    p.prepared_call((self.block_count, 1), self.state,
                        self.generator_count * self.block_count,
                        seed.gpudata, offset)
                except pycuda._driver.LaunchError:
                    raise ValueError("Initialisation failed. Decrease number of threads.")
            else:
                raise ValueError("Need GPUArray of integers")
    def fill_normal_float2(self, data, input_size, stream=None):
        """Fill `input_size` float2 pairs at device pointer `data`."""
        self.normal_float2.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
    def fill_normal_double2(self, data, input_size, stream=None):
        """Fill `input_size` double2 pairs at device pointer `data`."""
        self.normal_double2.prepared_async_call((self.block_count, 1), stream,
            self.state, data, input_size)
# Initialisation kernel for the Sobol' quasi-random generator.
# NOTE(review): every thread passes the SAME direction-vector pointer `v`
# to curand_init, whereas (per the discussion above and the CURAND docs on
# curandGetDirectionVectors32) each generator should use a different
# direction vector — confirm before relying on the output quality.
quasi_random_source = """
// Uses C++ features (templates); do not surround with extern C
#include <curand_kernel.h>
extern "C" {
__global__ void prepare(curandStateSobol32 *s, const int n, unsigned int *v,
const unsigned int o) {
const int tidx = blockIdx.x*blockDim.x+threadIdx.x;
if (tidx < n) {
curand_init(v, o, &s[tidx]);
}
}
}
"""
class QuasiRandomNumberGenerator(RandomNumberGenerator):
    """
    Class surrounding CURAND kernels from CUDA 3.2.
    It allows for generating quasi-random numbers with uniform
    and normal probability function of type int, float, and double.
    """
    def __init__(self, vector, offset):
        """
        vector -- device pointer to the Sobol' direction vector
        offset -- CURAND sequence offset passed to curand_init

        Raises ValueError if the initialisation kernel launch fails.
        """
        import pycuda._driver
        super(QuasiRandomNumberGenerator, self).__init__(
            'curandStateSobol32', curand_state_sobol_size,
            quasi_random_source)
        total_generators = self.generator_count * self.block_count
        prepare_kernel = self.module.get_function("prepare")
        prepare_kernel.prepare("PiPi",
            block=(self.generator_count, 1, 1))
        try:
            prepare_kernel.prepared_call((self.block_count, 1),
                self.state, total_generators, vector, offset)
        except pycuda._driver.LaunchError:
            raise ValueError("Initialisation failed. Decrease number of threads.")
# Experimental kernels for the multiprocessor test below (testMP).
# prepare_with_seed here seeds each state with threadIdx.x (not the global
# thread id) as the subsequence.  NOTE(review): in normal_float, `tidx` is
# computed but never used.
mp_source = """
// Uses C++ features (templates); do not surround with extern C
#include <curand_kernel.h>
extern "C" {
__global__ void prepare_with_seed(curandState *s, const int n, const int seed, const int offset) {
int id = blockIdx.x*blockDim.x+threadIdx.x;
if (id < n) {
curand_init(seed, threadIdx.x, offset, &s[id]);
}
}
__global__ void normal_float(%(data_type)s *s, float *d, const int n) {
int id = blockIdx.x*blockDim.x+threadIdx.x;
const int tidx = threadIdx.x;
for (int idx = id; idx < n; idx += blockDim.x*gridDim.x) {
d[idx] = curand_normal(&s[id]);
}
}
}
"""
def testMP(data, input_size, stream=None):
    """
    Experimental helper: fill `input_size` floats at device pointer `data`
    with normally distributed numbers, using one block of generators per
    multiprocessor.

    Raises EnvironmentError if the CUDA driver is older than 3.2.
    """
    import pycuda.compiler
    import pycuda.driver
    import random
    if pycuda.driver.get_version() < (3, 2, 0):
        raise EnvironmentError("Need at least CUDA 3.2")
    dev = pycuda.driver.Context.get_device()
    block_size = dev.get_attribute(
        pycuda.driver.device_attribute.MAX_THREADS_PER_BLOCK)
    block_dimension = dev.get_attribute(
        pycuda.driver.device_attribute.MAX_BLOCK_DIM_X)
    generator_count = min(block_size, block_dimension)
    mp_count = dev.get_attribute(
        pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT)
    # BUG FIX: the kernels below launch mp_count blocks of generator_count
    # threads, i.e. generator_count * mp_count states in total, but the
    # original allocated (and initialised) only generator_count states —
    # every block past the first read and wrote out of bounds.
    total_generators = generator_count * mp_count
    state = pycuda.driver.mem_alloc(total_generators * curand_state_size)
    seed = random.randint(0, (1 << 31) - 1)
    source = str(mp_source) % {
        'data_type': 'curandState',
        }
    module = pycuda.compiler.SourceModule(source, no_extern_c=True, keep=True)
    p = module.get_function("prepare_with_seed")
    if dev.compute_capability() >= (2, 0):
        p.prepare("Piii", block=(generator_count, 1, 1))
        p.prepared_call((mp_count, 1), state, total_generators, seed, 0)
    else:
        # Ugly hack for non-Fermi: halve the block size and double the
        # grid, keeping the total thread count unchanged.  (// keeps this
        # an integer division on both Python 2 and 3.)
        p.prepare("Piii", block=(generator_count // 2, 1, 1))
        p.prepared_call((mp_count * 2, 1), state, total_generators, seed, 0)
    normal_float = module.get_function("normal_float")
    normal_float.prepare("PPi", block=(generator_count, 1, 1))
    normal_float.prepared_async_call((mp_count, 1), stream, state, data,
        input_size)
if __name__ == '__main__':
import pycuda.gpuarray
import pycuda.autoinit
rr = PseudoRandomNumberGenerator(0, numpy.random.random(256).astype(numpy.int32))
data = pycuda.gpuarray.zeros([10000], numpy.float32)
print data[0:200]
rr.fill_uniform_float(data.gpudata, 512)
print data[0:200]
data = rr((100, ), numpy.uint32)
del rr
print data[0:200]
data = pycuda.gpuarray.zeros([256], numpy.int)
rr = QuasiRandomNumberGenerator(data.gpudata, 1)
signature.asc
Description: This is a digitally signed message part
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
