Hi, 

I'm attempting to write code which will allocate block and grid sizes based 
upon the machines capability for a large number of computations, large enough 
that I want to know the device's limitations: 
import pycuda.driver as cuda 
import pycuda.autoinit 
from pycuda.compiler import SourceModule 
import numpy 
a = numpy.random.randn(131072,2).astype(numpy.float32) 
a_gpu = cuda.mem_alloc(a.nbytes) 
cuda.memcpy_htod(a_gpu, a) 
mod = SourceModule(""" 
__global__ void doublify(float *a) 
{int thread =threadIdx.x+threadIdx.y*blockDim.x; 
int block=blockIdx.x*blockDim.x*blockDim.y; 
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y; 
int idx = thread+block+grid; 
a[idx] *= 2; 
} 
""") 
func = mod.get_function("doublify") 
func(a_gpu, block=(2,cuda.device_attribute.MAX_THREADS_PER_BLOCK,1), 
grid=(cuda.device_attribute.MAX_GRID_DIM_X,cuda.device_attribute.MAX_GRID_DIM_Y))
 
a_doubled = numpy.empty_like(a) 
cuda.memcpy_dtoh(a_doubled, a_gpu) 
print a 
print a_doubled 

returns: 
[[-0.51754272 -0.4062421 ] 
[-0.72260356 -0.98106903] 
[-0.10904041 0.82718426] 
..., 
[ 0.25420722 -0.59294581] 
[-1.18791282 -0.49158984] 
[ 0.45278689 0.69320816]] 
[[-1.03508544 -0.8124842 ] 
[-1.44520712 -1.96213806] 
[-0.21808082 1.65436852] 
..., 
[ 0.25420722 -0.59294581] 
[-1.18791282 -0.49158984] 
[ 0.45278689 0.69320816]] 

It's only done the computation for the first thirty lines. However the 
following code works: 
import pycuda.driver as cudaimport pycuda.driver as cuda 
import pycuda.autoinit 
from pycuda.compiler import SourceModule 
import numpy 
a = numpy.random.randn(131072,2).astype(numpy.float32) 
a_gpu = cuda.mem_alloc(a.nbytes) 
cuda.memcpy_htod(a_gpu, a) 
mod = SourceModule(""" 
__global__ void doublify(float *a) 
{int thread =threadIdx.x+threadIdx.y*blockDim.x; 
int block=blockIdx.x*blockDim.x*blockDim.y; 
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y; 
int idx = thread+block+grid; 
a[idx] *= 2; 
} 
""") 
func = mod.get_function("doublify") 
func(a_gpu, block=(2,512,1), grid=(16,16)) 
a_doubled = numpy.empty_like(a) 
cuda.memcpy_dtoh(a_doubled, a_gpu) 
print a 
print a_doubled 
import pycuda.autoinit 
from pycuda.compiler import SourceModule 
import numpy 
a = numpy.random.randn(131072,2).astype(numpy.float32) 
a_gpu = cuda.mem_alloc(a.nbytes) 
cuda.memcpy_htod(a_gpu, a) 
mod = SourceModule(""" 
__global__ void doublify(float *a) 
{int thread =threadIdx.x+threadIdx.y*blockDim.x; 
int block=blockIdx.x*blockDim.x*blockDim.y; 
int grid=blockIdx.y*gridDim.x*blockDim.x*blockDim.y; 
int idx = thread+block+grid; 
a[idx] *= 2; 
} 
""") 
func = mod.get_function("doublify") 
func(a_gpu, block=(2,512,1), grid=(16,16)) 
a_doubled = numpy.empty_like(a) 
cuda.memcpy_dtoh(a_doubled, a_gpu) 
print a 
print a_doubled 

Also, print cuda.device_attribute.MAX_THREADS_PER_BLOCK returns the string 
MAX_THREADS_PER_BLOCK. 

I'm using pyCUDA 0.94.2 
and CUDA 3.2.0 

Any suggestions would be greatly appreciated, 

-drp 
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to