hi everyone.

i'm having an issue using a kernel i wrote. the goal is to simulate a
recurrent neural network. the experimental code is attached. it
contains two sequential simulations, the first one runs on the GPU,
the second one on the CPU. just for performance comparison.

everything seems to work just fine as long as:
 - the number of timesteps (num_samples) is below 20k
 - the size of the network (res_dim) is below 300 nodes

both results are the same, everything works as expected (GPU -> crazy
fast, CPU -> ridiculously slow). but as soon as these numbers get too
high, something strange happens. the computation seems to work but i
receive the following exception when copying the results back to
python via gpuarray.get()

Traceback (most recent call last):
  File "<ipython console>", line 1, in <module>
  File 
"/usr/local/lib/python2.6/dist-packages/spyderlib/widgets/externalshell/startup.py",
line 122, in runfile
    execfile(filename, glbs)
  File "/home/.../sandbox.py", line 71, in <module>
    tmp = gpu_x.get()
  File 
"/usr/local/lib/python2.6/dist-packages/pycuda-0.94.1-py2.6-linux-x86_64.egg/pycuda/gpuarray.py",
line 115, in get
    drv.memcpy_dtoh(ary, self.gpudata)
LaunchError: cuMemcpyDtoH failed: launch timeout

when i leave out the get() line everything seems to work. the same
thing happens without X running.

does someone have an idea what's going on?

greetings,
m


ps: some information about the environment:

Linux 2.6.32-30-generic #59-Ubuntu x86_64 GNU/Linux
python 2.6.5
pycuda 0.94.1
nvidia devdriver_3.2_linux_64_260.19.21
spyder 2.0.6

-- 

Martin Hammerschmied   |  student of Telematics
[email protected]  |  Graz University Of Technology
# -*- coding: utf-8 -*-

'''
Created on Apr 5, 2011

@author: Martin Hammerschmied
'''

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray
import numpy as np
import time

from pycuda.compiler import SourceModule

def gpu_iterate_node(M, N):
    """Compile and return a CUDA module simulating a recurrent network.

    The kernel iterates x <- tanh(w.x + w_in.u + w_bias) in place, one
    thread per reservoir node (threadIdx.x == node index).

    Parameters:
        M: input dimension (columns of w_in, length of u).
        N: reservoir dimension (size of w, x, w_bias; threads per block).

    Returns:
        pycuda.compiler.SourceModule exposing `iterate_node`.
    """
    mod = SourceModule("""
    __global__ void iterate_node(float w[{1}][{1}], float w_in[{1}][{0}], float *w_bias, float *x, float *u, float *iterations) {{

        int i = threadIdx.x;
        float A;

        // NOTE: the iteration count arrives as a one-element float array.
        for (long iter = 0; iter < iterations[0]; iter++) {{

            A = 0;

            // Barrier 1: make sure every thread finished writing x in the
            // previous iteration before anyone starts reading it.
            __syncthreads();
            for (int j = 0; j < {1}; j++) {{
                A += w[i][j] * x[j];
            }}

            for (int j = 0; j < {0}; j++) {{
                A += w_in[i][j] * u[j];
            }}

            // Barrier 2 (BUG FIX): without this sync, thread i could
            // overwrite x[i] while another thread is still reading x[i]
            // in its j-loop above -- a read/write race that silently
            // corrupts the state. Sync after all reads, before any write.
            __syncthreads();

            // tanhf is the single-precision intrinsic; plain tanh promotes
            // the argument to double, which is slow (or demoted) on
            // compute-capability 1.x hardware.
            x[i] = tanhf(A + w_bias[i]);
        }}
    }}
    """.format(M, N))
    return mod

res_dim = 500
inp_dim = 3
num_samples = np.float32(100000)

w = 0.002 * np.random.rand(res_dim,res_dim).astype(np.float32)
w_in = 0.1 * np.random.rand(res_dim,inp_dim).astype(np.float32)
w_bias = 0.1 * np.random.rand(res_dim).astype(np.float32)
u = np.random.rand(inp_dim).astype(np.float32)
x = np.zeros(res_dim).astype(np.float32)
y = np.zeros(res_dim).astype(np.float32)

f_iterate_node = gpu_iterate_node(inp_dim, res_dim).get_function("iterate_node")

gpu_w = gpuarray.to_gpu(w)
gpu_w_in = gpuarray.to_gpu(w_in)
gpu_w_bias = gpuarray.to_gpu(w_bias)
gpu_u = gpuarray.to_gpu(u)
gpu_x = gpuarray.to_gpu(x)
gpu_iterations = gpuarray.to_gpu(num_samples)

print "GPU start"
t = time.time()
f_iterate_node(gpu_w, gpu_w_in, gpu_w_bias, gpu_x, gpu_u, gpu_iterations, block=(res_dim,1,1))

print str(time.time() - t) + " seconds"
tmp = gpu_x.get()
print tmp

print "CPU start"
t = time.time()
for i in range(num_samples):
    x = np.tanh( np.dot(w_in,u) + np.dot(w, x) + w_bias )

print str(time.time() - t) + " seconds"
print x
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to