Hi,
I'm trying to run the following code on my Mac laptop to multiply a 1D vector
by a square matrix and then dot the result with another vector. It works fine
when the dimension of the matrix and vector (DIM) is small but when at test
values close the operating level (DIM ~ 1500 - 2000), it computes for a few
seconds and then crashes the laptop with an unresponsive keyboard and screen.
I've run the wiki-examples MatrixmulTIled.py with the same dimensions and that
works fine. I would be grateful if someone could point out what I am doing
wrong.
Cheers,
Matthew
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
from pycuda.curandom import rand as curand
import pycuda.autoinit
import time
DIM = 1600
BLOCK_SIZE = 16 # 512 max threads per mp
gi = np.random.randn(DIM).astype(np.float32)
gj = np.random.randn(DIM).astype(np.float32)
dissim = np.random.randn(DIM, DIM).astype(np.float32)
ans = gpuarray.empty((DIM,), np.float32)
"""
Each thread calculates for one term (axis)
"""
kernel_code_template = '''
__global__ void GOPKernel(float *gi, float *gj, float *d, float *ans)
{
// Element
int row = blockIdx.y * blockDim.y + threadIdx.y;
float val = 0;
for (int k = 0; k < %(DIM)s; ++k) {
float d_elem = d[row * %(DIM)s + k];
float gi_elem = gi[k];
float gj_elem = gj[k];
val += 0.5 * gi[row] * d_elem * gi_elem;
val += 0.5 * gj[row] * d_elem * gj_elem;
val -= gi[row] * d_elem * gj_elem;
}
ans[row] = val;
}
'''
# Get the kernel code from the template
kernel_code = kernel_code_template % {
'DIM': DIM
}
# Compile the kernel code
mod = compiler.SourceModule(kernel_code)
# Get the kernel function from the compiled module
gopker = mod.get_function("GOPKernel")
gi = gpuarray.to_gpu(gi)
gj = gpuarray.to_gpu(gj)
dissim = gpuarray.to_gpu(dissim)
gridx = DIM / BLOCK_SIZE if DIM / BLOCK_SIZE == 1 else DIM / BLOCK_SIZE + 1
gridy = DIM / BLOCK_SIZE if DIM / BLOCK_SIZE == 1 else DIM / BLOCK_SIZE + 1
# Call the function on the card
gopker(
# inputs
gi, gj, dissim,
# output
ans,
# block of multiple threads
block = (BLOCK_SIZE, BLOCK_SIZE, 1),
# grid of blocks
grid = (gridx, gridy)
)
# Get result
z = ans.get()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda