Hi everyone,
I have been trying to create a kernel concurrency example for pycuda but
have been unsuccessful. Can anyone point out what I'm doing wrong? Thanks in
advance!
My PyCUDA file is attached, and the output that I'm getting on my system
is:
=== Device attributes
Name: GeForce GTX 480
Compute capability: (2, 0)
Concurrent Kernels: True
=== Checking answers
Dataset 0 : passed.
Dataset 1 : passed.
=== Timing info (for last set of kernel launches)
Dataset 0
kernel_begin : 6.64652776718
kernel_end : 6.67667198181
Dataset 1
kernel_begin : 6.67804813385
kernel_end : 6.70713615417
Notice that the second kernel launch (which operates on Dataset 1) only
begins once the first launch (on Dataset 0) has already completed.
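For what it's worth, a quick way to quantify this from the events I already record would be something along these lines (untested sketch, reusing the variable names from the attached script):

dur0 = event[0]['kernel_begin'].time_till(event[0]['kernel_end'])
stagger = event[0]['kernel_begin'].time_till(event[1]['kernel_begin'])
print 'kernel 0 duration (ms):', dur0
print 'launch stagger 0 -> 1 (ms):', stagger
# If stagger < dur0, the second kernel started while the first was still running.

With the numbers above, the stagger is larger than the kernel duration, which is what makes me think the launches are being serialized.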
Jesse
#! /usr/bin/env python
# A simple program to illustrate kernel concurrency with PyCuda.
# Reference: Section 3.2.6.5 of the CUDA C Programming Guide, Version 3.2.
# Jesse Lu, 2011-04-04
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
#
# Set up test scenario.
#
# Create a simple test kernel.
mod = SourceModule("""
__global__ void my_kernel(float *d) {
    const int i = threadIdx.x;
    d[i] = 2 * d[i];
}
""")
my_kernel = mod.get_function("my_kernel")
# Create the test data on the host.
N = 400 # Size of datasets.
n = 2 # Number of datasets (and concurrent operations) used.
data, data_check, d_data = [], [], []
for k in range(n):
    data.append(np.random.randn(N).astype(np.float32)) # Create random data.
    data_check.append(data[k].copy()) # For checking the result afterwards.
    d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device.
#
# Start concurrency test.
#
# Use this event as a reference point.
ref = drv.Event()
ref.record()
# Create the streams and events needed.
stream, event = [], []
marker_names = ['kernel_begin', 'kernel_end']
for k in range(n):
    stream.append(drv.Stream())
    event.append(dict([(marker_names[l], drv.Event()) for l in range(len(marker_names))]))
# Transfer to device.
for k in range(n):
    drv.memcpy_htod(d_data[k], data[k])
# Run the kernels many times; we only keep timing data from the last loop iteration.
for j in range(100):
    for k in range(n):
        event[k]['kernel_begin'].record(stream[k])
        my_kernel(d_data[k], block=(N,1,1), stream=stream[k])
        event[k]['kernel_end'].record(stream[k])
# Transfer data back to host.
for k in range(n):
    drv.memcpy_dtoh(data[k], d_data[k])
#
# Output results.
#
print '\n=== Device attributes'
dev = pycuda.autoinit.device
print 'Name:', dev.name()
print 'Compute capability:', dev.compute_capability()
print 'Concurrent Kernels:', \
    bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS))
print '\n=== Checking answers'
for k in range(n):
    print 'Dataset', k, ':',
    if (np.linalg.norm(2**(j+1)*data_check[k] - data[k]) == 0.0):
        print 'passed.'
    else:
        print 'FAILED!'
print '\n=== Timing info (for last set of kernel launches)'
for k in range(n):
    print 'Dataset', k
    for l in range(len(marker_names)):
        print marker_names[l], ':', ref.time_till(event[k][marker_names[l]])
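In case it helps anyone reproduce this: one variation I still want to try is recording events only around the whole batch of launches instead of between every kernel, on the (unverified) guess that the per-launch event records themselves introduce a dependency between the streams. An untested sketch of that timing loop, reusing the names from the script above:

batch_begin, batch_end = drv.Event(), drv.Event()
batch_begin.record()
for j in range(100):
    for k in range(n):
        # No event records between launches; each kernel still goes to its own stream.
        my_kernel(d_data[k], block=(N, 1, 1), stream=stream[k])
batch_end.record()
batch_end.synchronize()
print 'Total time for all launches (ms):', batch_begin.time_till(batch_end)

If the total time drops well below 100 * n * (single-kernel duration), that would suggest the launches do overlap once the intermediate event records are removed.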