Sorry about that. I meant line 63.
# Run kernels many times, we will only keep data from last loop iteration.
for j in range(10):
    # First loop: record the begin event and launch the kernel on each stream.
    for k in range(n):
        event[k]['kernel_begin'].record(stream[k])
        my_kernel(d_data[k], block=(N,1,1), stream=stream[k])
    # Second loop: record the end events only after every kernel is launched.
    for k in range(n): # (this one) Comment out to break concurrency
        event[k]['kernel_end'].record(stream[k])
On Tue, Apr 5, 2011 at 3:16 PM, Andreas Kloeckner
<[email protected]> wrote:
> On Tue, 5 Apr 2011 14:08:17 -0700, Jesse Lu <[email protected]> wrote:
> > Hi again,
> >
> > Here is a working example of kernel concurrency (file attached).
> Launching
> > all the kernels before placing a record event enabled concurrency. Try it
> > for yourself, if you comment out line 57 the kernels will execute in
> serial.
> > Here is the timing info output from the program on my machine:
> >
> > === Timing info (for last set of kernel launches)
> > Dataset 0
> > kernel_begin : 40.617023468
> > kernel_end : 45.0715522766
> > Dataset 1
> > kernel_begin : 40.6191368103
> > kernel_end : 45.0742073059
> >
> > Notice that the kernel execution seems to overlap. I'm pretty certain
> that
> > launches are occurring concurrently because, with line 57 commented out,
> the
> > runtimes are essentially doubled and there is no overlap in the timing.
> Here
> > is that output:
>
> Line 57 in your code is blank? Which line do you mean?
>
> > === Timing info (for last set of kernel launches)
> > Dataset 0
> > kernel_begin : 80.6352005005
> > kernel_end : 85.0898895264
> > Dataset 1
> > kernel_begin : 85.091293335
> > kernel_end : 89.5460510254
> >
> > Finally, Andreas, should I post this on the wiki?
>
> Yes, please--this would be very good to keep around.
>
> > Also, is there any way to
> > disable the cuda api trace without reinstalling pycuda? Thanks!
>
> Nope--you need to recompile, at least for the moment.
>
> Andreas
>
>
#! /usr/bin/env python
# A simple program to illustrate kernel concurrency with PyCuda.
# Reference: Chapter 3.2.6.5 in Cuda C Programming Guide Version 3.2.
# Jesse Lu, 2011-04-04
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
#
# Set up test scenario.
#
# Create a simple test kernel.
# Each thread works on the single element selected by threadIdx.x: it doubles
# and then halves that element in nested loops (busy work whose net effect is
# to leave the value unchanged) and finally doubles it once, so one launch
# multiplies every element by 2.
# The kernel source below is kept exactly as in the original attachment.
mod = SourceModule("""
__global__ void my_kernel(float *d) {
const int i = threadIdx.x;
for (int m=0; m<100; m++) {
for (int k=0; k<100 ; k++)
d[i] = d[i] * 2.0;
for (int k=0; k<100 ; k++)
d[i] = d[i] / 2.0;
}
d[i] = d[i] * 2.0;
}
""")
my_kernel = mod.get_function("my_kernel")

# Create the test data on the host.
N = 400  # Size of datasets.
n = 2  # Number of datasets (and concurrent operations) used.
data, data_check, d_data = [], [], []
for k in range(n):
    data.append(np.random.randn(N).astype(np.float32))  # Create random data.
    data_check.append(data[k].copy())  # For checking the result afterwards.
    d_data.append(drv.mem_alloc(data[k].nbytes))  # Allocate memory on device.
#
# Start concurrency test.
#
# Use this event as a reference point.
ref = drv.Event()
ref.record()

# Create the streams and events needed: one stream per dataset, and for each
# dataset a dict holding one event per marker name.
stream, event = [], []
marker_names = ['kernel_begin', 'kernel_end']
for k in range(n):
    stream.append(drv.Stream())
    event.append(dict((name, drv.Event()) for name in marker_names))

# Transfer to device.
for k in range(n):
    drv.memcpy_htod(d_data[k], data[k])

# Run kernels many times, we will only keep data from last loop iteration.
# NOTE: the loop variable `j` is read again by the answer check further down
# (the expected scale factor is 2**(j+1)), so it must keep this name.
for j in range(10):
    # Launch every kernel first, each on its own stream ...
    for k in range(n):
        event[k]['kernel_begin'].record(stream[k])
        my_kernel(d_data[k], block=(N,1,1), stream=stream[k])
    # ... and only then record the end events. Per the author's measurements,
    # recording 'kernel_end' inside the launch loop above serialises the
    # kernels.
    for k in range(n):  # Commenting out should break concurrency
        event[k]['kernel_end'].record(stream[k])

# Transfer data back to host.
# NOTE(review): presumably this blocking copy is also what guarantees the
# recorded events have completed before time_till() is called -- confirm.
for k in range(n):
    drv.memcpy_dtoh(data[k], d_data[k])
#
# Output results.
#
# Every print below is a single-argument call with %-formatting so the script
# emits the same text under Python 2 and also runs under Python 3.
print('\n=== Device attributes')
dev = pycuda.autoinit.device
print('Name: %s' % dev.name())
# Wrapped in a 1-tuple so a tuple return value is formatted as one item.
print('Compute capability: %s' % (dev.compute_capability(),))
print('Concurrent Kernels: %s' %
      bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS)))

print('\n=== Checking answers')
# Each kernel launch doubles the data and `j` is the last index of the launch
# loop, so after j+1 launches the data should equal the original * 2**(j+1).
for k in range(n):
    if np.linalg.norm((data_check[k] * 2**(j+1)) - data[k]) == 0.0:
        verdict = 'passed.'
    else:
        verdict = 'FAILED!'
    print('Dataset %d : %s' % (k, verdict))

print('\n=== Timing info (for last set of kernel launches)')
# Report each marker event as the time elapsed since the reference event.
for k in range(n):
    print('Dataset %d' % k)
    for name in marker_names:
        print('%s : %s' % (name, ref.time_till(event[k][name])))
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda