Hi everyone,
I ran a simple experiment today: trying to maximize the device-memory
throughput of a very simple copy kernel. I was slightly disappointed to
achieve only 72% of the theoretical maximum bandwidth. My GPU is a C2070.
The file is attached and is executed using:
$ python test_pycuda_speed.py
0.72196600476 utilization (1.0 is perfect utilization).
Achieved bandwidth: 98 GB/s
Theoretical maximum bandwidth: 136 GB/s
Fastest kernel execution time: 0.000777023971081
Optimum block shape: (160, 1, 1)
.
----------------------------------------------------------------------
Ran 1 test in 0.814s

OK
The questions that I have are:
- How close can others get to the theoretical peak bandwidth?
- Any suggested tweaks to increase performance?
Thanks!
Jesse
import numpy as np
import unittest
from jinja2 import Template
import pycuda.autoinit
from pycuda import gpuarray as ga
from pycuda import compiler
class TestPycudaSpeed(unittest.TestCase):
""" Test Pycuda for achieving max memory bandwidth. """
def setUp(self):
""" Spaces of various sizes and dtypes to test. """
d = pycuda.autoinit.device
self.max_bandwidth = (1024 * d.memory_clock_rate / 8 * d.global_memory_bus_width * 2) / 1024**3
self.max_threads = d.max_threads_per_block
def test_copy_kernel(self):
""" Do a simple kernel. """
dims = (1024,5000)
# Create kernel.
cuda_source = Template("""
__global__ void copy(double *a, double *b) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
const int j = threadIdx.y + blockDim.y * blockIdx.y;
const int ind = i + {{ dims[0] }} * j;
if ((i < {{ dims[0] }}) && (j < {{ dims[1] }}))
b[ind] = a[ind];
}""").render(dims=dims)
mod = compiler.SourceModule(cuda_source)
func = mod.get_function('copy')
# Create test data.
a = ga.to_gpu(np.random.randn(*dims).astype(np.float64))
b = ga.zeros_like(a)
time, shape = self.run_shapes(func, dims, (a, b))
self.assertTrue((a.get() == b.get()).all())
payload = 2
bandwidth = (8 * np.prod(dims) * payload / time) / 1024**3
print bandwidth / self.max_bandwidth, 'utilization (1.0 is perfect utilization).'
print 'Achieved bandwidth:', int(bandwidth), 'GB/s'
print 'Theoretical maximum bandwidth:', self.max_bandwidth, 'GB/s'
print 'Fastest kernel execution time:', time
print 'Optimum block shape:', shape
def run_shapes(self, func, dims, input_vars):
# Make all possible shapes.
poss_shapes = [(k,1,1) for k in range(16,self.max_threads+1,16)]
# Try all shapes and record results.
res = []
for shape in poss_shapes:
grid_shape = (int(np.ceil(float(dims[0])/shape[0])), dims[1])
func.prepare([x.dtype for x in input_vars])
time = func.prepared_timed_call(grid_shape, shape, \
*[x.gpudata for x in input_vars])
res.append((time(), shape))
best_result = min(res)
return best_result
if __name__ == '__main__':
    # Run the benchmark through the standard unittest CLI runner.
    unittest.main()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda