Hi everyone,
I ran a simple experiment today: trying to maximize the device-memory
throughput of a very simple copy kernel. I was slightly disappointed to
achieve only 72% of the theoretical maximum bandwidth. My GPU is a C2070.
The file is attached and is executed using:
$ python test_pycuda_speed.py
0.72196600476 utilization (1.0 is perfect utilization).
Achieved bandwidth: 98 GB/s
Theoretical maximum bandwidth: 136 GB/s
Fastest kernel execution time: 0.000777023971081
Optimum block shape: (160, 1, 1)
.
----------------------------------------------------------------------
Ran 1 test in 0.814s

OK
The questions that I have are:
- How close can others get to the theoretical peak bandwidth?
- Any suggested tweaks to increase performance?
Thanks!
Jesse
import numpy as np
import unittest
from jinja2 import Template
import pycuda.autoinit
from pycuda import gpuarray as ga
from pycuda import compiler
class TestPycudaSpeed(unittest.TestCase):
""" Test Pycuda for achieving max memory bandwidth. """
def setUp(self):
""" Spaces of various sizes and dtypes to test. """
d = pycuda.autoinit.device
self.max_bandwidth = (1024 * d.memory_clock_rate / 8 * d.global_memory_bus_width * 2) / 1024**3
self.max_threads = d.max_threads_per_block
def test_copy_kernel(self):
""" Do a simple kernel. """
dims = (1024,5000)
# Create kernel.
cuda_source = Template("""
__global__ void copy(double *a, double *b) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
const int j = threadIdx.y + blockDim.y * blockIdx.y;
const int ind = i + {{ dims[0] }} * j;
if ((i < {{ dims[0] }}) && (j < {{ dims[1] }}))
b[ind] = a[ind];
}""").render(dims=dims)
mod = compiler.SourceModule(cuda_source)
func = mod.get_function('copy')
# Create test data.
a = ga.to_gpu(np.random.randn(*dims).astype(np.float64))
b = ga.zeros_like(a)
time, shape = self.run_shapes(func, dims, (a, b))
self.assertTrue((a.get() == b.get()).all())
payload = 2
bandwidth = (8 * np.prod(dims) * payload / time) / 1024**3
print bandwidth / self.max_bandwidth, 'utilization (1.0 is perfect utilization).'
print 'Achieved bandwidth:', int(bandwidth), 'GB/s'
print 'Theoretical maximum bandwidth:', self.max_bandwidth, 'GB/s'
print 'Fastest kernel execution time:', time
print 'Optimum block shape:', shape
def run_shapes(self, func, dims, input_vars):
# Make all possible shapes.
poss_shapes = [(k,1,1) for k in range(16,self.max_threads+1,16)]
# Try all shapes and record results.
res = []
for shape in poss_shapes:
grid_shape = (int(np.ceil(float(dims[0])/shape[0])), dims[1])
func.prepare([x.dtype for x in input_vars])
time = func.prepared_timed_call(grid_shape, shape, \
*[x.gpudata for x in input_vars])
res.append((time(), shape))
best_result = min(res)
return best_result
if __name__ == '__main__':
    # Run the benchmark through the standard unittest CLI runner.
    unittest.main()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda