Thank you for all the suggestions.
Disabling ECC gave me a ~10% boost right off the bat, and the assignment
kernel does indeed show higher performance, although __restrict__ doesn't
seem to help in this particular case. I will also give double2 a try,
although > 80% utilization seems satisfactory. My new results are below.
The new test file is attached. Thanks again! The pycuda community is really
helpful.
*$ python test_pycuda_speed.py *
*
*
*Assign kernel:*
*0.891788509501 utilization (1.0 is perfect utilization).*
*Achieved bandwidth: 121 GB/s*
*Theoretical maximum bandwidth: 136 GB/s*
*Fastest kernel execution time: 0.000314527988434*
*Optimum block shape: (112, 1, 1)*
*.*
*Copy kernel:*
*0.825055444628 utilization (1.0 is perfect utilization).*
*Achieved bandwidth: 112 GB/s*
*Theoretical maximum bandwidth: 136 GB/s*
*Fastest kernel execution time: 0.000679935991764*
*Optimum block shape: (128, 1, 1)*
*
*
*Copy kernel2:*
*0.825094285587 utilization (1.0 is perfect utilization).*
*Achieved bandwidth: 112 GB/s*
*Theoretical maximum bandwidth: 136 GB/s*
*Fastest kernel execution time: 0.00067990398407*
*Optimum block shape: (128, 1, 1)*
*..*
*----------------------------------------------------------------------*
*Ran 3 tests in 1.290s*
*
*
*OK*
On Thu, Feb 16, 2012 at 1:11 PM, marmaduke <[email protected]> wrote:
> On Thu, Feb 16, 2012, at 11:57, Jesse Lu wrote:
> > Hi everyone,
> >
> > I ran a simple experiment today, which consisted of trying to maximize
> > the
> > memory (device memory) throughput of a very simple kernel.
>
> I don't have pycuda installed at the moment, so I can't try, but your
> benchmark reads and writes from two arrays that aren't declared with
> __restrict__ (can nvcc assume they aren't aliased?). If you declared
> the arrays as unaliased, do you see an improvement? Also, if you just
> write memory (e.g., a[ind] = 1.0;) do you get better bandwidth
> utilization?
>
> The cuda prog guide also mentions that kernels that mix memory access
> and computational work will show better performance on both overall
> because the compiler can schedule simultaneous access and computation,
> but I don't know how significant that is.
>
> Marmaduke
>
import numpy as np
import unittest
from jinja2 import Template
import pycuda.autoinit
from pycuda import gpuarray as ga
from pycuda import compiler
class TestPycudaSpeed(unittest.TestCase):
""" Test Pycuda for achieving max memory bandwidth. """
def setUp(self):
""" Spaces of various sizes and dtypes to test. """
d = pycuda.autoinit.device
self.max_bandwidth = (1024 * d.memory_clock_rate / 8 * d.global_memory_bus_width * 2) / 1024**3
self.max_threads = d.max_threads_per_block
self.dims = (1024,5000)
self.is_ecc_on = d.ecc_enabled
def test_ecc_disabled(self):
""" Make sure ECC is disabled. """
self.assertTrue(self.is_ecc_on == False, \
'ECC enabled! Should be disabled for best performance.')
def test_assign_kernel(self):
""" Do a simple assign kernel. """
print '\nAssign kernel:'
# Create kernel.
cuda_source = Template("""
__global__ void copy(double *a) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
const int j = threadIdx.y + blockDim.y * blockIdx.y;
const int ind = i + {{ dims[0] }} * j;
if ((i < {{ dims[0] }}) && (j < {{ dims[1] }}))
a[ind] = 1.0;
}""").render(dims=self.dims)
mod = compiler.SourceModule(cuda_source)
func = mod.get_function('copy')
# Create test data.
a = ga.to_gpu(np.random.randn(*self.dims).astype(np.float64))
self.run_shapes(func, 1, self.dims, (a,))
def test_copy_kernel(self):
""" Do a simple copy kernel. """
# Create test data.
a = ga.to_gpu(np.random.randn(*self.dims).astype(np.float64))
b = ga.zeros_like(a)
print '\nCopy kernel:'
# Create kernel.
cuda_source = Template("""
__global__ void copy(double *a, double *b) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
const int j = threadIdx.y + blockDim.y * blockIdx.y;
const int ind = i + {{ dims[0] }} * j;
if ((i < {{ dims[0] }}) && (j < {{ dims[1] }}))
b[ind] = a[ind];
}""").render(dims=self.dims)
mod = compiler.SourceModule(cuda_source)
func = mod.get_function('copy')
self.run_shapes(func, 2, self.dims, (a, b))
print '\nCopy kernel2:'
# Create kernel.
cuda_source = Template("""
__global__ void copy(double* __restrict__ a, double* __restrict__ b) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
const int j = threadIdx.y + blockDim.y * blockIdx.y;
const int ind = i + {{ dims[0] }} * j;
if ((i < {{ dims[0] }}) && (j < {{ dims[1] }}))
b[ind] = a[ind];
}""").render(dims=self.dims)
mod = compiler.SourceModule(cuda_source)
func = mod.get_function('copy')
self.run_shapes(func, 2, self.dims, (a, b))
def run_shapes(self, func, payload, dims, input_vars):
# Make all possible shapes.
poss_shapes = [(k,1,1) for k in range(16,self.max_threads+1,16)]
# Try all shapes and record results.
res = []
for shape in poss_shapes:
grid_shape = (int(np.ceil(float(dims[0])/shape[0])), dims[1])
func.prepare([x.dtype for x in input_vars])
time = func.prepared_timed_call(grid_shape, shape, \
*[x.gpudata for x in input_vars])
res.append((time(), shape))
time, shape = min(res)
bandwidth = (8 * np.prod(dims) * payload / time) / 1024**3
print bandwidth / self.max_bandwidth, 'utilization (1.0 is perfect utilization).'
print 'Achieved bandwidth:', int(bandwidth), 'GB/s'
print 'Theoretical maximum bandwidth:', self.max_bandwidth, 'GB/s'
print 'Fastest kernel execution time:', time
print 'Optimum block shape:', shape
# Run the full test suite when executed as a script.
if __name__ == '__main__':
    unittest.main()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda