Hello, when you use the erf function with PyCUDA, execution takes almost 2x longer than with nvcc, and the exp function takes about 30% longer. I use these functions heavily in my PyCUDA program, which makes it about 3x slower overall than the nvcc-compiled version. I have attached two small example programs that can be run straight away. Regards, Michiel.
add_loop.cu
Description: Binary data
import numpy
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
# Number of array elements processed by the benchmark kernel.
N = 10000000
# Fixed launch configuration (64 blocks x 64 threads). The kernel's
# strided loop lets this fixed grid cover all N elements regardless
# of the grid size.
NrOfBlocks = 64
NrOfThreads = 64
def getSource():
    """Compile and return the CUDA module containing the ``add`` kernel.

    The kernel adds two float arrays element-wise using a grid-stride
    loop, so the fixed (NrOfBlocks x NrOfThreads) launch covers all N
    elements.

    It calls the single-precision ``expf()`` (and ``erff()`` in the
    commented benchmark variant) rather than ``exp()``/``erf()``: the
    buffers are ``float``, and the double-precision functions force a
    promotion to double on every element, which is what made this
    PyCUDA version roughly 2x slower than the standalone nvcc build.

    Returns:
        pycuda.compiler.SourceModule: the compiled module; fetch the
        kernel with ``mod.get_function("add")``.
    """
    # Inject N from the module-level constant so the kernel and the
    # host code can never disagree about the array length.
    return SourceModule("""
#define N %(n)d
__global__ void add( float* a, float* b, float* c )
{
    // Grid-stride loop: each thread processes every
    // (gridDim.x * blockDim.x)-th element starting at its global id.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    while (tid < N)
    {
        // Alternative bodies used when benchmarking:
        // c[tid] = a[tid] + b[tid];
        // c[tid] = erff(a[tid]) + erff(b[tid]);
        c[tid] = expf(a[tid]) + expf(b[tid]);
        tid += gridDim.x * blockDim.x;
    }
}
""" % {"n": N})
def main():
    """Benchmark one launch of the ``add`` kernel over N float elements.

    Allocates host/device buffers, uploads the inputs, times the kernel
    launch with CUDA events, and prints the elapsed milliseconds.
    """
    print("N: %d" % N)  # parenthesized %-format works on Python 2 and 3

    # Host-side data: a = [0..N) as float32, b = 2*a, c = output buffer.
    a = numpy.arange(N).astype(numpy.float32)
    b = 2 * a  # scalar multiply already returns a new array; no .copy() needed
    c = numpy.zeros(N, dtype=numpy.float32)  # allocate float32 directly
                                             # instead of zeros().astype()

    mod = getSource()
    # Fetch the kernel BEFORE starting the timer so the lookup cost does
    # not pollute the kernel timing.
    func = mod.get_function("add")

    # Device buffers and input upload (.nbytes == size * itemsize).
    a_gpu = cuda.mem_alloc(a.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    c_gpu = cuda.mem_alloc(c.nbytes)
    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)

    # Time only the kernel launch/execution with CUDA events.
    cudaStart = cuda.Event()
    cudaStop = cuda.Event()
    cudaStart.record()
    func(a_gpu, b_gpu, c_gpu, block=(NrOfThreads, 1, 1), grid=(NrOfBlocks, 1))
    cudaStop.record()
    cudaStop.synchronize()  # wait for the kernel before reading the result

    cuda.memcpy_dtoh(c, c_gpu)
    print("Time (ms): %f" % cudaStart.time_till(cudaStop))
# Script entry point: run the benchmark only when executed directly,
# not when this file is imported as a module.
if __name__ == "__main__":
    main()
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
