Thanks for all the suggestions!

Regarding removing sqrt: it seems that the code only gains about ~1%, and you lose the ability to easily define linear intervals...

I have tried with sqrt and sqrtf, but there is no difference in the total time (or it is very small).

The code to find the histogram of an array with values between 0 and 1 should be something like:

import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.compiler import SourceModule
from pycuda import compiler

# CUDA kernel source for the binning pass. %(interv)s is substituted by
# Python %-formatting before compilation, baking the bin count in as a
# compile-time constant.
#
# Each of the blockDim.x threads strides over `values` and accumulates
# counts into its OWN private row of `temp_grid` (row `id`, `interv`
# entries wide), so no atomics are needed — valid because the launch
# below uses a single block, making threadIdx.x unique per thread.
# Values equal to 1.0 would land in bin `interv`, one past the end;
# the `bin==interv` clamp folds them into the last bin instead.
grid_gpu_template = """
__global__ void grid(float *values, int size, float *temp_grid)
{
    unsigned int id = threadIdx.x;
    int i,bin;
    const uint interv = %(interv)s;

    for(i=id;i<size;i+=blockDim.x){
        bin=(int)(values[i]*interv);
        if (bin==interv){
           bin=interv-1;
        }
        temp_grid[id*interv+bin]+=1.0;
    }
}
"""

# CUDA kernel source for the reduction pass. %(interv)s and
# %(max_number_of_threads)s are substituted by Python %-formatting
# before compilation.
#
# One thread per histogram bin: thread `id` sums column `id` across all
# `threads` per-thread partial rows of temp_grid into his[id]. The
# `id<interv` guard handles the padding threads of the last block when
# interv is not a multiple of the block size.
reduction_gpu_template = """
__global__ void reduction(float *temp_grid, float *his)
{
    unsigned int id = blockIdx.x*blockDim.x+threadIdx.x;
    const uint interv = %(interv)s;
    const uint threads = %(max_number_of_threads)s;

    if(id<interv){
        for(int i=0;i<threads;i++){
            his[id]+=temp_grid[id+interv*i];
        }
    }
}
"""

number_of_points=100000000
max_number_of_threads=512
interv=1024

blocks=interv/max_number_of_threads
if interv%max_number_of_threads!=0:
    blocks+=1

values=np.random.random(number_of_points).astype(np.float32)

grid_gpu = grid_gpu_template % {
    'interv': interv,
}
mod_grid = compiler.SourceModule(grid_gpu)
grid = mod_grid.get_function("grid")

reduction_gpu = reduction_gpu_template % {
    'interv': interv,
    'max_number_of_threads': max_number_of_threads,
}
mod_redt = compiler.SourceModule(reduction_gpu)
redt = mod_redt.get_function("reduction")

values_gpu=gpuarray.to_gpu(values)
temp_grid_gpu =gpuarray.zeros((max_number_of_threads,interv),dtype=np.float32)
hist=np.zeros(interv,dtype=np.float32)
hist_gpu=gpuarray.to_gpu(hist)

start=time.clock()*1e3
grid (values_gpu ,np .int32 (number_of_points ),temp_grid_gpu,grid=(1,1),block=(max_number_of_threads,1,1)) redt(temp_grid_gpu,hist_gpu,grid=(blocks, 1),block=(max_number_of_threads,1,1))
hist=hist_gpu.get()
print 'Time used to grid with GPU:',time.clock()*1e3-start,' ms'


start=time.clock()*1e3
bins_histo=np.linspace(0.0,1.0,interv+1)
hist_CPU=np.histogram(values,bins=bins_histo)[0]
print 'Time used to grid with CPU:',time.clock()*1e3-start,' ms'

print 'max difference between methods=',np.max(hist_CPU-hist)


################

Results:

Time used to grid with GPU: 680.0  ms
Time used to grid with CPU: 9320.0  ms
max difference between methods= 0.0

So it seems that with this algorithm we can't achieve speed-up factors larger than ~15

Fran.



_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to