Thanks for all the suggestions!

Regarding removing sqrt: it seems that the code only gains about ~1%, and you lose the ability to easily define linear intervals...

I have tried with sqrt and sqrtf, but there is no difference in the total time (or it is very small).

The code to find the histogram of an array with values between 0 and 1 should be something like:

import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.cumath as cumath
from pycuda.compiler import SourceModule
from pycuda import compiler

# CUDA kernel source for the binning pass. %(interv)s is substituted by
# Python %-formatting before compilation, baking the bin count in as a
# compile-time constant.
#
# Each of the blockDim.x threads strides over `values` and accumulates
# counts into its OWN private row of `temp_grid` (row `id`, `interv`
# entries wide), so no atomics are needed — valid because the launch
# below uses a single block, making threadIdx.x unique per thread.
# Values equal to 1.0 would land in bin `interv`, one past the end;
# the `bin==interv` clamp folds them into the last bin instead.
grid_gpu_template = """
__global__ void grid(float *values, int size, float *temp_grid)
{
    unsigned int id = threadIdx.x;
    int i,bin;
    const uint interv = %(interv)s;

    for(i=id;i<size;i+=blockDim.x){
        bin=(int)(values[i]*interv);
        if (bin==interv){
           bin=interv-1;
        }
        temp_grid[id*interv+bin]+=1.0;
    }
}
"""

# CUDA kernel source for the reduction pass. %(interv)s and
# %(max_number_of_threads)s are substituted by Python %-formatting
# before compilation.
#
# One thread per histogram bin: thread `id` sums column `id` across all
# `threads` per-thread partial rows of temp_grid into his[id]. The
# `id<interv` guard handles the padding threads of the last block when
# interv is not a multiple of the block size.
reduction_gpu_template = """
__global__ void reduction(float *temp_grid, float *his)
{
    unsigned int id = blockIdx.x*blockDim.x+threadIdx.x;
    const uint interv = %(interv)s;
    const uint threads = %(max_number_of_threads)s;

    if(id<interv){
        for(int i=0;i<threads;i++){
            his[id]+=temp_grid[id+interv*i];
        }
    }
}
"""

number_of_points=100000000
max_number_of_threads=512
interv=1024

blocks=interv/max_number_of_threads
if interv%max_number_of_threads!=0:
    blocks+=1

values=np.random.random(number_of_points).astype(np.float32)

grid_gpu = grid_gpu_template % {
    'interv': interv,
}
mod_grid = compiler.SourceModule(grid_gpu)
grid = mod_grid.get_function("grid")

reduction_gpu = reduction_gpu_template % {
    'interv': interv,
    'max_number_of_threads': max_number_of_threads,
}
mod_redt = compiler.SourceModule(reduction_gpu)
redt = mod_redt.get_function("reduction")

values_gpu=gpuarray.to_gpu(values)
temp_grid_gpu =gpuarray.zeros((max_number_of_threads,interv),dtype=np.float32)
hist=np.zeros(interv,dtype=np.float32)
hist_gpu=gpuarray.to_gpu(hist)

start=time.clock()*1e3
grid (values_gpu ,np .int32 (number_of_points ),temp_grid_gpu,grid=(1,1),block=(max_number_of_threads,1,1)) redt(temp_grid_gpu,hist_gpu,grid=(blocks, 1),block=(max_number_of_threads,1,1))
hist=hist_gpu.get()
print 'Time used to grid with GPU:',time.clock()*1e3-start,' ms'


start=time.clock()*1e3
bins_histo=np.linspace(0.0,1.0,interv+1)
hist_CPU=np.histogram(values,bins=bins_histo)[0]
print 'Time used to grid with CPU:',time.clock()*1e3-start,' ms'

print 'max difference between methods=',np.max(hist_CPU-hist)


################

Results:

Time used to grid with GPU: 680.0  ms
Time used to grid with CPU: 9320.0  ms
max difference between methods= 0.0

So it seems that with this algorithm we can't achieve speed-up factors larger than ~15

Fran.



_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to