Dear PyCuda community,

First of all I would like to introduce myself: I am a scientific
developer and I am pretty new to PyCuda (even if I followed a CUDA
course). I would like to port part of a very big application to GPU,
switching from FFTw to scikit.cuda (cu_fft part). This was straight
forward, thanks to the very good abstraction done in PyCuda. I got
already speed-up of 5x with exactly the same result compared to fftw.

My problems start when integrating the code with Python threads;
the larger application will make all PyCuda calls from different
threads, and this ends with memory leaks on the GPU, crashing after a
couple of minutes. So I need to force all Python threads to use the
same context on the GPU.

I have another question: why is data1_gpu.ptr changing, whereas
data2_gpu.ptr and plan stay fixed (as expected) in my code?

Thanks for your help. 
Cheers

-- 
Jérôme Kieffer
Data analysis unit - ESRF
#!/usr/bin/python
#coding: utf8
from __future__ import with_statement
__author__ = "Jérôme Kieffer"
__contact__ = "[email protected]"
__license__ = "GPLv3+"
__copyright__ = "2011, ESRF, Grenoble"
__date__ = "20120112"
__doc__ = "This is a python module to measure image offsets using pyfftw3 or fftpack"
import os, threading, time, gc
try:
    import fftw3
except ImportError:
    fftw3 = None
try:
    import pycuda
    import pycuda.autoinit
    import pycuda.elementwise
    import pycuda.gpuarray as gpuarray
    import scikits.cuda.fft as cu_fft
except ImportError:
    cu_fft = None

import numpy


class CudaCorrelate(object):
    plans = {}
    data1_gpus = {}
    data2_gpus = {}
    multconj = None
    ctx = None
    sem = threading.Semaphore()

    def __init__(self, shape):
        self.shape = tuple(shape)
    def init(self):
        with self.__class__.sem:
            if self.ctx is None:
                self.__class__.ctx = pycuda.autoinit.context
            self.ctx.push()
            if self.shape not in self.__class__.plans:
                print "Single exec plan"
                self.__class__.plans[self.shape] = cu_fft.Plan(self.shape, numpy.complex128, numpy.complex128)
            if self.shape not in self.__class__.data1_gpus:
                print "Single exec data1"
                self.__class__.data1_gpus[self.shape] = gpuarray.empty(self.shape, numpy.complex128)
            if self.shape not in self.__class__.data2_gpus:
                print "Single exec data2"
                self.__class__.data2_gpus[self.shape] = gpuarray.empty(self.shape, numpy.complex128)
            if not self.__class__.multconj:
                self.__class__.multconj = pycuda.elementwise.ElementwiseKernel("pycuda::complex<double> *a, pycuda::complex<double> *b", "a[i]*=conj(b[i])")
            self.ctx.synchronize()
#            self.ctx.detach()
            self.ctx.pop()
    def correlate(self, data1, data2):
        self.init()
        with self.__class__.sem:
            self.ctx.push()
            plan = self.__class__.plans[self.shape]
            data1_gpu = self.__class__.data1_gpus[self.shape]
            data2_gpu = self.__class__.data2_gpus[self.shape]
            print data1_gpu.ptr, data2_gpu.ptr, plan
            data1_gpu.set(data1.astype(numpy.complex128))
            cu_fft.fft(data1_gpu, data1_gpu, plan)

            print data1_gpu.ptr, data2_gpu.ptr, plan
            data2_gpu.set(data2.astype(numpy.complex128))
            cu_fft.fft(data2_gpu, data2_gpu, plan)
    #            data1_gpu *= data2_gpu.conj()
            self.multconj(data1_gpu, data2_gpu)
            cu_fft.ifft(data1_gpu, data1_gpu, plan, True)
            res = data1_gpu.get().real
            print data1_gpu.ptr, data2_gpu.ptr, plan
            self.ctx.synchronize()
#            self.ctx.detach()
            self.ctx.pop()

if __name__ == '__main__':
    # Smoke test: first serially, then from many threads at once.
    shape = (2001, 1001)
    data1 = numpy.random.random(shape)
    data2 = numpy.random.random(shape)
    cc = CudaCorrelate(shape)
    cc.init()
    print("should be Working")
    for i in range(50):
        cc.correlate(data1, data2)
    print("Memory leaks")
    threads = []
    for i in range(50):
        t = threading.Thread(target=cc.correlate, args=(data1, data2))
        t.start()
        threads.append(t)
    # Bug fix: join the workers. Without this the main thread can exit
    # and tear down the CUDA context while threads are still using the
    # GPU, which is one source of the observed crashes.
    for t in threads:
        t.join()
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to