import pyfft.cuda as pyfft
import numpy.fft as cpufft
import pycuda.driver as cuda
import pycuda.autoinit, time
import pycuda.gpuarray as gpuarray
import numpy as N
import pycuda.tools

# Element types for the benchmark. typeC is the dtype handed to pyfft.Plan and
# used for the host-side arrays created further down.
typeC = N.complex128
# Real-valued counterpart; not referenced in the visible part of this script.
typeR = N.float64

def to_gpu(x):
    """Upload a NumPy array to the GPU, copying it first if it is strided.

    The array interface reports ``strides`` as ``None`` for a C-style
    contiguous array; in that case ``x`` is passed to ``gpuarray.to_gpu``
    unchanged. For any other layout ``x.copy()`` is uploaded instead.

    Returns whatever ``gpuarray.to_gpu`` returns.
    """
    # Identity test against the None singleton (not ``==``).
    if x.__array_interface__['strides'] is None:
        return gpuarray.to_gpu(x)
    return gpuarray.to_gpu(x.copy())

# Benchmark: for each of NN2 rounds, upload one complex vector, run four
# Plan.execute calls on it, and download the result. The whole loop is timed
# with the wall clock.
NN  = 1024*16*32*4  # Size of FFT
# Floor division keeps NN2 an int under both integer and true division; it is
# used below as an array dimension and as the range() bound.
NN2 = 1024//32      # Number of FFT
Plan = pyfft.Plan((NN,), dtype=typeC)
# One output row per round (NN2 x NN complex128 values, about 1 GiB of host
# memory).
fft = N.zeros((NN2, NN), typeC)
a = N.array(N.random.rand(NN), typeC)
startTime = time.time()
for ii in range(NN2):
    a_g = to_gpu(a)   # To GPU
    for _ in range(4):  # Do 4 fft's on the same device array
        Plan.execute(a_g)
    fft[ii,:] = a_g.get() # From GPU

# A single pre-formatted argument prints the same text whether print is a
# statement or a function; the two spaces reproduce the separator that the
# original comma form emitted after "Time: ".
print("Time:  %s" % (time.time()-startTime))

del Plan  # Profiler does not want to work without this.
pycuda.tools.clear_context_caches()