import pyfft.cuda as pyfft
import numpy.fft as cpufft
import pycuda.driver as cuda
import pycuda.autoinit, time
import pycuda.gpuarray as gpuarray
import numpy as N

# Element types used by the script: double-precision real and complex.
typeR, typeC = N.float64, N.complex128

def to_gpu(x,stream=None):
    """Stage *x* in page-locked host memory and start an asynchronous upload.

    x      -- host array to transfer; it is copied, not modified.
    stream -- forwarded unchanged to gpuarray.to_gpu_async (default None).

    Returns whatever gpuarray.to_gpu_async returns for the staged copy.
    """
    # Allocate a page-locked buffer shaped like x and fill it with x's data.
    pinned = cuda.pagelocked_empty_like(x)
    pinned[:] = x
    device_array = gpuarray.to_gpu_async(pinned, stream=stream)
    return device_array

NN  = 1024*16*32*4
NN2 = 1024/32

stream1, stream2 = cuda.Stream(), cuda.Stream()
event1, event2   = cuda.Event(), cuda.Event()

Plan1 = pyfft.Plan((NN,),dtype=typeC,stream=stream1,\
                       wait_for_finish=False, fast_math=True)
Plan2 = pyfft.Plan((NN,),dtype=typeC,stream=stream2,\
                       wait_for_finish=False, fast_math=True)

fft = N.zeros((NN2,NN),typeC)

a1=N.array(N.random.rand(NN),typeC)
a1pl = cuda.pagelocked_empty_like(a1)
a1pl[:] = a1[:]
a2=N.array(N.random.rand(NN),typeC)
a2pl = cuda.pagelocked_empty_like(a2)
a2pl[:] = a2[:]

startTime = time.time()
print time.time()-startTime
a1_g = gpuarray.to_gpu_async(a1pl,stream=stream1)
print time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print time.time()-startTime
Plan1.execute(a1_g,wait_for_finish=False)
print time.time()-startTime
fft1 = a1_g.get_async(stream=stream1)
print time.time()-startTime
a2_g = gpuarray.to_gpu_async(a2pl,stream=stream2)
print time.time()-startTime
Plan2.execute(a2_g,wait_for_finish=False)
Plan2.execute(a2_g,wait_for_finish=False)
Plan2.execute(a2_g,wait_for_finish=False)
Plan2.execute(a2_g,wait_for_finish=False)
fft2 = a2_g.get_async(stream=stream2)

for ii in range(1,NN2/2):
    stream1.synchronize()
    fft[(ii-1)*2,:]=fft1   # Should be correct data after sync
    a1_g = gpuarray.to_gpu_async(a1pl,stream=stream1)
    Plan1.execute(a1_g,wait_for_finish=False)
    Plan1.execute(a1_g,wait_for_finish=False)
    Plan1.execute(a1_g,wait_for_finish=False)
    Plan1.execute(a1_g,wait_for_finish=False)
    fft1 = a1_g.get_async(stream=stream1)
    stream2.synchronize()
    fft[(ii-1)*2+1,:]=fft2  # Should be correct data after sync
    a2_g = gpuarray.to_gpu_async(a2pl,stream=stream2)
    Plan2.execute(a2_g,wait_for_finish=False)
    Plan2.execute(a2_g,wait_for_finish=False)
    Plan2.execute(a2_g,wait_for_finish=False)
    Plan2.execute(a2_g,wait_for_finish=False)
    fft2 = a2_g.get_async(stream=stream2)

stream1.synchronize()
fft[NN2-2,:]=fft1
stream2.synchronize()
fft[NN2-1,:]=fft2

print "Time: ",time.time()-startTime

# Import pycuda.tools explicitly: the call below needs the submodule, and no
# import statement in this file names it directly.
import pycuda.tools

# Release the plans and streams, then clear PyCUDA's context caches.
del Plan1, Plan2, stream1, stream2
pycuda.tools.clear_context_caches()
