Hallo, I have a Pycuda code, which deals with two kernels. Both kernels run well separately, but when I put them together, there is a memory problem "LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered". In the second kernel "DotKernel", I can't change the values of any shared array or global array. Could you please have a look at the code? Thank you very much!
Best regards, Morvan
import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools
from pycuda.compiler import SourceModule
import numpy as np
from time import *
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import time
from scipy import signal
from numba import jit
import numba as nb
kernel_code_template = """
#include <cuComplex.h>
__global__ void OuterKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA2, cuFloatComplex *C)
{
const uint wB = %(MATRIX_SIZE_O)d;
const uint bx = blockIdx.x;
const uint by = blockIdx.y;
const uint tx = threadIdx.x;
const uint ty = threadIdx.y;
__shared__ cuFloatComplex A_y ;
__shared__ cuFloatComplex B_y ;
__shared__ cuFloatComplex beta2;
__shared__ cuFloatComplex C_add;
const uint c = wB *by + bx;
for (int index_r = 0; index_r < %(rSize)d; index_r ++)
{
for (int wA = 1; wA < %(tSize)d+1; wA ++)
{
const uint aBegin = %(tSize)d * %(BLOCK_SIZE)d * by;
const uint aEnd = aBegin + wA -1 ;
const uint aStep = %(BLOCK_SIZE)d;
const int bBegin = %(tSize)d * %(BLOCK_SIZE)d * bx;
const uint bStep = %(BLOCK_SIZE)d ;
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
{
A_y = make_cuFloatComplex(cuCrealf(A[a + wB*ty + tx]),cuCimagf(A[a + wB*ty + tx]));
B_y = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),cuCimagf(B[b + wB*ty + tx]));
C_add= cuCmulf(A_y,B_y);
}
beta2 = make_cuFloatComplex(cuCrealf(BETA2[%(tSize)d * index_r+wA-1]),cuCimagf(BETA2[%(tSize)d * index_r+wA-1]));
C_add = cuCmulf(C_add, beta2);
C[c] = cuCaddf(C[c], C_add);
}
}
}
__global__ void DotKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA, cuFloatComplex *C, cuFloatComplex *FFT,cuFloatComplex *CC)
{
const uint wA = %(MATRIX_SIZE_I)d;
const uint wB = %(MATRIX_SIZE_O)d;
const uint bx = blockIdx.x;
const uint by = blockIdx.y;
const uint aBegin = wA * %(BLOCK_SIZE)d * by;
const uint aEnd = aBegin + wA - 1;
const uint aStep = 1;
const int bBegin = %(BLOCK_SIZE)d * bx;
const uint bStep = wB;
cuFloatComplex Csub = make_cuFloatComplex(0,0);
cuFloatComplex Csub_1 = make_cuFloatComplex(0,0);
cuFloatComplex Csub_2 = make_cuFloatComplex(0,0);
__shared__ cuFloatComplex A_global[%(MATRIX_SIZE_I)d];
__shared__ cuFloatComplex C1[%(MATRIX_SIZE_I)d];
for (int index_r = 0; index_r < %(rSize)d; index_r ++)
{
for (int index_t = 0; index_t < %(tSize)d; index_t ++)
{
for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
{
A_global[i] = make_cuFloatComplex(cuCrealf(B[i*%(tSize)d + index_t]),cuCimagf(B[i*%(tSize)d + index_t]));
}
__syncthreads();
Csub = make_cuFloatComplex(0,0);
for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep)
{
Csub = cuCaddf(Csub,cuCmulf(A_global[a], C[b]));
__syncthreads();
}
const uint c = wB * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx;
C1[c] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
__syncthreads();
for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
{
A_global[i] = make_cuFloatComplex(0,0);
A_global[i] = make_cuFloatComplex(cuCrealf(FFT[i*%(rSize)d + index_r]),cuCimagf(FFT[i*%(rSize)d + index_r]));
}
CC[c] = make_cuFloatComplex(0,0);
__syncthreads();
CC[c] = cuCmulf(C1[c],A_global[c]);
Csub_1 = cuCmulf(Csub,A_global[c]);
__syncthreads();
BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);
Csub_1 = make_cuFloatComplex(0,0);
for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
{
Csub_1 = cuCaddf(Csub_1, CC[i]);
//BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
}
for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
{
A_global[i] = make_cuFloatComplex(0,0);
A_global[i] = make_cuFloatComplex(cuCrealf(A[i*%(tSize)d + index_t]),cuCimagf(A[i*%(tSize)d + index_t]));
}
CC[c] = make_cuFloatComplex(0,0);
__syncthreads();
//CC[c] = make_cuFloatComplex(cuCrealf(A_global[c]),cuCimagf(A_global[c]));
CC[c] = cuCmulf(C1[c],A_global[c]);
__syncthreads();
BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);
Csub_2 = make_cuFloatComplex(0,0);
for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
{
Csub_2 = cuCaddf(Csub_2, CC[i]);
//BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
}
//BETA[index_r* %(tSize)d + index_t] = make_cuFloatComplex(cuCrealf(Csub_2), cuCimagf(Csub_2));
BETA[index_r* %(tSize)d + index_t] = cuCdivf(Csub_1,Csub_2);
}
}
}
"""
tic = time.time()
iaa_iter = 4
c = 3e8
fc = 3.15e9
lam = c/fc
Mt = 7
Mr = 5
rmax = 300
theta = (np.arange(-80, 80, 0.5))/360*2*np.pi
r = np.arange(0, rmax, rmax/100)
tsize = theta.size
rsize = r.size
fmcw_fft = np.arange(0,Mt*Mr*theta.size) + 1j*1
fmcw_fft = fmcw_fft.reshape(Mt*Mr, theta.size).astype(np.complex64)
VX = np.zeros(Mt*Mr)
for i in range(0, Mt*Mr+1, 1):
VX[i-1] = i*lam/2
B = np.zeros((Mt*Mr, theta.size), dtype=complex)
for index_theta in range(0, theta.size, 1):
alpha = np.exp(1j*2*np.pi*fc*np.sin(theta[index_theta])/c*VX)
if iaa_iter == 0:
B[:, index_theta] = np.multiply(alpha, signal.nuttall(Mr*Mt))
else:
B[:, index_theta] = alpha
beta = np.zeros((r.size, theta.size), dtype=complex)
for index_r in range(0, r.size):
for index_theta in range(0, theta.size):
beta[index_r, index_theta] = np.dot(np.conj(B[:, index_theta]), fmcw_fft[:, index_r]) / np.dot(np.conj(B[:, index_theta]), B[:, index_theta])
RZero = np.zeros((Mr*Mt, Mr*Mt), dtype=complex)
beta = beta.astype(np.complex64)
BETA2 = (np.square(np.abs(beta)) + 1j*0).astype(np.complex64)
Bcon = np.conj(B)
MATRIX_SIZE_O = 35
MATRIX_SIZE_I = 35
BLOCK_SIZE = 1
GRID_X = MATRIX_SIZE_O
GRID_Y = MATRIX_SIZE_O
B = np.real(B).astype(np.float16) + 1j*np.imag(B).astype(np.float16)
a_cpu = B
Bcon = np.real(Bcon).astype(np.float16) + 1j*np.imag(Bcon).astype(np.float16)
b_cpu = Bcon
FFT_cpu = np.real(fmcw_fft).astype(np.float16) + 1j*np.imag(fmcw_fft).astype(np.float16)
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.to_gpu(BETA2)
c_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
FFT_gpu = gpuarray.to_gpu(FFT_cpu)
kernel_code = kernel_code_template % {
'MATRIX_SIZE_I': MATRIX_SIZE_I,
'MATRIX_SIZE_O': MATRIX_SIZE_O,
'BLOCK_SIZE': BLOCK_SIZE,
'tSize': tsize,
'rSize': rsize,
}
mod = compiler.SourceModule(kernel_code)
outer = mod.get_function("OuterKernel")
t1 = time.time()
outer(
a_gpu, b_gpu, BETA2_gpu,
c_gpu,
grid = (GRID_X, GRID_Y),
block = (BLOCK_SIZE, BLOCK_SIZE, 1),
)
t2 = time.time()
t_gpu = t2-t1
print('INFORMATION:',pycuda.driver.mem_get_info())
print('t_gpu1 = ', t_gpu)
aa_gpu = gpuarray.to_gpu(a_cpu)
bb_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.empty((rsize, tsize), np.complex64)
rinv_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
rinv_gpu = gpuarray.to_gpu(c_gpu.get())
cc_gpu = gpuarray.empty((1,MATRIX_SIZE_O), np.complex64)
mod = compiler.SourceModule(kernel_code)
dot = mod.get_function("DotKernel")
t1 = time.time()
dot(
aa_gpu,
bb_gpu,
BETA2_gpu,
rinv_gpu,
FFT_gpu,
cc_gpu,
grid = (GRID_X, GRID_Y),
block = (BLOCK_SIZE, BLOCK_SIZE, 1),
)
t2 = time.time()
t_gpu = t2-t1
print('t_gpu2 = ', t_gpu)
beta = BETA2_gpu.get()
toc = time.time()
print(toc-tic)
beta[1:12, :] = 0
plt.contourf(np.outer(r, np.sin(theta)), np.outer(r, np.cos(theta)), 20*np.log10(np.abs(beta)))
plt.show()
smime.p7s
Description: S/MIME Cryptographic Signature
_______________________________________________ PyCUDA mailing list -- [email protected] To unsubscribe send an email to [email protected]
