[PyCUDA] [Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Jie Liu Tue, 12 Nov 2019 10:26:18 -0800

Hallo,

I have a Pycuda code, which deals with two kernels. Both kernels run well
separately, but when I put them together, there is  a memory problem
"LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered".
In the second kernel "DotKernel", I can't change the values of any shared
array or global array. Could you please have a look at the code? Thank you
very much!


Best regards,

Morvan

import pycuda.autoinit
from pycuda import driver, compiler, gpuarray, tools   
from pycuda.compiler import SourceModule
import numpy as np
from time import *

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import time
from scipy import signal
from numba import jit
import numba as nb

kernel_code_template = """

#include <cuComplex.h>

__global__ void OuterKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA2, cuFloatComplex *C)
{
        const uint wB = %(MATRIX_SIZE_O)d;
        const uint bx = blockIdx.x;
        const uint by = blockIdx.y;
        const uint tx = threadIdx.x;
        const uint ty = threadIdx.y;

        __shared__ cuFloatComplex A_y ;
        __shared__ cuFloatComplex B_y ;
        __shared__ cuFloatComplex beta2;
        __shared__ cuFloatComplex C_add;

        const uint c = wB  *by +  bx;

        for (int index_r = 0; index_r < %(rSize)d; index_r ++)
        {
            for (int wA = 1; wA < %(tSize)d+1; wA ++)
            {
                const uint aBegin = %(tSize)d * %(BLOCK_SIZE)d * by;
                const uint aEnd   = aBegin + wA -1 ;
                const uint aStep = %(BLOCK_SIZE)d;
                const int bBegin = %(tSize)d * %(BLOCK_SIZE)d * bx;
                const uint bStep = %(BLOCK_SIZE)d ;

                for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
                {
                    A_y = make_cuFloatComplex(cuCrealf(A[a + wB*ty + tx]),cuCimagf(A[a + wB*ty + tx]));
                    B_y = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]),cuCimagf(B[b + wB*ty + tx]));
                    C_add= cuCmulf(A_y,B_y);                                                 
                }

                beta2 = make_cuFloatComplex(cuCrealf(BETA2[%(tSize)d * index_r+wA-1]),cuCimagf(BETA2[%(tSize)d * index_r+wA-1]));
                C_add = cuCmulf(C_add, beta2);
                C[c] = cuCaddf(C[c], C_add);
            }
        }
    }

    __global__ void DotKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *BETA, cuFloatComplex *C, cuFloatComplex *FFT,cuFloatComplex *CC)
    {   
        const uint wA = %(MATRIX_SIZE_I)d;
        const uint wB = %(MATRIX_SIZE_O)d;

        const uint bx = blockIdx.x;
        const uint by = blockIdx.y;

        const uint aBegin = wA * %(BLOCK_SIZE)d * by;
        const uint aEnd   = aBegin + wA - 1;
        const uint aStep = 1;
        const int bBegin = %(BLOCK_SIZE)d * bx;
        const uint bStep = wB;

        cuFloatComplex Csub = make_cuFloatComplex(0,0);
        cuFloatComplex Csub_1 = make_cuFloatComplex(0,0);
        cuFloatComplex Csub_2 = make_cuFloatComplex(0,0);

        __shared__ cuFloatComplex A_global[%(MATRIX_SIZE_I)d];
        __shared__ cuFloatComplex C1[%(MATRIX_SIZE_I)d];

        for (int index_r = 0; index_r < %(rSize)d; index_r ++)
        {   
            for (int index_t = 0; index_t < %(tSize)d; index_t ++)
            {
                for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
                {
                    A_global[i] = make_cuFloatComplex(cuCrealf(B[i*%(tSize)d + index_t]),cuCimagf(B[i*%(tSize)d + index_t]));
                }
                __syncthreads();

                Csub = make_cuFloatComplex(0,0);

                for (int a = aBegin, b = bBegin;a <= aEnd;a += aStep, b += bStep)
                {
                    Csub = cuCaddf(Csub,cuCmulf(A_global[a], C[b]));
                    __syncthreads();
                }

                const uint c = wB * %(BLOCK_SIZE)d * by + %(BLOCK_SIZE)d * bx;
                C1[c]  = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
                __syncthreads();   

                for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
                {
                    A_global[i] = make_cuFloatComplex(0,0);
                    A_global[i] = make_cuFloatComplex(cuCrealf(FFT[i*%(rSize)d + index_r]),cuCimagf(FFT[i*%(rSize)d + index_r]));
                }

                CC[c] = make_cuFloatComplex(0,0);
                __syncthreads();

                CC[c] = cuCmulf(C1[c],A_global[c]);
                Csub_1 = cuCmulf(Csub,A_global[c]);
                __syncthreads();

                BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);

                Csub_1 = make_cuFloatComplex(0,0);

                for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
                {
                    Csub_1 = cuCaddf(Csub_1, CC[i]);
                    //BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
                }

                for (int i = 0; i < %(MATRIX_SIZE_I)d; i ++)
                {
                    A_global[i] = make_cuFloatComplex(0,0);
                    A_global[i] = make_cuFloatComplex(cuCrealf(A[i*%(tSize)d + index_t]),cuCimagf(A[i*%(tSize)d + index_t]));
                }

                CC[c] = make_cuFloatComplex(0,0);
                __syncthreads();
                //CC[c] = make_cuFloatComplex(cuCrealf(A_global[c]),cuCimagf(A_global[c]));
                CC[c] = cuCmulf(C1[c],A_global[c]);
                __syncthreads();

                BETA[index_r * %(tSize)d + index_t] = make_cuFloatComplex(0,0);

                Csub_2 = make_cuFloatComplex(0,0);

                for (int i = 0; i<%(MATRIX_SIZE_I)d; i+=1)
                {
                    Csub_2 = cuCaddf(Csub_2, CC[i]);
                    //BETA[index_r* %(tSize)d + index_t] = cuCaddf(BETA[index_r * %(tSize)d + index_t],CC[i]);
                }           

                //BETA[index_r* %(tSize)d + index_t] =  make_cuFloatComplex(cuCrealf(Csub_2), cuCimagf(Csub_2));
                BETA[index_r* %(tSize)d + index_t] =  cuCdivf(Csub_1,Csub_2);

            }   
        }

}
"""

tic = time.time()

iaa_iter = 4       
c = 3e8             
fc = 3.15e9             
lam = c/fc           

Mt = 7                 
Mr = 5               

rmax = 300             

theta = (np.arange(-80, 80, 0.5))/360*2*np.pi
r = np.arange(0, rmax, rmax/100)

tsize = theta.size
rsize = r.size

fmcw_fft  = np.arange(0,Mt*Mr*theta.size) + 1j*1
fmcw_fft = fmcw_fft.reshape(Mt*Mr, theta.size).astype(np.complex64)

VX = np.zeros(Mt*Mr)
for i in range(0, Mt*Mr+1, 1):
    VX[i-1] = i*lam/2

B = np.zeros((Mt*Mr, theta.size), dtype=complex)

for index_theta in range(0, theta.size, 1):
    alpha = np.exp(1j*2*np.pi*fc*np.sin(theta[index_theta])/c*VX)

    if iaa_iter == 0:
        B[:, index_theta] = np.multiply(alpha, signal.nuttall(Mr*Mt))
    else:
        B[:, index_theta] = alpha


beta = np.zeros((r.size, theta.size), dtype=complex)

for index_r in range(0, r.size):
    for index_theta in range(0, theta.size):
        beta[index_r, index_theta] = np.dot(np.conj(B[:, index_theta]), fmcw_fft[:, index_r]) / np.dot(np.conj(B[:, index_theta]), B[:, index_theta]) 


RZero = np.zeros((Mr*Mt, Mr*Mt), dtype=complex)

beta = beta.astype(np.complex64)
BETA2 = (np.square(np.abs(beta)) + 1j*0).astype(np.complex64)
Bcon = np.conj(B)

MATRIX_SIZE_O  = 35
MATRIX_SIZE_I  = 35
BLOCK_SIZE = 1
GRID_X =  MATRIX_SIZE_O
GRID_Y = MATRIX_SIZE_O

B = np.real(B).astype(np.float16) + 1j*np.imag(B).astype(np.float16)
a_cpu = B
Bcon = np.real(Bcon).astype(np.float16) + 1j*np.imag(Bcon).astype(np.float16)
b_cpu = Bcon
FFT_cpu = np.real(fmcw_fft).astype(np.float16) + 1j*np.imag(fmcw_fft).astype(np.float16)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.to_gpu(BETA2)
c_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
FFT_gpu = gpuarray.to_gpu(FFT_cpu)

kernel_code = kernel_code_template % {
            'MATRIX_SIZE_I': MATRIX_SIZE_I,
            'MATRIX_SIZE_O': MATRIX_SIZE_O,
            'BLOCK_SIZE': BLOCK_SIZE,
            'tSize': tsize,
            'rSize': rsize,
            }

mod = compiler.SourceModule(kernel_code)
outer = mod.get_function("OuterKernel")

t1 = time.time()
outer(
            a_gpu, b_gpu, BETA2_gpu,
            c_gpu,
            grid = (GRID_X,  GRID_Y),
            block = (BLOCK_SIZE, BLOCK_SIZE, 1),
            )

t2 = time.time()
t_gpu = t2-t1

print('INFORMATION:',pycuda.driver.mem_get_info())
print('t_gpu1 = ', t_gpu)

aa_gpu = gpuarray.to_gpu(a_cpu)
bb_gpu = gpuarray.to_gpu(b_cpu)
BETA2_gpu = gpuarray.empty((rsize, tsize), np.complex64) 

rinv_gpu = gpuarray.empty((MATRIX_SIZE_O, MATRIX_SIZE_O), np.complex64)
rinv_gpu = gpuarray.to_gpu(c_gpu.get())

cc_gpu = gpuarray.empty((1,MATRIX_SIZE_O), np.complex64)

mod = compiler.SourceModule(kernel_code)
dot = mod.get_function("DotKernel")

t1 = time.time()       
dot(
        aa_gpu,
        bb_gpu,
        BETA2_gpu,
        rinv_gpu,
        FFT_gpu,
        cc_gpu,
        grid = (GRID_X,  GRID_Y),
        block = (BLOCK_SIZE, BLOCK_SIZE, 1),
        )
t2 = time.time()
t_gpu = t2-t1

print('t_gpu2 = ', t_gpu)

beta = BETA2_gpu.get()
toc = time.time()
print(toc-tic)


beta[1:12, :] = 0
plt.contourf(np.outer(r, np.sin(theta)), np.outer(r, np.cos(theta)), 20*np.log10(np.abs(beta)))
plt.show()

smime.p7s
Description: S/MIME Cryptographic Signature

_______________________________________________
PyCUDA mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[PyCUDA] [Pycuda] LogicError: cuMemcpyDtoH failed: an illegal memory access was encountered

Reply via email to