Bug#920497: clblas: *ger out of bounds memory access under pocl

Rebecca N. Palmer Sat, 26 Jan 2019 02:21:32 -0800

Package: libclblas2,libpocl2
Version: 2.12-1,1.2-3
(and also 1.2-2, but the below is from -3)

libgpuarray's test_ger (DEVICE=opencl0:0 POCL_KERNEL_CACHE=0 nosetests3 -v pygpu.tests.test_blas:test_ger, requires python3-pygpu, python3-nose, python3-scipy, ocl-icd-opencl-dev, libclblas-dev) crashes with memory corruption errors (e.g. "double free" - exact message varies) under clblas+pocl. Changing the test matrix size (default 4x5) to 64x64 (i.e. a whole number of clblas blocks - 16x64 or 64x16 depending on array order) makes it stop crashing.

clblas' https://sources.debian.org/src/clblas/2.12-1/src/samples/example_sger.c/ (5x5 matrix, with CL_DEVICE_TYPE_GPU changed to _ALL) doesn't actually crash, but Valgrind says the kernel is reading and writing out-of-bounds memory.

The kernel source has what look like proper bounds checks for the edge-of-matrix blocks (https://sources.debian.org/src/clblas/2.12-1/src/library/blas/gens/clTemplates/ger.cl/#L263), but disassembling the kernel suggests these aren't there in the binary:


#no cache is to avoid #919824

$ POCL_KERNEL_CACHE=0 valgrind --track-origins=yes --vgdb=yes --vgdb-error=0 ./example_sger &

$ gdb ./example_sger

(relevant part - => is current position)

0x0000000004853a0f <+2303>: mov $0x1,%eax # eax isn't 1 so we didn't arrive straight down from here - the next 4 jumps are the only ones into here

--Type <RET> for more, q to quit, c to continue without paging--
   0x0000000004853a14 <+2308>:  mov    0x20(%rbx),%rcx
   0x0000000004853a18 <+2312>:  nopl   0x0(%rax,%rax,1)
   0x0000000004853a20 <+2320>:  mov    %rdi,0x10(%rbx)
   0x0000000004853a24 <+2324>:  mov    %rsi,0x28(%rbx)
   0x0000000004853a28 <+2328>:  mov    %rsi,%r9
   0x0000000004853a2b <+2331>:  mov    %rdx,0x30(%rbx)
   0x0000000004853a2f <+2335>:  mov    %rdx,%r11
   0x0000000004853a32 <+2338>:  mov    %r10,0x38(%rbx)
   0x0000000004853a36 <+2342>:  mov    %r10,%rdi
   0x0000000004853a39 <+2345>:  mov    %rcx,0x20(%rbx)
   0x0000000004853a3d <+2349>:  xor    %r10d,%r10d
   0x0000000004853a40 <+2352>:  cmp    %r14,%rax
   0x0000000004853a43 <+2355>:  mov    0x40(%rbp),%r15
   0x0000000004853a47 <+2359>:  mov    0x30(%rbp),%r13d
   0x0000000004853a4b <+2363>:  mov    0x48(%rbx),%r8

0x0000000004853a4f <+2367>: jae 0x4853a9d <_pocl_launcher_Sger_R_kernel+2445>

   0x0000000004853a51 <+2369>:  nopw   %cs:0x0(%rax,%rax,1)
   0x0000000004853a5b <+2379>:  nopl   0x0(%rax,%rax,1)
   0x0000000004853a60 <+2384>:  mov    (%r9,%rax,4),%edx
   0x0000000004853a64 <+2388>:  mov    (%r11,%rax,4),%esi
   0x0000000004853a68 <+2392>:  shl    $0x4,%rsi

   0x0000000004853a6c <+2396>:  vmulps (%r12,%rsi,1),%xmm0,%xmm1 # temp = yRegS * alpha;
   0x0000000004853a72 <+2402>:  mov    (%rdi,%rax,4),%esi # row index to esi

--Type <RET> for more, q to quit, c to continue without paging--

   0x0000000004853a75 <+2405>:  imul   %r13d,%esi # esi = row*lda, lda in r13d
   0x0000000004853a79 <+2409>:  lea    (%r8,%rsi,4),%rsi # row start to rsi, r8 is base of A
   0x0000000004853a7d <+2413>:  vbroadcastss (%r15,%rdx,4),%xmm2 # load xreg to xmm2 - r15 = localX base, tIDy in rdx here
   0x0000000004853a83 <+2419>:  mov    (%rcx,%rax,4),%edx # column index to edx

   0x0000000004853a86 <+2422>:  vmulps %xmm1,%xmm2,%xmm1 # * of mad

=> 0x0000000004853a8a <+2426>: vaddps (%rsi,%rdx,4),%xmm1,%xmm1 # vload (out of bounds read - edx (col) is too big) and + of mad

   0x0000000004853a8f <+2431>:  vmovups %xmm1,(%rsi,%rdx,4) #vstore
   0x0000000004853a94 <+2436>:  add    $0x1,%rax

   0x0000000004853a98 <+2440>:  cmp    %rax,%r14 # r14 is group size... is this the loop over workitems, with rax = local ID and (..,rax,4) = private variables? and if it is, where are the bounds checks? (...which is the bug...)
   0x0000000004853a9b <+2443>:  jne    0x4853a60 <_pocl_launcher_Sger_R_kernel+2384> # must have arrived from here

   0x0000000004853a9d <+2445>:  add    $0x1,%r10
   0x0000000004853aa1 <+2449>:  mov    0xc8(%rbx),%rax
   0x0000000004853aa8 <+2456>:  add    %rax,%rcx
   0x0000000004853aab <+2459>:  add    %rax,%rdi
   0x0000000004853aae <+2462>:  add    %rax,%r11
   0x0000000004853ab1 <+2465>:  add    %rax,%r9
   0x0000000004853ab4 <+2468>:  mov    $0x0,%eax
   0x0000000004853ab9 <+2473>:  cmp    0x70(%rbx),%r10

0x0000000004853abd <+2477>: jb 0x4853a40 <_pocl_launcher_Sger_R_kernel+2352>

   0x0000000004853abf <+2479>:  mov    0x10(%rbx),%rdi
   0x0000000004853ac3 <+2483>:  add    $0x1,%rdi
   0x0000000004853ac7 <+2487>:  mov    0x20(%rbx),%rcx
--Type <RET> for more, q to quit, c to continue without paging--
   0x0000000004853acb <+2491>:  mov    0xf8(%rbx),%rax
   0x0000000004853ad2 <+2498>:  add    %rax,%rcx
   0x0000000004853ad5 <+2501>:  mov    0x38(%rbx),%r10
   0x0000000004853ad9 <+2505>:  add    %rax,%r10
   0x0000000004853adc <+2508>:  mov    0x30(%rbx),%rdx
   0x0000000004853ae0 <+2512>:  add    %rax,%rdx
   0x0000000004853ae3 <+2515>:  mov    0x28(%rbx),%rsi
   0x0000000004853ae7 <+2519>:  add    %rax,%rsi
   0x0000000004853aea <+2522>:  mov    $0x0,%eax
   0x0000000004853aef <+2527>:  cmp    0x68(%rbx),%rdi

0x0000000004853af3 <+2531>: jb 0x4853a20 <_pocl_launcher_Sger_R_kernel+2320>


(gdb) info all-registers
rax            0x8                 8 work item localID?
rbx            0xc3bd9c0           205248960 on stack?
rcx            0xc3bd4c0           205247680 on stack?
rdx            0x20                32 current column number

rsi            0xe3c1e00           238820864 current row start (= r8, so row 0)

rdi            0xc3bccc0           205245632 on stack?
rbp            0xc3bdb40           0xc3bdb40 on stack?
rsp            0xc3bc0c0           0xc3bc0c0 stack ptr
r8             0xe3c1e00           238820864 base of 128byte block A
r9             0xc3bc4c0           205243584 on stack?
r10            0x0                 0
r11            0xc3bc8c0           205244608 on stack?
r12            0x11bc5100          297554176
r13            0x5                 5 stride (in entries)
r14            0x100               256 workgroup size
r15            0x11bc5080          297554048 base of localX

rip            0x4853a8a           0x4853a8a <_pocl_launcher_Sger_R_kernel+2426>

eflags         0x10                [ AF ]
cs             0x0                 0
ss             0x0                 0
ds             0x0                 0
es             0x0                 0
fs             0x0                 0
--Type <RET> for more, q to quit, c to continue without paging--
gs             0x0                 0
st0            0                   (raw 0x00000000000000000000)
st1            0                   (raw 0x00000000000000000000)
st2            0                   (raw 0x00000000000000000000)
st3            0                   (raw 0x00000000000000000000)
st4            0                   (raw 0x00000000000000000000)
st5            0                   (raw 0x00000000000000000000)
st6            0                   (raw 0x00000000000000000000)
st7            0                   (raw 0x00000000000000000000)
fctrl          0x37f               895
fstat          0x0                 0
ftag           0xffff              65535
fiseg          0x0                 0
fioff          0x0                 0
foseg          0x0                 0
fooff          0x0                 0
fop            0x0                 0
mxcsr          0x1f80              [ IM DM ZM OM UM PM ]

ymm0           {v8_float = {0xa, 0xa, 0xa, 0xa, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x80000, 0x80000, 0x0, 0x0} # 10 = alpha or one-off-end of Y; should be 4 successive entries of Y

ymm1           {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},

ymm2           {v8_float = {0xb, 0xb, 0xb, 0xb, 0x0, 0x0, 0x0, 0x0}, # 11 = first entry of X


This suggests:

- either the kernel has undefined behaviour in the bounds-check-hit case (compilers often remove such code to make the defined case faster - http://blog.llvm.org/2011/05/what-every-c-programmer-should-know_14.html), though I can't see where
- or, there is a bug in pocl; as this kernel has multiple barriers, possibly related to https://github.com/pocl/pocl/issues/553 and/or https://github.com/pocl/pocl/issues/683

https://github.com/clMathLibraries/clBLAS/issues/108 was a vaguely similar issue in the same kernel, but we already have the fix for that one.

Bug#920497: clblas: *ger out of bounds memory access under pocl

Reply via email to