On 11/19/20 3:39 PM, Andreas Beckmann wrote: > POCL built against LLVM 10 (sid) or LLVM 11 (experimental) causes a > autopkgtest regression on armhf in libgpuarray while it succeeded with > LLVM 9.
I finally managed to create a plain c reproducer (based on some pocl test) which dies with this backtrace on abel.d.o: #0 getEmissionKind () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/include/llvm/IR/DebugInfoMetadata.h:1244 #1 initialize () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LexicalScopes.cpp:53 #2 0xb13a82f0 in computeIntervals () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:979 #3 runOnMachineFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:996 #4 runOnMachineFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:1023 #5 0xb141d6c8 in runOnFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/MachineFunctionPass.cpp:73 #6 0xb1297494 in runOnFunction () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1481 #7 0xb1297750 in runOnModule () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1517 #8 0xb1297ba8 in runOnModule () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1582 #9 run () at /build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1694 #10 0xb6dfac82 in pocl_llvm_codegen (Device=Device@entry=0x1a3dfc8, Modp=0x20102d8, Output=Output@entry=0xbe9be8bc, OutputSize=OutputSize@entry=0xbe9be8d0) at ./lib/CL/pocl_llvm_wg.cc:624 #11 0xb6dbf1de in llvm_codegen (output=output@entry=0x1be75e0 "/home/anbe/.cache/pocl/kcache/AP/PNFEAPBKBFEAKGGNMALGHGJEEKGMJFBFBMDHA/Sdot_kernel/0-0-0/Sdot_kernel.so", device_i=device_i@entry=0, kernel=kernel@entry=0xbe9c0290, device=0x1a3dfc8, command=command@entry=0xbe9c02c8, specialize=specialize@entry=0) at ./lib/CL/devices/common.c:158 #12 0xb6dc0e44 in pocl_check_kernel_disk_cache (command=command@entry=0xbe9c02c8, specialized=specialized@entry=0) at ./lib/CL/devices/common.c:958 #13 0xb6dc1262 in pocl_check_kernel_dlhandle_cache (command=0xbe9c02c8, initial_refcount=0, specialize=0) at ./lib/CL/devices/common.c:1081 #14 0xb6d993d4 in program_compile_dynamic_wg_binaries (program=program@entry=0x1a18350) at ./lib/CL/pocl_build.c:179 #15 0xb6da9f20 in get_binary_sizes (sizes=0xbe9c03d4, program=0x1a18350) at ./lib/CL/clGetProgramInfo.c:36 #16 POclGetProgramInfo (program=0x1a18350, param_name=4453, param_value_size=128, param_value=0xbe9c03d4, param_value_size_ret=0xbe9c03d0) at ./lib/CL/clGetProgramInfo.c:115 #17 0x0045a070 in main () at 975931.c:238 I expect pocl built against llvm 11 (experimental) to fail similarily. pocl built against llvm 9 (testing) passes. Sylvestre, could you check whether this is an error on the LLVM side or is POCL using LLVM incorrectly? Andreas
#define CL_TARGET_OPENCL_VERSION 220 #include <CL/cl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> const char source[] = "#ifdef DOUBLE_PRECISION\n" " #ifdef cl_khr_fp64\n" " #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" " #else\n" " #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" " #endif\n" "#endif\n" "\n" "__kernel void Sdot_kernel( __global float *_X, __global float *_Y, __global float *scratchBuff,\n" " uint N, uint offx, int incx, uint offy, int incy, int doConj )\n" "{\n" " __global float *X = _X + offx;\n" " __global float *Y = _Y + offy;\n" " float dotP = (float) 0.0;\n" "\n" " if ( incx < 0 ) {\n" " X = X + (N - 1) * abs(incx);\n" " }\n" " if ( incy < 0 ) {\n" " Y = Y + (N - 1) * abs(incy);\n" " }\n" "\n" " int gOffset;\n" " for( gOffset=(get_global_id(0) * 4); (gOffset + 4 - 1)<N; gOffset+=( get_global_size(0) * 4 ) )\n" " {\n" " float4 vReg1, vReg2, res;\n" "\n" " #ifdef INCX_NONUNITY\n" " vReg1 = (float4)( (X + (gOffset*incx))[0 + ( incx * 0)], (X + (gOffset*incx))[0 + ( incx * 1)], (X + (gOffset*incx))[0 + ( incx * 2)], (X + (gOffset*incx))[0 + ( incx * 3)]);\n" " #else\n" " vReg1 = vload4( 0, (__global float *) (X + gOffset) );\n" " #endif\n" "\n" " #ifdef INCY_NONUNITY\n" " vReg2 = (float4)( (Y + (gOffset*incy))[0 + ( incy * 0)], (Y + (gOffset*incy))[0 + ( incy * 1)], (Y + (gOffset*incy))[0 + ( incy * 2)], (Y + (gOffset*incy))[0 + ( incy * 3)]);\n" " #else\n" " vReg2 = vload4( 0, (__global float *) (Y + gOffset) );\n" " #endif\n" "\n" " ;\n" " res = vReg1 * vReg2 ;\n" " dotP += res .S0 + res .S1 + res .S2 + res .S3;\n" "; // Add-up elements in the vector to give a scalar\n" " }\n" "\n" " // Loop for the last thread to handle the tail part of the vector\n" " // Using the same gOffset used above\n" " for( ; gOffset<N; gOffset++ )\n" " {\n" " float sReg1, sReg2, res;\n" " sReg1 = X[gOffset * incx];\n" " sReg2 = Y[gOffset * incy];\n" "\n" " ;\n" " res = sReg1 * sReg2 ;\n" " dotP = dotP + res ;\n" " }\n" "\n" " // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier\n" " // dotP of work-item 0 will have the final reduced item of the work-group\n" " __local float p1753 [ 64 ];\n" " uint QKiD0 = get_local_id(0);\n" " p1753 [ QKiD0 ] = dotP ;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 < 32 ) {\n" " p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 32 ];\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 < 16 ) {\n" " p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 16 ];\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 < 8 ) {\n" " p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 8 ];\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 < 4 ) {\n" " p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 4 ];\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 < 2 ) {\n" " p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 2 ];\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " if( QKiD0 == 0 ) {\n" " dotP = p1753 [0] + p1753 [1];\n" " }\n" "\n" " if( (get_local_id(0)) == 0 ) {\n" " scratchBuff[ get_group_id(0) ] = dotP;\n" " }\n" "}\n" "\n" ; #define MAX_PLATFORMS 32 #define MAX_DEVICES 32 #define MAX_BINARIES 32 #define OPENCL_ERROR_CASE(ERR) \ case ERR: \ { fprintf (stderr, "" #ERR " in %s on line %i\n", func_name, line); \ return 1; } int check_cl_error (cl_int cl_err, int line, const char* func_name) { switch (cl_err) { case CL_SUCCESS: return 0; OPENCL_ERROR_CASE (CL_DEVICE_NOT_FOUND) OPENCL_ERROR_CASE (CL_DEVICE_NOT_AVAILABLE) OPENCL_ERROR_CASE (CL_COMPILER_NOT_AVAILABLE) OPENCL_ERROR_CASE (CL_MEM_OBJECT_ALLOCATION_FAILURE) OPENCL_ERROR_CASE (CL_OUT_OF_RESOURCES) OPENCL_ERROR_CASE (CL_OUT_OF_HOST_MEMORY) OPENCL_ERROR_CASE (CL_PROFILING_INFO_NOT_AVAILABLE) OPENCL_ERROR_CASE (CL_MEM_COPY_OVERLAP) OPENCL_ERROR_CASE (CL_IMAGE_FORMAT_MISMATCH) OPENCL_ERROR_CASE (CL_IMAGE_FORMAT_NOT_SUPPORTED) OPENCL_ERROR_CASE (CL_BUILD_PROGRAM_FAILURE) OPENCL_ERROR_CASE (CL_MAP_FAILURE) OPENCL_ERROR_CASE (CL_MISALIGNED_SUB_BUFFER_OFFSET) OPENCL_ERROR_CASE (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) OPENCL_ERROR_CASE (CL_COMPILE_PROGRAM_FAILURE) OPENCL_ERROR_CASE (CL_LINKER_NOT_AVAILABLE) OPENCL_ERROR_CASE (CL_LINK_PROGRAM_FAILURE) OPENCL_ERROR_CASE (CL_DEVICE_PARTITION_FAILED) OPENCL_ERROR_CASE (CL_KERNEL_ARG_INFO_NOT_AVAILABLE) OPENCL_ERROR_CASE (CL_INVALID_VALUE) OPENCL_ERROR_CASE (CL_INVALID_DEVICE_TYPE) OPENCL_ERROR_CASE (CL_INVALID_PLATFORM) OPENCL_ERROR_CASE (CL_INVALID_DEVICE) OPENCL_ERROR_CASE (CL_INVALID_CONTEXT) OPENCL_ERROR_CASE (CL_INVALID_QUEUE_PROPERTIES) OPENCL_ERROR_CASE (CL_INVALID_COMMAND_QUEUE) OPENCL_ERROR_CASE (CL_INVALID_HOST_PTR) OPENCL_ERROR_CASE (CL_INVALID_MEM_OBJECT) OPENCL_ERROR_CASE (CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) OPENCL_ERROR_CASE (CL_INVALID_IMAGE_SIZE) OPENCL_ERROR_CASE (CL_INVALID_SAMPLER) OPENCL_ERROR_CASE (CL_INVALID_BINARY) OPENCL_ERROR_CASE (CL_INVALID_BUILD_OPTIONS) OPENCL_ERROR_CASE (CL_INVALID_PROGRAM) OPENCL_ERROR_CASE (CL_INVALID_PROGRAM_EXECUTABLE) OPENCL_ERROR_CASE (CL_INVALID_KERNEL_NAME) OPENCL_ERROR_CASE (CL_INVALID_KERNEL_DEFINITION) OPENCL_ERROR_CASE (CL_INVALID_KERNEL) OPENCL_ERROR_CASE (CL_INVALID_ARG_INDEX) OPENCL_ERROR_CASE (CL_INVALID_ARG_VALUE) OPENCL_ERROR_CASE (CL_INVALID_ARG_SIZE) OPENCL_ERROR_CASE (CL_INVALID_KERNEL_ARGS) OPENCL_ERROR_CASE (CL_INVALID_WORK_DIMENSION) OPENCL_ERROR_CASE (CL_INVALID_WORK_GROUP_SIZE) OPENCL_ERROR_CASE (CL_INVALID_WORK_ITEM_SIZE) OPENCL_ERROR_CASE (CL_INVALID_GLOBAL_OFFSET) OPENCL_ERROR_CASE (CL_INVALID_EVENT_WAIT_LIST) OPENCL_ERROR_CASE (CL_INVALID_EVENT) OPENCL_ERROR_CASE (CL_INVALID_OPERATION) OPENCL_ERROR_CASE (CL_INVALID_GL_OBJECT) OPENCL_ERROR_CASE (CL_INVALID_BUFFER_SIZE) OPENCL_ERROR_CASE (CL_INVALID_MIP_LEVEL) OPENCL_ERROR_CASE (CL_INVALID_GLOBAL_WORK_SIZE) OPENCL_ERROR_CASE (CL_INVALID_PROPERTY) OPENCL_ERROR_CASE (CL_INVALID_IMAGE_DESCRIPTOR) OPENCL_ERROR_CASE (CL_INVALID_COMPILER_OPTIONS) OPENCL_ERROR_CASE (CL_INVALID_LINKER_OPTIONS) OPENCL_ERROR_CASE (CL_INVALID_DEVICE_PARTITION_COUNT) default: printf ("Unknown OpenCL error %i in %s on line %i\n", cl_err, func_name, line); return 1; } } #define _POCLU_CHECK_CL_ERROR_INNER(cond, func, line) \ do \ { \ if (check_cl_error (cond, line, func)) \ return (EXIT_FAILURE); \ } \ while (0) #define CHECK_CL_ERROR(cond) _POCLU_CHECK_CL_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__) #define CHECK_OPENCL_ERROR_IN(message) _POCLU_CHECK_CL_ERROR_INNER(err, message, __LINE__) #define TEST_ASSERT(EXP) \ do { \ if (!(EXP)) { \ fprintf(stderr, "Assertion: \n" #EXP "\nfailed on %s:%i\n", \ __FILE__, __LINE__); \ return EXIT_FAILURE; \ } \ } while (0) int main(){ cl_int err; cl_platform_id platforms[MAX_PLATFORMS]; cl_uint nplatforms; cl_device_id devices[MAX_DEVICES]; cl_uint ndevices; cl_program program = NULL; size_t binsizes[MAX_BINARIES]; size_t nbinaries; CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms)); TEST_ASSERT(nplatforms > 0); CHECK_CL_ERROR(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices)); TEST_ASSERT(ndevices > 0); cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateContext"); const char * src[] = {source}; program = clCreateProgramWithSource(context, 1, src, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource"); CHECK_CL_ERROR(clBuildProgram(program, 1, devices, "-g -DINCX_NONUNITY -DINCY_NONUNITY", NULL, NULL)); CHECK_CL_ERROR(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsizes), binsizes, &nbinaries)); printf("binary size: %zd\n", binsizes[0]); CHECK_CL_ERROR(clReleaseProgram(program)); CHECK_CL_ERROR (clReleaseContext (context)); printf ("OK\n"); return EXIT_SUCCESS; }