Source: openmpi Version: 4.0.2-4 Severity: serious Control: block 946422 with -1 Control: block 946582 with -1
Hi, this bug was initially observed as a silx autopkgtest failure (#946422) in pocl (#946582). I've reduced it to a python-free OpenCL example in C and now I'm convinced that it is caused by OpenMPI 4. Attached you can find a trivial OpenCL example (yes, that needs a bit of boilerplate code) that is linked against MPI and calls MPI_Init(). This works fine with OpenMPI 3 in buster, MPICH in sid (and no MPI as well), but fails with OpenMPI 4 in sid. You need opencl-dev and libopenmpi-dev to build it and pocl-opencl-icd to run it. You build it with mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL and get this when running successfully: # ./test_mpi_ocl Success but the failure with OpenMPI 4 is # ./test_mpi_ocl pocl error: lt_dlopen("(null)") or lt_dlsym() failed with 'can't close resident module'. note: missing symbols in the kernel binary might be reported as 'file not found' errors. Aborted OpenMPI 4 seems to change some state causing subsequent lt_dlopen() to fail. In gdb we have (gdb) bt #0 0x00007ffff7d10081 in raise () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007ffff7cfb535 in abort () from /lib/x86_64-linux-gnu/libc.so.6 #2 0x00007fffed2b66d1 in pocl_check_kernel_dlhandle_cache (cmd=cmd@entry=0x55555562b730, initial_refcount=initial_refcount@entry=1) at ./lib/CL/devices/common.c:1097 #3 0x00007fffed2bc327 in pocl_pthread_prepare_kernel (cmd=0x55555562b730, data=0x5555556cb5e0) at ./lib/CL/devices/pthread/pthread_scheduler.c:413 #4 pocl_pthread_exec_command (td=0x5555556cd200, cmd=0x55555562b730) at ./lib/CL/devices/pthread/pthread_scheduler.c:450 #5 pocl_pthread_driver_thread (p=<optimized out>) at ./lib/CL/devices/pthread/pthread_scheduler.c:496 #6 0x00007ffff79aafb7 in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #7 0x00007ffff7dd02df in clone () from /lib/x86_64-linux-gnu/libc.so.6 The error stems from line lib/CL/devices/common.c 1062 ci->dlhandle = lt_dlopen (module_fn); where module_fn = 
"//.cache/pocl/kcache/IL/PFLJNNHLKAHONADOJOEENLMDLFHDJKOMFJHEO/foo/1-1-1/foo.so" OK, we can further minimize this testcase if we just take foo.so (amd64 version attached) and lt_dlopen() it: // mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl #include <stdio.h> #include <ltdl.h> #include <mpi.h> int main(int argc, char **argv) { MPI_Init(&argc, &argv); lt_dlinit(); lt_dlhandle handle = lt_dlopen ("./foo.so"); const char * dl_error = lt_dlerror (); printf("%p %s\n", handle, dl_error ? dl_error : "(null)"); } Without OpenMPI 4 we succeed: # ./test_lt_dlopen 0x55c4accfc480 (null) but with OpenMPI 4 we run into the same problem: # ./test_lt_dlopen 0x559b5c923250 can't close resident module Andreas
// install packages // apt-get install libopenmpi-dev opencl-dev pocl-opencl-icd // compile with // mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL // based on https://wiki.aalto.fi/display/HPEC/OpenCL+tutorial #define CL_TARGET_OPENCL_VERSION 100 // 0a_trivial.c #include <stdio.h> #include <stdlib.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #include <mpi.h> /* A kernel which does nothing */ const char * source_str = "__kernel void foo(void)" "{" "" "}"; int main(int argc, char** argv) { MPI_Init(&argc, &argv); /* Get platform and device information */ cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint num_devices; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices); /* Create an OpenCL context */ cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* Create a command queue */ cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create a program from the kernel source */ cl_program program = clCreateProgramWithSource(context, 1, &source_str, NULL, &ret); /* Build the program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create the OpenCL kernel */ cl_kernel kernel = clCreateKernel(program, "foo", &ret); /* Execute the OpenCL kernel */ size_t global_item_size = 1; size_t local_item_size = 1; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); ret = clFlush(command_queue); ret = clFinish(command_queue); if (ret == CL_SUCCESS) printf("Success\n"); else printf("OpenCL error executing kernel: %d\n", ret); /* Clean up */ ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
// mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl #include <stdio.h> #include <ltdl.h> #include <mpi.h> int main(int argc, char **argv) { MPI_Init(&argc, &argv); lt_dlinit(); lt_dlhandle handle = lt_dlopen ("./foo.so"); const char * dl_error = lt_dlerror (); printf("%p %s\n", handle, dl_error ? dl_error : "(null)"); }
foo.so
Description: application/sharedlib