Source: openmpi
Version: 4.0.2-4
Severity: serious
Control: block 946422 with -1
Control: block 946582 with -1

Hi,

this bug was initially observed as a silx autopkgtest failure (#946422)
in pocl (#946582). I've reduced it to a python-free OpenCL example in C
and now I'm convinced that it is caused by OpenMPI 4.

Attached you can find a trivial OpenCL example (yes, that needs a bit of
boilerplate code) that is linked against MPI and calls MPI_Init().
This works fine with OpenMPI 3 in buster, MPICH in sid (and no MPI as
well), but fails with OpenMPI 4 in sid.

You need opencl-dev and libopenmpi-dev to build it and pocl-opencl-icd
to run it.
You build it with 
     mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL
and get this when running successfully:

# ./test_mpi_ocl 
Success

but the failure with OpenMPI 4 is

# ./test_mpi_ocl 
pocl error: lt_dlopen("(null)") or lt_dlsym() failed with 'can't close resident 
module'.
note: missing symbols in the kernel binary might be reported as 'file not 
found' errors.
Aborted

OpenMPI 4 seems to change some state causing subsequent lt_dlopen() to fail.

In gdb we have 
(gdb) bt
#0  0x00007ffff7d10081 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007ffff7cfb535 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007fffed2b66d1 in pocl_check_kernel_dlhandle_cache 
(cmd=cmd@entry=0x55555562b730, initial_refcount=initial_refcount@entry=1) at 
./lib/CL/devices/common.c:1097
#3  0x00007fffed2bc327 in pocl_pthread_prepare_kernel (cmd=0x55555562b730, 
data=0x5555556cb5e0) at ./lib/CL/devices/pthread/pthread_scheduler.c:413
#4  pocl_pthread_exec_command (td=0x5555556cd200, cmd=0x55555562b730) at 
./lib/CL/devices/pthread/pthread_scheduler.c:450
#5  pocl_pthread_driver_thread (p=<optimized out>) at 
./lib/CL/devices/pthread/pthread_scheduler.c:496
#6  0x00007ffff79aafb7 in start_thread () from 
/lib/x86_64-linux-gnu/libpthread.so.0
#7  0x00007ffff7dd02df in clone () from /lib/x86_64-linux-gnu/libc.so.6

The error stems from line lib/CL/devices/common.c 1062
    ci->dlhandle = lt_dlopen (module_fn);
where module_fn = 
"//.cache/pocl/kcache/IL/PFLJNNHLKAHONADOJOEENLMDLFHDJKOMFJHEO/foo/1-1-1/foo.so"

OK, we can further minimize this testcase if we just take foo.so
(amd64 version attached) and lt_dlopen() it:

// mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl 

#include <stdio.h>
#include <ltdl.h>
#include <mpi.h>

/*
 * Minimal reproducer: initialize MPI first, then load ./foo.so through
 * libltdl and print the returned handle together with whatever error
 * string libltdl reports afterwards ("(null)" if there is none).
 */
int main(int argc, char **argv)
{
        MPI_Init(&argc, &argv);

        /* lt_dlinit() returns the number of errors; nothing to test if it fails. */
        if (lt_dlinit() != 0) {
                const char *init_error = lt_dlerror();
                fprintf(stderr, "lt_dlinit failed: %s\n",
                        init_error ? init_error : "(null)");
                MPI_Finalize();
                return 1;
        }

        lt_dlhandle handle = lt_dlopen("./foo.so");
        const char *dl_error = lt_dlerror();

        /* %p expects a void *, so convert the handle explicitly. */
        printf("%p %s\n", (void *)handle, dl_error ? dl_error : "(null)");

        /* Every successful MPI_Init() must be paired with MPI_Finalize(). */
        MPI_Finalize();
        return 0;
}

Without OpenMPI 4 we succeed:
# ./test_lt_dlopen
0x55c4accfc480 (null)

but with OpenMPI 4 we run into the same problem:
# ./test_lt_dlopen
0x559b5c923250 can't close resident module

Andreas
// install packages
//     apt-get install libopenmpi-dev opencl-dev pocl-opencl-icd
// compile with
//     mpicc -o test_mpi_ocl test_mpi_ocl.c -lOpenCL

// based on https://wiki.aalto.fi/display/HPEC/OpenCL+tutorial

#define CL_TARGET_OPENCL_VERSION 100

// 0a_trivial.c
#include <stdio.h>
#include <stdlib.h>
 
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <mpi.h>

/* OpenCL C source for a kernel named "foo" with an empty body. */
const char *source_str = "__kernel void foo(void){}";

/*
 * Stop the test with a diagnostic on stderr if an OpenCL setup call did
 * not return CL_SUCCESS; continuing would only pass invalid handles on.
 */
static void check_cl(cl_int err, const char *call)
{
    if (err != CL_SUCCESS) {
        fprintf(stderr, "%s failed with OpenCL error %d\n", call, err);
        exit(EXIT_FAILURE);
    }
}

/*
 * Initialize MPI, then build and run the empty OpenCL kernel "foo" once
 * on the first CPU device of the first platform.
 *
 * Prints "Success" and returns EXIT_SUCCESS when the kernel ran to
 * completion; otherwise reports the OpenCL error code and returns
 * EXIT_FAILURE.
 */
int main(int argc, char** argv) {
    cl_int ret;
    int status = EXIT_SUCCESS;

    MPI_Init(&argc, &argv);

    /* Get platform and device information */
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint num_devices = 0;
    cl_uint num_platforms = 0;
    ret = clGetPlatformIDs(1, &platform_id, &num_platforms);
    check_cl(ret, "clGetPlatformIDs");
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id,
                         &num_devices);
    check_cl(ret, "clGetDeviceIDs");

    /* Create an OpenCL context */
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL,
                                         &ret);
    check_cl(ret, "clCreateContext");

    /* Create a command queue */
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id,
                                                          0, &ret);
    check_cl(ret, "clCreateCommandQueue");

    /* Create a program from the kernel source */
    cl_program program = clCreateProgramWithSource(context, 1, &source_str,
                                                   NULL, &ret);
    check_cl(ret, "clCreateProgramWithSource");

    /* Build the program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    check_cl(ret, "clBuildProgram");

    /* Create the OpenCL kernel */
    cl_kernel kernel = clCreateKernel(program, "foo", &ret);
    check_cl(ret, "clCreateKernel");

    /* Execute the OpenCL kernel; stop at the first step that fails so the
     * reported error code belongs to the call that actually went wrong. */
    size_t global_item_size = 1;
    size_t local_item_size = 1;
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret == CL_SUCCESS)
        ret = clFlush(command_queue);
    if (ret == CL_SUCCESS)
        ret = clFinish(command_queue);

    if (ret == CL_SUCCESS) {
        printf("Success\n");
    } else {
        printf("OpenCL error executing kernel: %d\n", ret);
        status = EXIT_FAILURE;
    }

    /* Clean up */
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    /* Every successful MPI_Init() must be paired with MPI_Finalize(). */
    MPI_Finalize();
    return status;
}
// mpicc -o test_lt_dlopen test_lt_dlopen.c -lltdl 

#include <stdio.h>
#include <ltdl.h>
#include <mpi.h>

/*
 * Minimal reproducer: initialize MPI first, then load ./foo.so through
 * libltdl and print the returned handle together with whatever error
 * string libltdl reports afterwards ("(null)" if there is none).
 */
int main(int argc, char **argv)
{
        MPI_Init(&argc, &argv);

        /* lt_dlinit() returns the number of errors; nothing to test if it fails. */
        if (lt_dlinit() != 0) {
                const char *init_error = lt_dlerror();
                fprintf(stderr, "lt_dlinit failed: %s\n",
                        init_error ? init_error : "(null)");
                MPI_Finalize();
                return 1;
        }

        lt_dlhandle handle = lt_dlopen("./foo.so");
        const char *dl_error = lt_dlerror();

        /* %p expects a void *, so convert the handle explicitly. */
        printf("%p %s\n", (void *)handle, dl_error ? dl_error : "(null)");

        /* Every successful MPI_Init() must be paired with MPI_Finalize(). */
        MPI_Finalize();
        return 0;
}

Attachment: foo.so
Description: application/sharedlib

Reply via email to