Right, so I have managed to get the nvidia tools installed and I can see the files now:

root@zorn:~/slurm-23.02.3# find / -xdev -name libnvidia-ml.so
/usr/local/cuda-12.2/targets/x86_64-linux/lib/stubs/libnvidia-ml.so
/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so
root@zorn:~/slurm-23.02.3# find / -xdev -name nvml.h
/usr/local/cuda-12.2/targets/x86_64-linux/include/nvml.h
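(As far as I understand, the copy under the cuda-12.2 stubs directory is only a link-time stub, so I suppose it's also worth confirming that the driver's runtime library is present; something like

find / -xdev -name 'libnvidia-ml.so.1'

should turn it up if the driver packages are installed properly.)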


and I configure the build with:

root@zorn:~/slurm-23.02.3# ./configure --with-nvml=/usr/local/cuda-12.2

However, the config.log seems odd:

root@zorn:~/slurm-23.02.3# grep nvml config.log
  $ ./configure --with-nvml=/usr/local/cuda-12.2
configure:23416: checking for nvml.h
conftest.c:134:10: fatal error: nvml.h: No such file or directory
  134 | #include <nvml.h>
| #include <nvml.h>
configure:23424: checking for nvmlInit in -lnvidia-ml
| char nvmlInit ();
| return nvmlInit ();
configure:23416: checking for nvml.h
configure:23424: checking for nvmlInit in -lnvidia-ml
config.status:1769: creating src/plugins/gpu/nvml/Makefile
config.status:2075: cd src/plugins/gpu/nvml && sed -e '/# am--include-marker/d' Makefile | make -f - am--depfiles
ac_cv_header_nvml_h=yes
ac_cv_lib_nvidia_ml_nvmlInit=yes

It seems to say it can't find nvml.h at first, but then sets 'ac_cv_header_nvml_h=yes'. I tried to run make anyway:

root@zorn:~/slurm-23.02.3# make 2>&1 | tee make.log

According to make.log, it looks like gpu_nvml.o was built without error:

root@zorn:~/slurm-23.02.3# grep nvml make.log
Making all in nvml
make[5]: Entering directory '/root/slurm-23.02.3/src/plugins/gpu/nvml'
/bin/bash ../../../../libtool --tag=CC --mode=compile gcc -DHAVE_CONFIG_H -I. -I../../../.. -I../../../../slurm -DSLURM_PLUGIN_DEBUG -I../../../.. -I../../../../src/common -I/usr/local/cuda-12.2/include -DNUMA_VERSION1_COMPATIBILITY -g -O2 -fno-omit-frame-pointer -pthread -ggdb3 -Wall -g -O1 -fno-strict-aliasing -MT gpu_nvml.lo -MD -MP -MF .deps/gpu_nvml.Tpo -c -o gpu_nvml.lo gpu_nvml.c
libtool: compile: gcc -DHAVE_CONFIG_H -I. -I../../../.. -I../../../../slurm -DSLURM_PLUGIN_DEBUG -I../../../.. -I../../../../src/common -I/usr/local/cuda-12.2/include -DNUMA_VERSION1_COMPATIBILITY -g -O2 -fno-omit-frame-pointer -pthread -ggdb3 -Wall -g -O1 -fno-strict-aliasing -MT gpu_nvml.lo -MD -MP -MF .deps/gpu_nvml.Tpo -c gpu_nvml.c -fPIC -DPIC -o .libs/gpu_nvml.o
libtool: compile: gcc -DHAVE_CONFIG_H -I. -I../../../.. -I../../../../slurm -DSLURM_PLUGIN_DEBUG -I../../../.. -I../../../../src/common -I/usr/local/cuda-12.2/include -DNUMA_VERSION1_COMPATIBILITY -g -O2 -fno-omit-frame-pointer -pthread -ggdb3 -Wall -g -O1 -fno-strict-aliasing -MT gpu_nvml.lo -MD -MP -MF .deps/gpu_nvml.Tpo -c gpu_nvml.c -o gpu_nvml.o >/dev/null 2>&1
mv -f .deps/gpu_nvml.Tpo .deps/gpu_nvml.Plo
/bin/bash ../../../../libtool --tag=CC --mode=link gcc -DNUMA_VERSION1_COMPATIBILITY -g -O2 -fno-omit-frame-pointer -pthread -ggdb3 -Wall -g -O1 -fno-strict-aliasing -module -avoid-version --export-dynamic -o gpu_nvml.la -rpath /usr/local/lib/slurm gpu_nvml.lo ../common/libgpu_common.la -lpthread -lm -lresolv
libtool: link: gcc -shared -fPIC -DPIC .libs/gpu_nvml.o -Wl,--whole-archive ../common/.libs/libgpu_common.a -Wl,--no-whole-archive -lpthread -lm -lresolv -g -O2 -pthread -ggdb3 -g -O1 -pthread -Wl,-soname -Wl,gpu_nvml.so -o .libs/gpu_nvml.so
libtool: link: (cd .libs/gpu_nvml.lax/libgpu_common.a && ar x "/root/slurm-23.02.3/src/plugins/gpu/nvml/../common/.libs/libgpu_common.a")
libtool: link: ar cr .libs/gpu_nvml.a gpu_nvml.o .libs/gpu_nvml.lax/libgpu_common.a/gpu_common.o
libtool: link: ranlib .libs/gpu_nvml.a
libtool: link: rm -fr .libs/gpu_nvml.lax
libtool: link: ( cd ".libs" && rm -f "gpu_nvml.la" && ln -s "../gpu_nvml.la" "gpu_nvml.la" )
make[5]: Leaving directory '/root/slurm-23.02.3/src/plugins/gpu/nvml'
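One thing I notice in the link commands above is that -lnvidia-ml doesn't appear anywhere. I don't know whether it is supposed to be linked in explicitly, but I assume a quick check of the generated Makefile would show whether configure ever put it there, e.g.:

grep -- '-lnvidia-ml' src/plugins/gpu/nvml/Makefile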


On the other hand, slurmd itself isn't linked against libnvidia-ml.so, which I had expected it to be:

root@zorn:~/slurm-23.02.3# ldd /usr/local/sbin/slurmd
        linux-vdso.so.1 (0x00007ffd110f5000)
        liblz4.so.1 => /lib/x86_64-linux-gnu/liblz4.so.1 (0x00007faee7d3e000)
        libslurmfull.so => /usr/local/lib/slurm/libslurmfull.so (0x00007faee7b47000)
        libnuma.so.1 => /lib/x86_64-linux-gnu/libnuma.so.1 (0x00007faee7b3a000)
        libpam.so.0 => /lib/x86_64-linux-gnu/libpam.so.0 (0x00007faee7b28000)
        libpam_misc.so.0 => /lib/x86_64-linux-gnu/libpam_misc.so.0 (0x00007faee7b23000)
        libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007faee7a42000)
        libresolv.so.2 => /lib/x86_64-linux-gnu/libresolv.so.2 (0x00007faee7a31000)
        libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007faee7850000)
        /lib64/ld-linux-x86-64.so.2 (0x00007faee7da6000)
        libaudit.so.1 => /lib/x86_64-linux-gnu/libaudit.so.1 (0x00007faee781f000)
        libcap-ng.so.0 => /lib/x86_64-linux-gnu/libcap-ng.so.0 (0x00007faee7817000)
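Thinking about it, though, the NVML support presumably lives in the gpu_nvml.so plugin that slurmd loads at runtime rather than in the slurmd binary itself, so maybe ldd on the installed plugin is the more telling check; something like:

ldd /usr/local/lib/slurm/gpu_nvml.so | grep -i nvidia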

Also, slurmd.log doesn't mention the GPU at all, even with SlurmdDebug=debug2 set in slurm.conf (other debug2 entries do show up).
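If I'm reading the slurmd man page right, I should also be able to ask slurmd directly which GRES it detects, either with

slurmd -G

or by running it in the foreground with extra verbosity (slurmd -D -vvv) and watching for the GPU autodetection messages.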

I've set 'AutoDetect=nvml' in gres.conf and 'GresTypes=gpu' in slurm.conf (see the snippet below); shouldn't that be enough by now, or is there still something wrong with my build of Slurm?
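That is, roughly this kind of setup (the NodeName line is just illustrative, not copied verbatim from my config):

# gres.conf
AutoDetect=nvml

# slurm.conf
GresTypes=gpu
NodeName=zorn Gres=gpu:1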


On 19/07/2023 12:26, Timo Rothenpieler wrote:
On 19/07/2023 11:47, Jan Andersen wrote:
I'm trying to build slurm with nvml support, but configure doesn't find it:

root@zorn:~/slurm-23.02.3# ./configure --with-nvml
...
checking for hwloc installation... /usr
checking for nvml.h... no
checking for nvmlInit in -lnvidia-ml... yes
configure: error: unable to locate libnvidia-ml.so and/or nvml.h

But:

root@zorn:~/slurm-23.02.3# find / -xdev -name nvml.h
/usr/include/hwloc/nvml.h

It's not looking for the hwloc header, but for the nvidia one.
If you have your CUDA SDK installed in, for example, /opt/cuda, you have to point it there: --with-nvml=/opt/cuda

root@zorn:~/slurm-23.02.3# find / -xdev -name libnvidia-ml.so
/usr/lib32/libnvidia-ml.so
/usr/lib/x86_64-linux-gnu/libnvidia-ml.so

I tried to figure out how to tell configure where to find them, but the script is a bit eye-watering; how should I go about it?



