Hi, when I was testing slurm-19.05.3 with openmpi-4.0.1, pmix-3.1.3rc4, and ucx-1.6.1 (with IB), I got a different error from the one in Bug 7646 (https://bugs.schedmd.com/show_bug.cgi?id=7646). At first, a job like "srun --mpi=pmix_v3 xxx" could run with "SLURM_PMIX_DIRECT_CONN=true" and "SLURM_PMIX_DIRECT_CONN_UCX=false", but the job ended immediately when "SLURM_PMIX_DIRECT_CONN_UCX=true" was configured, and I got an error like "Cannot get polling fd" after "Fail to create UCX worker: Input/output error". I've confirmed that the error message comes from ucp_worker_create, so I tried to dump the config after ucp_config_read("SLURM", NULL, &config) and the ucp_context after ucp_init(&ucp_params, config, &ucp_context). UCX_NET_DEVICES=all UCX_SHM_DEVICES=all UCX_ACC_DEVICES=all UCX_SELF_DEVICES=all UCX_TLS=all UCX_ALLOC_PRIO=md:sysv,md:posix,huge,thp,md:*,mmap,heap UCX_SOCKADDR_AUX_TLS=ud,ud_x UCX_WARN_INVALID_CONFIG=y UCX_BCOPY_THRESH=0 UCX_RNDV_THRESH=auto UCX_RNDV_SEND_NBR_THRESH=256k UCX_RNDV_THRESH_FALLBACK=inf UCX_RNDV_PERF_DIFF=1.000 UCX_MAX_EAGER_RAILS=1 UCX_MAX_RNDV_RAILS=1 UCX_RNDV_SCHEME=auto UCX_ZCOPY_THRESH=auto UCX_BCOPY_BW=5800m UCX_ATOMIC_MODE=guess UCX_MAX_WORKER_NAME=32 UCX_USE_MT_MUTEX=n UCX_ADAPTIVE_PROGRESS=y UCX_SEG_SIZE=8k UCX_TM_THRESH=1k UCX_TM_MAX_BB_SIZE=1k UCX_TM_FORCE_THRESH=8k UCX_NUM_EPS=auto UCX_RNDV_FRAG_SIZE=256k UCX_MEMTYPE_CACHE=y UCX_FLUSH_WORKER_EPS=y UCX_UNIFIED_MODE=n hello 2 UCX_NET_DEVICES=all UCX_SHM_DEVICES=all UCX_ACC_DEVICES=all UCX_SELF_DEVICES=all UCX_TLS=all UCX_ALLOC_PRIO=md:sysv,md:posix,huge,thp,md:*,mmap,heap UCX_SOCKADDR_AUX_TLS=ud,ud_x UCX_WARN_INVALID_CONFIG=y UCX_BCOPY_THRESH=0 UCX_RNDV_THRESH=auto UCX_RNDV_SEND_NBR_THRESH=256k UCX_RNDV_THRESH_FALLBACK=inf UCX_RNDV_PERF_DIFF=1.000 UCX_MAX_EAGER_RAILS=1 UCX_MAX_RNDV_RAILS=1 UCX_RNDV_SCHEME=auto UCX_ZCOPY_THRESH=auto UCX_BCOPY_BW=5800m UCX_ATOMIC_MODE=guess UCX_MAX_WORKER_NAME=32 UCX_USE_MT_MUTEX=n UCX_ADAPTIVE_PROGRESS=y UCX_SEG_SIZE=8k UCX_TM_THRESH=1k UCX_TM_MAX_BB_SIZE=1k 
UCX_TM_FORCE_THRESH=8k UCX_NUM_EPS=auto UCX_RNDV_FRAG_SIZE=256k UCX_MEMTYPE_CACHE=y UCX_FLUSH_WORKER_EPS=y UCX_UNIFIED_MODE=n # # UCP context # # md 0 : self # md 1 : tcp # md 2 : ib/mlx5_3 # md 3 : ib/mlx5_2 # md 4 : ib/mlx5_1 # md 5 : ib/mlx5_0 # md 6 : rdmacm # md 7 : sysv # md 8 : posix # md 9 : cma # md 10 : knem # # resource 0 : md 0 dev 0 flags -- self/self # resource 1 : md 1 dev 1 flags -- tcp/ib0 # resource 2 : md 1 dev 2 flags -- tcp/eno1 # resource 3 : md 2 dev 3 flags -- rc/mlx5_3:1 # resource 4 : md 2 dev 3 flags -- rc_mlx5/mlx5_3:1 # resource 5 : md 2 dev 3 flags -- dc/mlx5_3:1 # resource 6 : md 2 dev 3 flags -- dc_mlx5/mlx5_3:1 # resource 7 : md 2 dev 3 flags -- ud/mlx5_3:1 # resource 8 : md 2 dev 3 flags -- ud_mlx5/mlx5_3:1 # resource 9 : md 2 dev 3 flags -- cm/mlx5_3:1 # resource 10 : md 3 dev 4 flags -- rc/mlx5_2:1 # resource 11 : md 3 dev 4 flags -- rc_mlx5/mlx5_2:1 # resource 12 : md 3 dev 4 flags -- dc/mlx5_2:1 # resource 13 : md 3 dev 4 flags -- dc_mlx5/mlx5_2:1 # resource 14 : md 3 dev 4 flags -- ud/mlx5_2:1 # resource 15 : md 3 dev 4 flags -- ud_mlx5/mlx5_2:1 # resource 16 : md 3 dev 4 flags -- cm/mlx5_2:1 # resource 17 : md 4 dev 5 flags -- rc/mlx5_1:1 # resource 18 : md 4 dev 5 flags -- rc_mlx5/mlx5_1:1 # resource 19 : md 4 dev 5 flags -- dc/mlx5_1:1 # resource 20 : md 4 dev 5 flags -- dc_mlx5/mlx5_1:1 # resource 21 : md 4 dev 5 flags -- ud/mlx5_1:1 # resource 22 : md 4 dev 5 flags -- ud_mlx5/mlx5_1:1 # resource 23 : md 4 dev 5 flags -- cm/mlx5_1:1 # resource 24 : md 5 dev 6 flags -- rc/mlx5_0:1 # resource 25 : md 5 dev 6 flags -- rc_mlx5/mlx5_0:1 # resource 26 : md 5 dev 6 flags -- dc/mlx5_0:1 # resource 27 : md 5 dev 6 flags -- dc_mlx5/mlx5_0:1 # resource 28 : md 5 dev 6 flags -- ud/mlx5_0:1 # resource 29 : md 5 dev 6 flags -- ud_mlx5/mlx5_0:1 # resource 30 : md 5 dev 6 flags -- cm/mlx5_0:1 # resource 31 : md 6 dev 7 flags -s rdmacm/sockaddr # resource 32 : md 7 dev 8 flags -- mm/sysv # resource 33 : md 8 dev 9 flags -- mm/posix # 
resource 34 : md 9 dev 10 flags -- cma/cma # resource 35 : md 10 dev 11 flags -- knem/knem #
Looking forward to your reply.