I thought I had taken care of this a while back, but it appears the issue has returned. A very simple sbatch of slurmhello.sh: cat slurmhello.sh #!/bin/sh #SBATCH -o my.stdout #SBATCH -N 3 #SBATCH --ntasks=16 module add shared openmpi/gcc/64/1.10.7 slurm mpirun hello
sbatch slurmhello.sh Submitted batch job 419 squeue JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 419 defq slurmhel root PD 0:00 3 (Resources) In /etc/slurm/slurm.conf: # Nodes NodeName=node[001-003] CoresPerSocket=12 RealMemory=196489092 Sockets=2 Gres=gpu:1 Logs show: [2019-08-29T14:24:40.025] error: _slurm_rpc_node_registration node=node001: Invalid argument [2019-08-29T14:24:40.025] error: Node node002 has low real_memory size (191840 < 196489092) [2019-08-29T14:24:40.025] error: _slurm_rpc_node_registration node=node002: Invalid argument [2019-08-29T14:24:40.026] error: Node node003 has low real_memory size (191840 < 196489092) [2019-08-29T14:24:40.026] error: _slurm_rpc_node_registration node=node003: Invalid argument scontrol show jobid -dd 419 JobId=419 JobName=slurmhello.sh UserId=root(0) GroupId=root(0) MCS_label=N/A Priority=4294901759 Nice=0 Account=root QOS=normal JobState=PENDING Reason=Resources Dependency=(null) Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0 DerivedExitCode=0:0 RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A SubmitTime=2019-08-28T09:54:22 EligibleTime=2019-08-28T09:54:22 StartTime=Unknown EndTime=Unknown Deadline=N/A PreemptTime=None SuspendTime=None SecsPreSuspend=0 LastSchedEval=2019-08-28T09:57:22 Partition=defq AllocNode:Sid=ourcluster:194152 ReqNodeList=(null) ExcNodeList=(null) NodeList=(null) NumNodes=3-3 NumCPUs=16 NumTasks=16 CPUs/Task=1 ReqB:S:C:T=0:0:*:* TRES=cpu=16,node=3 Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=* MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0 Features=(null) DelayBoot=00:00:00 Gres=(null) Reservation=(null) OverSubscribe=YES Contiguous=0 Licenses=(null) Network=(null) Command=/root/slurmhello.sh WorkDir=/root StdErr=/root/my.stdout StdIn=/dev/null StdOut=/root/my.stdout Power= scontrol show nodes node001 NodeName=node001 Arch=x86_64 CoresPerSocket=12 CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.06 AvailableFeatures=(null) ActiveFeatures=(null) Gres=gpu:1 NodeAddr=node001 
NodeHostName=node001 Version=17.11 OS=Linux 3.10.0-862.2.3.el7.x86_64 #1 SMP Wed May 9 18:05:47 UTC 2018 RealMemory=196489092 AllocMem=0 FreeMem=99923 Sockets=2 Boards=1 State=IDLE+DRAIN ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A Partitions=defq BootTime=2019-07-18T12:08:41 SlurmdStartTime=2019-07-18T12:09:44 CfgTRES=cpu=24,mem=196489092M,billing=24 AllocTRES= CapWatts=n/a CurrentWatts=0 LowestJoules=0 ConsumedJoules=0 ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s Reason=Low RealMemory [slurm@2019-07-18T10:17:24] [root@ciscluster ~]# scontrol show nodes| grep -i mem RealMemory=196489092 AllocMem=0 FreeMem=99923 Sockets=2 Boards=1 CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory [slurm@2019-07-18T10:17:24] RealMemory=196489092 AllocMem=0 FreeMem=180969 Sockets=2 Boards=1 CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory [slurm@2019-07-18T10:17:24] RealMemory=196489092 AllocMem=0 FreeMem=178999 Sockets=2 Boards=1 CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory [slurm@2019-07-18T10:17:24] sinfo -R REASON USER TIMESTAMP NODELIST Low RealMemory slurm 2019-07-18T10:17:24 node[001-003] sinfo -N NODELIST NODES PARTITION STATE node001 1 defq* drain node002 1 defq* drain node003 1 defq* drain pdsh -w node00[1-3] "lscpu | grep -iE 'socket|core'" node002: Thread(s) per core: 1 node002: Core(s) per socket: 12 node002: Socket(s): 2 node001: Thread(s) per core: 1 node001: Core(s) per socket: 12 node001: Socket(s): 2 node003: Thread(s) per core: 2 node003: Core(s) per socket: 12 node003: Socket(s): 2 scontrol show nodes| grep -i mem RealMemory=196489092 AllocMem=0 FreeMem=100054 Sockets=2 Boards=1 CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory [slurm@2019-07-18T10:17:24] RealMemory=196489092 AllocMem=0 FreeMem=181101 Sockets=2 Boards=1 CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory [slurm@2019-07-18T10:17:24] RealMemory=196489092 AllocMem=0 FreeMem=179004 Sockets=2 Boards=1 
CfgTRES=cpu=24,mem=196489092M,billing=24 Reason=Low RealMemory Does anything look off?