Hi Team, we are facing an issue in our environment. Resources are free, but jobs are going into the queue (PD state) and are not running.
I have attached the slurm.conf file below. Scenario: there are pending jobs in only two partitions. 344 jobs are in PD state in the normal partition; the nodes belonging to the normal partition are full, so no more jobs can run there. 1300 jobs are queued in the GPUsmall partition, and enough CPUs are available to execute them, but I see the jobs are not being scheduled onto the free nodes. There are no pending jobs in any other partition.

For example, the node status:

node18:
NodeName=node18 Arch=x86_64 CoresPerSocket=18
   CPUAlloc=6 CPUErr=0 CPUTot=36 CPULoad=4.07
   AvailableFeatures=K2200
   ActiveFeatures=K2200
   Gres=gpu:2
   NodeAddr=node18 NodeHostName=node18 Version=17.11
   OS=Linux 4.4.140-94.42-default #1 SMP Tue Jul 17 07:44:50 UTC 2018 (0b375e4)
   RealMemory=1 AllocMem=0 FreeMem=79532 Sockets=2 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=GPUsmall,pm_shared
   BootTime=2019-12-10T14:16:37 SlurmdStartTime=2019-12-10T14:24:08
   CfgTRES=cpu=36,mem=1M,billing=36
   AllocTRES=cpu=6
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

node19:
NodeName=node19 Arch=x86_64 CoresPerSocket=18
   CPUAlloc=16 CPUErr=0 CPUTot=36 CPULoad=15.43
   AvailableFeatures=K2200
   ActiveFeatures=K2200
   Gres=gpu:2
   NodeAddr=node19 NodeHostName=node19 Version=17.11
   OS=Linux 4.12.14-94.41-default #1 SMP Wed Oct 31 12:25:04 UTC 2018 (3090901)
   RealMemory=1 AllocMem=0 FreeMem=63998 Sockets=2 Boards=1
   State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
   Partitions=GPUsmall,pm_shared
   BootTime=2020-03-12T06:51:54 SlurmdStartTime=2020-03-12T06:53:14
   CfgTRES=cpu=36,mem=1M,billing=36
   AllocTRES=cpu=16
   CapWatts=n/a
   CurrentWatts=0 LowestJoules=0 ConsumedJoules=0
   ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s

Could you please help me understand what the reason could be?
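In case it helps with the diagnosis, the reason Slurm reports for each pending GPUsmall job can be pulled with something like the commands below (the <jobid> is only a placeholder for one of the queued jobs):

   # state and pending reason of every queued job in GPUsmall
   squeue -p GPUsmall -t PD -o "%.18i %.9P %.8u %.8T %r"

   # detailed view of one pending job (Reason=... field)
   scontrol show job <jobid>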
cat /etc/slurm/slurm.conf

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
#Running_config_start
#ControlMachine=node0
ControlMachine=slurmmaster
ControlAddr=192.168.150.21
AuthType=auth/munge
CryptoType=crypto/munge
CacheGroups=1
ReturnToService=0
ProctrackType=proctrack/linuxproc
SlurmctldPort=6817
SlurmdPort=6818
SchedulerPort=7321
SlurmctldPidFile=/var/slurm/slurmctld.pid
SlurmdPidFile=/var/slurm/slurmd.pid
SlurmdSpoolDir=/var/slurm/spool/slurmd.%n.spool
StateSaveLocation=/var/slurm/state
SlurmctldLogFile=/var/slurm/log/slurmctld.log
SlurmdLogFile=/var/slurm/log/slurmd.%n.log.%h
SlurmUser=hpcadmin
MpiDefault=none
SwitchType=switch/none
TaskPlugin=task/affinity
TaskPluginParam=Sched
SlurmctldTimeout=120
SlurmdTimeout=300
InactiveLimit=0
KillWait=30
MinJobAge=3600
FastSchedule=1
SchedulerType=sched/builtin
#SchedulerParameters=enable_user_top
SelectType=select/cons_res
#SelectTypeParameters=CR_Core_Memory
SelectTypeParameters=CR_Core
AccountingStorageEnforce=associations
AccountingStorageHost=155.250.126.30
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStoreJobComment=YES
ClusterName=merckhpc
JobCompType=jobcomp/slurmdbd
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=5
SlurmdDebug=5
Waittime=0
#Running_config_end
#ControlAddr=
#BackupController=
#BackupAddr=
#
#CheckpointType=checkpoint/none
#DisableRootJobs=NO
#EnforcePartLimits=NO
Epilog=/etc/slurm/slurm.epilog.clean
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=999999
GresTypes=gpu
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobCheckpointDir=/var/slurm/checkpoint
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=1
#KillOnBadExit=0
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=5000
MaxJobCount=5000000
#MaxStepCount=40000
#MaxTasksPerNode=128
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
#Prolog=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#SallocDefaultCommand=
#SrunEpilog=
#SrunProlog=
#TaskEpilog=
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFs=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
MessageTimeout=100
#ResvOverRun=0
#OverTimeLimit=0
#UnkillableStepTimeout=60
#VSizeFactor=0
SchedulerParameters=enable_user_top,default_queue_depth=1000000
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerRootFilter=1
#SchedulerTimeSlice=30
#
#
# JOB PRIORITY
PriorityType=priority/multifactor
#PriortyFlags=Ticket_Based
#PriorityDecayHalfLife=1-0
PriorityDecayHalfLife=2
#PriorityCalcPeriod=
#PriorityFavorSmall=YES
#PriorityMaxAge=7-0
PriorityUsageResetPeriod=DAILY
#PriorityWeightAge=1000
PriorityWeightFairshare=500000
#PriorityWeightJobSize=1000
#PriorityWeightPartition=1000
#PriorityWeightQOS=
PriorityFlags=FAIR_TREE
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageUser=
#DebugFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
#JobCompUser=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
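If useful, the values actually loaded by slurmctld (as opposed to the file on disk) could be cross-checked on the controller with something like:

   # confirm the scheduler/selection settings the controller is really running with
   scontrol show config | egrep -i 'SchedulerType|SchedulerParameters|SelectType|FastSchedule|DefMemPerCPU'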
NodeName=node[1-12] Sockets=2 CoresPerSocket=10 State=UNKNOWN
NodeName=node[13-16] Sockets=2 CoresPerSocket=10 Feature=HIGHMEM State=UNKNOWN
NodeName=node32 Sockets=2 CoresPerSocket=10 State=UNKNOWN
NodeName=node[17-26] Sockets=2 CoresPerSocket=18 Feature=K2200 Gres=gpu:2
NodeName=node[27] Sockets=2 CoresPerSocket=18 Feature=K40 Gres=gpu:2
NodeName=node[28-31] Sockets=2 CoresPerSocket=28

PartitionName=normal Nodes=node[1-10,13-16,28-31],node32 Default=YES MaxTime=INFINITE State=UP Shared=YES
PartitionName=medium Nodes=node32 Default=NO MaxTime=INFINITE State=UP Shared=YES
PartitionName=GPUsmall Nodes=node[18-19,21-26] Default=NO MaxTime=INFINITE State=UP Shared=YES
PartitionName=priority Nodes=node[17,20] Default=NO MaxTime=INFINITE State=UP Shared=YES Priority=2000 OverSubscribe=NO
PartitionName=smalljobs Nodes=node[12,17,20] Default=NO MaxTime=INFINITE State=UP Shared=YES Priority=100 OverSubscribe=NO
PartitionName=big_scratch Nodes=node[13-16,27-31] Default=NO MaxTime=INFINITE State=UP Shared=YES Priority=100
PartitionName=GPUbig Nodes=node[27] Default=NO MaxTime=INFINITE State=UP Shared=YES
PartitionName=shared Nodes=node[1-10,13-31],lc1 Default=NO MaxTime=INFINITE State=UP Shared=YES
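For completeness, the per-node CPU availability in the GPUsmall partition as the scheduler sees it can be dumped with something like (standard sinfo format options, nothing site-specific assumed):

   # %C prints allocated/idle/other/total CPUs per node; %m memory, %G gres
   sinfo -N -p GPUsmall -o "%N %T %C %m %G"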