在 2021/10/13 9:22, Brian Andrus 写道:

Something is very odd when you have the node reporting:

RealMemory=1 AllocMem=0 FreeMem=47563 Sockets=2 Boards=1

What do you get when you run ‘slurmd -C’ on the node?

# slurmd -C
NodeName=apollo CPUs=36 Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore=1 RealMemory=128306
UpTime=22-16:14:48

Brian Andrus

*From: *Adam Xu <mailto:adam...@adagene.com.cn>
*Sent: *Tuesday, October 12, 2021 6:07 PM
*To: *slurm-users@lists.schedmd.com
*Subject: *Re: [slurm-users] job is pending but resources are available

在 2021/10/12 21:21, Adam Xu 写道:

    Hi All,

    OS: Rocky Linux 8.4

    slurm version: 20.11.7

    The partition's name is apollo, and the node's name is apollo too. The
    node has 36 CPU cores and 8 GPUs in it.

    partition info

    $ scontrol show partition apollo
    PartitionName=apollo
       AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
       AllocNodes=ALL Default=NO QoS=N/A
       DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=NO
    GraceTime=0 Hidden=NO
       MaxNodes=UNLIMITED MaxTime=UNLIMITED MinNodes=0 LLN=NO
    MaxCPUsPerNode=UNLIMITED
       Nodes=apollo
       PriorityJobFactor=1 PriorityTier=1 RootOnly=NO ReqResv=NO
    OverSubscribe=YES:36
       OverTimeLimit=NONE PreemptMode=OFF
       State=UP TotalCPUs=36 TotalNodes=1 SelectTypeParameters=NONE
       JobDefaults=(null)
       DefMemPerNode=UNLIMITED MaxMemPerNode=UNLIMITED

    node info

    $ scontrol show node apollo
    NodeName=apollo Arch=x86_64 CoresPerSocket=18
       CPUAlloc=28 CPUTot=36 CPULoad=7.02
       AvailableFeatures=(null)
       ActiveFeatures=(null)
       Gres=gpu:v100:8,mps:v100:800
       NodeAddr=apollo NodeHostName=apollo Version=20.11.7
       OS=Linux 4.18.0-305.19.1.el8_4.x86_64 #1 SMP Wed Sep 15
    19:12:32 UTC 2021
       RealMemory=1 AllocMem=0 FreeMem=47563 Sockets=2 Boards=1
       State=MIXED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A
    MCS_label=N/A
       Partitions=apollo
       BootTime=2021-09-20T23:43:49 SlurmdStartTime=2021-10-12T16:55:44
       CfgTRES=cpu=36,mem=1M,billing=36
       AllocTRES=cpu=28
       CapWatts=n/a
       CurrentWatts=0 AveWatts=0
       ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
       Comment=(null)

    Now I have 7 jobs running, but when I submit an 8th job, it stays
    pending with the reason "Resources".

    $ squeue
                 JOBID PARTITION     NAME     USER ST       TIME NODES
    NODELIST(REASON)
                   879    apollo    do.sh zhining_ PD       0:00 1
    (Resources)
                   489    apollo    do.sh zhining_  R 13-12:50:45 1
    apollo
                   490    apollo    do.sh zhining_  R 13-12:41:00 1
    apollo
                   592    apollo runme-gp junwen_f  R 4-12:42:31 1 apollo
                   751    apollo runme-gp junwen_f  R 1-12:48:20 1 apollo
                   752    apollo runme-gp junwen_f  R 1-12:48:10 1 apollo
                   871    apollo runme-gp junwen_f  R    7:13:45 1 apollo
                   872    apollo runme-gp junwen_f  R    7:12:42 1 apollo

    $ scontrol show job 879
    JobId=879 JobName=do.sh
       UserId=zhining_wan(1001) GroupId=zhining_wan(1001) MCS_label=N/A
       Priority=4294900882 Nice=0 Account=(null) QOS=(null)
       JobState=PENDING Reason=Resources Dependency=(null)
       Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
       RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A
       SubmitTime=2021-10-12T16:29:29 EligibleTime=2021-10-12T16:29:29
       AccrueTime=2021-10-12T16:29:29
       StartTime=2021-10-12T21:17:41 EndTime=Unknown Deadline=N/A
       SuspendTime=None SecsPreSuspend=0
    LastSchedEval=2021-10-12T21:17:39
       Partition=apollo AllocNode:Sid=sms:1281191
       ReqNodeList=(null) ExcNodeList=(null)
       NodeList=(null) SchedNodeList=apollo
       NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
       TRES=cpu=4,node=1,billing=4
       Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
       MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
       Features=(null) DelayBoot=00:00:00
       OverSubscribe=YES Contiguous=0 Licenses=(null) Network=(null)
    
       Command=/home/zhining_wan/job/2021/20210603_ctla4_double_bilayer/final_pdb_minimize/amber/nolipid/test/do.sh
       WorkDir=/home/zhining_wan/job/2021/20210603_ctla4_double_bilayer/final_pdb_minimize/amber/nolipid/test
       StdErr=/home/zhining_wan/job/2021/20210603_ctla4_double_bilayer/final_pdb_minimize/amber/nolipid/test/slurm-879.out
       StdIn=/dev/null
       StdOut=/home/zhining_wan/job/2021/20210603_ctla4_double_bilayer/final_pdb_minimize/amber/nolipid/test/slurm-879.out

       Power=
       TresPerNode=gpu:1
       NtasksPerTRES:0

    After running 7 jobs, the node has 8 CPU cores and 1 GPU left, so
    I can be sure that the remaining resources are sufficient. So why
    is the job pending with reason "Resources"?

Some information to add:

I killed some jobs with kill instead of scancel. Could this be the cause of this result?

Reply via email to