Hi,

yes, ConstrainDevices is set:

###
# Slurm cgroup support configuration file
###
CgroupAutomount=yes
#
#CgroupMountpoint="/sys/fs/cgroup"
ConstrainCores=yes
ConstrainDevices=yes
ConstrainRAMSpace=yes
#
#
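
In case it is useful, this is roughly how the constraint can be checked from inside a job (a sketch, assuming cgroup v1 and the default Slurm cgroup hierarchy; the exact paths may differ on your system):

# Which GPUs the job step can actually see, and which cgroup it runs in
srun --gpus=1 bash -c 'echo CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES; nvidia-smi -L; cat /proc/self/cgroup'

# On the compute node, the devices allowed for the job's cgroup
# (cgroup v1 devices controller; uid/job/step below are wildcards, not real IDs)
cat /sys/fs/cgroup/devices/slurm/uid_*/job_*/step_*/devices.list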

I have attached the slurm configuration file as well.

Cheers
Dominik

On 27.10.2022 at 17:57, Sean Maxwell wrote:
Hi Dominik,

Do you have ConstrainDevices=yes set in your cgroup.conf?

Best,

-Sean

On Thu, Oct 27, 2022 at 11:49 AM Dominik Baack <dominik.ba...@cs.uni-dortmund.de> wrote:

    Hi,

    We are in the process of setting up Slurm on some DGX A100 nodes. We are
    experiencing the problem that all GPUs are visible to users, even for
    jobs that should be assigned only one.

    It seems the request is forwarded correctly to the node: CUDA_VISIBLE_DEVICES
    is set to the correct ID, but it appears to be ignored by the rest of the
    system.

    Cheers
    Dominik Baack

    Example:

    baack@gwkilab:~$ srun --gpus=1 nvidia-smi
    Thu Oct 27 17:39:04 2022
    +-----------------------------------------------------------------------------+
    | NVIDIA-SMI 470.141.03   Driver Version: 470.141.03   CUDA Version: 11.4     |
    |-------------------------------+----------------------+----------------------+
    | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
    | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
    |                               |                      |               MIG M. |
    |===============================+======================+======================|
    |   0  NVIDIA A100-SXM...  On   | 00000000:07:00.0 Off |                    0 |
    | N/A   28C    P0    52W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   1  NVIDIA A100-SXM...  On   | 00000000:0F:00.0 Off |                    0 |
    | N/A   28C    P0    51W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   2  NVIDIA A100-SXM...  On   | 00000000:47:00.0 Off |                    0 |
    | N/A   28C    P0    52W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   3  NVIDIA A100-SXM...  On   | 00000000:4E:00.0 Off |                    0 |
    | N/A   29C    P0    54W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   4  NVIDIA A100-SXM...  On   | 00000000:87:00.0 Off |                    0 |
    | N/A   34C    P0    57W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   5  NVIDIA A100-SXM...  On   | 00000000:90:00.0 Off |                    0 |
    | N/A   31C    P0    55W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   6  NVIDIA A100-SXM...  On   | 00000000:B7:00.0 Off |                    0 |
    | N/A   31C    P0    51W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+
    |   7  NVIDIA A100-SXM...  On   | 00000000:BD:00.0 Off |                    0 |
    | N/A   32C    P0    52W / 400W |      0MiB / 40536MiB |      0%      Default |
    |                               |                      |             Disabled |
    +-------------------------------+----------------------+----------------------+

    +-----------------------------------------------------------------------------+
    | Processes:                                                                  |
    |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
    |        ID   ID                                                   Usage      |
    |=============================================================================|
    |  No running processes found                                                 |
    +-----------------------------------------------------------------------------+
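
    For comparison, with the device cgroup enforced only the requested GPU should
    be visible, roughly like this (illustrative output, not captured from these
    nodes; UUID elided):

    baack@gwkilab:~$ srun --gpus=1 nvidia-smi -L
    GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-...)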

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=ML2R
SlurmctldHost=gwkilab
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=/etc/slurm/epilog.d/*
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
GresTypes=gpu
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
MpiDefault=none
#MpiParams=ports=#-#
#PluginDir=
PlugStackConfig=/etc/slurm/plugstack.conf
#PrivateData=jobs
ProctrackType=proctrack/cgroup
Prolog=/etc/slurm/prolog.d/*
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=0
SlurmctldPidFile=/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
SrunPortRange=60001-63000
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
#TaskEpilog=
TaskPlugin=task/affinity
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
MessageTimeout=60
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/builtin
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU_Memory
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/basic
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
AccountingStorageType=accounting_storage/none
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=ml2ran01 NodeAddr=ml2ran01e0 CPUs=256 Sockets=2 RealMemory=1031875 CoresPerSocket=64 ThreadsPerCore=2 Gres=gpu:nvidia_a100-sxm4-40gb:8 State=UNKNOWN
NodeName=ml2ran03 NodeAddr=ml2ran03e0 CPUs=256 Sockets=2 RealMemory=1031875 CoresPerSocket=64 ThreadsPerCore=2 Gres=gpu:nvidia_a100-sxm4-40gb:8 State=UNKNOWN
NodeName=ml2ran05 NodeAddr=ml2ran05e0 CPUs=256 Sockets=2 RealMemory=1031875 CoresPerSocket=64 ThreadsPerCore=2 Gres=gpu:nvidia_a100-sxm4-40gb:8 State=UNKNOWN
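
For reference, ConstrainDevices in cgroup.conf is enforced by the task/cgroup plugin, so a minimal sketch of the slurm.conf lines that cgroup-based device constraint typically relies on (assuming cgroup enforcement is what is intended here) looks like:

ProctrackType=proctrack/cgroup
TaskPlugin=task/affinity,task/cgroup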
