Hi all, please help me resolve this issue.
My compute node (snode) status is UNKNOWN and Reason=NO NETWORK ADDRESS FOUND Master node (smaster) : [root@smaster ~]# cat /etc/slurm/slurm.conf # slurm.conf file generated by configurator easy.html. # Put this file on all nodes of your cluster. # See the slurm.conf man page for more information. # ControlMachine=smaster ControlAddr=192.168.1.195 # #MailProg=/bin/mail MpiDefault=none #MpiParams=ports=#-# ProctrackType=proctrack/pgid ReturnToService=1 SlurmctldPidFile=/var/run/slurmctld.pid #SlurmctldPort=6817 SlurmdPidFile=/var/run/slurmd.pid #SlurmdPort=6818 SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm #SlurmdUser=root StateSaveLocation=/var/spool/slurmctld SwitchType=switch/none TaskPlugin=task/none # # # TIMERS #KillWait=30 #MinJobAge=300 #SlurmctldTimeout=120 #SlurmdTimeout=300 # # # SCHEDULING SchedulerType=sched/backfill SelectType=select/cons_tres SelectTypeParameters=CR_Core # # LOGGING AND ACCOUNTING AccountingStorageType=accounting_storage/none ClusterName=scluster #JobAcctGatherFrequency=30 JobAcctGatherType=jobacct_gather/none #SlurmctldDebug=3 SlurmctldLogFile=/var/log/slurmctld.log #SlurmdDebug=3 SlurmdLogFile=/var/log/slurmd.log # # # COMPUTE NODES NodeName=smaster NodeAddr=192.168.1.195 CPUs=2 RealMemory=1024 State=UNKNOWN NodeName=sndode NodeAddr=192.168.1.196 CPUs=2 RealMemory=1024 State=UNKNOWN #PartitionName=debug Nodes=sndode Default=YES MaxTime=INFINITE State=UP PartitionName=debug Nodes=sndode Default=YES MaxTime=INFINITE State=UP PartitionName=hpc Nodes=smaster Default=YES MaxTime=INFINITE State=UP *On Master Node (smaster):* [root@smaster ~]# sinfo -Nl Tue Feb 02 18:11:00 2021 NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON smaster 1 hpc* idle 2 2:1:1 1024 0 1 (null) none sndode 1 debug unknown* 2 2:1:1 1024 0 1 (null) NO NETWORK ADDRESS F [root@smaster ~]# scontrol show nodes NodeName=smaster Arch=x86_64 CoresPerSocket=1 CPUAlloc=0 CPUTot=2 CPULoad=0.01 AvailableFeatures=(null) ActiveFeatures=(null) 
Gres=(null) NodeAddr=192.168.1.195 NodeHostName=smaster Version=20.11.2 OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 RealMemory=1024 AllocMem=0 FreeMem=4500 Sockets=2 Boards=1 State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A Partitions=hpc BootTime=2021-02-02T10:53:56 SlurmdStartTime=2021-02-02T13:21:10 CfgTRES=cpu=2,mem=1G,billing=2 AllocTRES= CapWatts=n/a CurrentWatts=0 AveWatts=0 ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s Comment=(null) NodeName=sndode CoresPerSocket=1 CPUAlloc=0 CPUTot=2 CPULoad=N/A AvailableFeatures=(null) ActiveFeatures=(null) Gres=(null) NodeAddr=192.168.1.196 NodeHostName=sndode RealMemory=1024 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1 State=UNKNOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A Partitions=debug BootTime=None SlurmdStartTime=None CfgTRES=cpu=2,mem=1G,billing=2 AllocTRES= CapWatts=n/a CurrentWatts=0 AveWatts=0 ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s Reason=NO NETWORK ADDRESS FOUND [slurm@2021-02-02T10:58:11] Comment=(null) [root@smaster ~]# *Compute Node:* [root@snode ~]# for i in munge slurmd; do service $i status; done Redirecting to /bin/systemctl status munge.service ● munge.service - MUNGE authentication service Loaded: loaded (/usr/lib/systemd/system/munge.service; enabled; vendor preset: disabled) Active: active (running) since Tue 2021-02-02 13:29:11 IST; 4h 43min ago Docs: man:munged(8) Process: 17759 ExecStart=/usr/sbin/munged (code=exited, status=0/SUCCESS) Main PID: 17761 (munged) Tasks: 4 Memory: 600.0K CGroup: /system.slice/munge.service └─17761 /usr/sbin/munged Feb 02 13:29:11 snode.calligotech.com systemd[1]: Starting MUNGE authentication service... Feb 02 13:29:11 snode.calligotech.com systemd[1]: Started MUNGE authentication service. 
Redirecting to /bin/systemctl status slurmd.service ● slurmd.service - Slurm node daemon Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled) Active: failed (Result: exit-code) since Tue 2021-02-02 13:29:12 IST; 4h 43min ago Process: 17785 ExecStart=/usr/sbin/slurmd -D $SLURMD_OPTIONS (code=exited, status=1/FAILURE) Main PID: 17785 (code=exited, status=1/FAILURE) Feb 02 13:29:11 snode.calligotech.com systemd[1]: Started Slurm node daemon. Feb 02 13:29:12 snode.calligotech.com systemd[1]: slurmd.service: main process exited, code=exited, status=1/FAILURE Feb 02 13:29:12 snode.calligotech.com systemd[1]: Unit slurmd.service entered failed state. Feb 02 13:29:12 snode.calligotech.com systemd[1]: slurmd.service failed. [root@snode ~]# sinfo -Nl Tue Feb 02 18:12:47 2021 NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON smaster 1 hpc* idle 2 2:1:1 1024 0 1 (null) none sndode 1 debug unknown* 2 2:1:1 1024 0 1 (null) NO NETWORK ADDRESS F [root@snode ~]# scontrol show nodes NodeName=smaster Arch=x86_64 CoresPerSocket=1 CPUAlloc=0 CPUTot=2 CPULoad=0.01 AvailableFeatures=(null) ActiveFeatures=(null) Gres=(null) NodeAddr=192.168.1.195 NodeHostName=smaster Version=20.11.2 OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 RealMemory=1024 AllocMem=0 FreeMem=4502 Sockets=2 Boards=1 State=IDLE ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A Partitions=hpc BootTime=2021-02-02T10:53:56 SlurmdStartTime=2021-02-02T13:21:10 CfgTRES=cpu=2,mem=1G,billing=2 AllocTRES= CapWatts=n/a CurrentWatts=0 AveWatts=0 ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s Comment=(null) NodeName=sndode CoresPerSocket=1 CPUAlloc=0 CPUTot=2 CPULoad=N/A AvailableFeatures=(null) ActiveFeatures=(null) Gres=(null) NodeAddr=192.168.1.196 NodeHostName=sndode RealMemory=1024 AllocMem=0 FreeMem=N/A Sockets=2 Boards=1 State=UNKNOWN* ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A Partitions=debug 
BootTime=None SlurmdStartTime=None CfgTRES=cpu=2,mem=1G,billing=2 AllocTRES= CapWatts=n/a CurrentWatts=0 AveWatts=0 ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s Reason=NO NETWORK ADDRESS FOUND [slurm@2021-02-02T10:58:11] Comment=(null) [root@snode ~]# sinfo PARTITION AVAIL TIMELIMIT NODES STATE NODELIST debug up 1:00:00 1 unk* sndode hpc* up infinite 1 idle smaster [root@snode ~]# Please help me to resolve this issue. Regards, Zain