Hi all,

We had to restart the slurmdbd service yesterday on one of our clusters running
Slurm 17.11.7, since folks were seeing errors both with job scheduling and when
running 'sacct':

-----
$ sacct -X -p -o jobid,jobname,user,partition%-30,nodelist,alloccpus,reqmem,cputime,qos,state,exitcode,AllocTRES%-50 -s R --allusers
sacct: error: slurm_persist_conn_open_without_init: failed to open persistent connection to captain1:6819: Connection refused
sacct: error: slurmdbd: Sending PersistInit msg: Connection refused
sacct: error: Problem talking to the database: Connection refused
-----
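
For reference, the sanity checks I'd use to confirm slurmdbd is back up and
listening again would be along these lines (port 6819 is taken from the error
above, and systemd is assumed):

-----
root@captain1:/var/log# systemctl status slurmdbd
root@captain1:/var/log# ss -lntp | grep 6819
root@captain1:/var/log# sacctmgr list cluster
-----

sacctmgr talks to slurmdbd directly, so if that last command lists the cluster
without a "Connection refused" error, the persistent connection is back.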

Looking in the slurmdbd log after the restart, I see a large number of messages
like these:

-----
[2019-05-07T07:35:17.000] debug2: DBD_MODIFY_RESV: called
[2019-05-07T07:35:17.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:35:17.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:35:35.000] debug2: DBD_MODIFY_RESV: called
[2019-05-07T07:35:35.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:35:35.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:35:53.000] debug2: DBD_MODIFY_RESV: called
[2019-05-07T07:35:53.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:35:53.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:36:11.000] debug2: DBD_MODIFY_RESV: called
[2019-05-07T07:36:11.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
[2019-05-07T07:36:11.001] error: There is no reservation by id 4, time_start 1555628209, and cluster 'rescluster'
-----

I read today's list message entitled "Slurm database failure messages", and
although our situation is different, it linked to a bug report dealing with
reservation problems. That report suggested gathering data with three commands;
their output on our cluster is shown below:

-----
root@captain1:/var/log# scontrol show reservations
ReservationName=res17-pc2 StartTime=2019-02-25T14:58:40 EndTime=2029-02-22T14:58:40 Duration=3650-00:00:00
   Nodes=res17-pc2 NodeCnt=1 CoreCnt=6 Features=(null) PartitionName=desktops Flags=SPEC_NODES
   TRES=cpu=12
   Users=samuel Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a

ReservationName=res18-pc5 StartTime=2019-04-25T11:47:05 EndTime=2020-04-24T11:47:05 Duration=365-00:00:00
   Nodes=res18-pc5 NodeCnt=1 CoreCnt=6 Features=(null) PartitionName=(null) Flags=SPEC_NODES
   TRES=cpu=12
   Users=grv Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a


root@captain1:/var/log# sacctmgr show reservations
   Cluster            Name                           TRES           TimeStart             TimeEnd UnusedWall
---------- --------------- ------------------------------ ------------------- ------------------- ----------
 rescluster        res17-pc2                         cpu=12 2019-04-24T13:29:52 2029-02-22T14:58:40   0.000000


mysql> select * from rescluster_resv_table\G
*************************** 1. row ***************************
    id_resv: 1
    deleted: 1
  assoclist: 12
      flags: 65535
   nodelist: res17-pc2,captain2,server13k,server15k,server25k
   node_inx: 0-4
  resv_name: res17-pc2
 time_start: 1551135476
   time_end: 1551135512
       tres: 1=140
unused_wall: 36
*************************** 2. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1551135520
   time_end: 1551141705
       tres: 1=12
unused_wall: 6176.5
*************************** 3. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1551141705
   time_end: 1551734095
       tres: 1=12
unused_wall: 581590
*************************** 4. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1551734095
   time_end: 1551847812
       tres: 1=12
unused_wall: 117173.666667
*************************** 5. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1551847812
   time_end: 1552353438
       tres: 1=12
unused_wall: 480521
*************************** 6. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1552353438
   time_end: 1554771615
       tres: 1=12
unused_wall: 2367043
*************************** 7. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1554771615
   time_end: 1556137792
       tres: 1=12
unused_wall: 2006236
*************************** 8. row ***************************
    id_resv: 2
    deleted: 0
  assoclist: 12
      flags: 32768
   nodelist: res17-pc2
   node_inx: 0
  resv_name: res17-pc2
 time_start: 1556137792
   time_end: 1866495520
       tres: 1=12
unused_wall: 0
8 rows in set (0.00 sec)
-----

So it seems to me that the reservation records are messed up: the slurmdbd
errors refer to a reservation with id 4 and time_start 1555628209, but the dump
above only contains id_resv values 1 and 2. How should I go about fixing this?
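
For reference, a direct lookup of the record the DBD_MODIFY_RESV requests are
complaining about would be something like the following (table and column names
are taken from the dump above; presumably it comes back empty here):

-----
mysql> SELECT id_resv, resv_name, deleted, time_start, time_end
    ->   FROM rescluster_resv_table
    ->  WHERE id_resv = 4 AND time_start = 1555628209;
-----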

Thanks in advance for any help provided...

