Package: watchdog
Severity: normal
Tags: patch

Hi,

Sorry for the big fat pause on this - the hardware which I was debugging
is now in production, and I don't have another example to test on at the
moment.  I got the Dell PE860 working by switching to the iTCO_wdt
watchdog device instead.

I'm currently setting up another Box - an Intel SSR212MC2 which is
showing similar behaviour with this ipmi_watchdog module.

Some observations:

The watchdog init scripts seem to be broken.  They do not correctly wait
for processes to exit before attempting to start replacement processes.

Upon receiving SIGTERM, the watchdog daemon sometimes waits 5 seconds
between closing the watchdog device and exiting.  This unnecessarily
delays the boot / shutdown process, and could also be risky
(where no_way_out is used, it delays the start of the wd_keepalive
daemon).

I've made some changes to the init scripts which seem to fix the issues
on the SSR212MC2, but I'm not currently able to retest things on the
PE860.

Tim.
--- /root/watchdog.old  2010-01-08 15:57:17.000000000 +0000
+++ /root/watchdog      2010-01-08 15:54:32.000000000 +0000
@@ -31,10 +31,12 @@
 [ -e /etc/default/watchdog ] && . /etc/default/watchdog
 
 NAME=watchdog
-WD_NAME=wd_keepalive
+KEEPALIVE_NAME=wd_keepalive
 
 DAEMON=/usr/sbin/watchdog
-WD_DAEMON=/usr/sbin/wd_keepalive
+KEEPALIVE_DAEMON=/usr/sbin/wd_keepalive
+
+STOP_RETRY_SCHEDULE='TERM/10/forever/KILL/1'
 
 # Get lsb functions
 . /lib/lsb/init-functions
@@ -47,15 +49,14 @@
        [ ${watchdog_module:-none} != "none" ] && /sbin/modprobe $watchdog_module
        # make sure that wd_keepalive is stopped
        log_begin_msg "Stopping watchdog keepalive daemon..."
-        start-stop-daemon --stop --quiet -oknodo \
-            --pidfile  /var/run/$WD_NAME.pid
+        start-stop-daemon --stop --quiet -oknodo --retry $STOP_RETRY_SCHEDULE \
+            --pidfile  /var/run/$KEEPALIVE_NAME.pid
         log_end_msg $?
 
-       sleep 5
        # Unconditionally start watchdog daemon because we want to run it even
        # if wd_keepalive wasn't running
        log_begin_msg "Starting watchdog daemon..."
-       start-stop-daemon --start --quiet \
+       start-stop-daemon --start --quiet --pidfile /var/run/$NAME.pid \
              --exec $DAEMON -- $watchdog_options
        log_end_msg $?
     fi
@@ -65,17 +66,13 @@
     if [ $run_watchdog = 1 ]
     then
        log_begin_msg "Stopping watchdog daemon..."
-       if start-stop-daemon --stop --quiet \
+       start-stop-daemon --stop --quiet --retry $STOP_RETRY_SCHEDULE \
            --pidfile /var/run/$NAME.pid
-       then
-           log_end_msg $?
-
-           sleep 5
-           # make sure that wd_keepalive is started
-           log_begin_msg "Starting watchdog keepalive daemon..."
-            start-stop-daemon --start --quiet \
-               --exec $WD_DAEMON -- $watchdog_options
-       fi
+       log_end_msg $?
+       # make sure that wd_keepalive is started
+       log_begin_msg "Starting watchdog keepalive daemon..."
+       start-stop-daemon --start --quiet \
+           --exec $KEEPALIVE_DAEMON -- $watchdog_options
        log_end_msg $?
     fi
     ;;
@@ -86,9 +83,11 @@
 
   force-reload)
     if [ $run_watchdog = 0 ]; then exit 0; fi
-    log_begin_msg "Restarting $NAME daemon..."
-    start-stop-daemon --stop --quiet --pidfile /var/run/$NAME.pid
-    sleep 5
+    log_daemon_msg "Restarting ${NAME}"
+    log_progress_msg "Stopping $NAME daemon..."
+    start-stop-daemon --stop --pidfile /var/run/$NAME.pid --quiet \
+       --retry $STOP_RETRY_SCHEDULE || log_end_msg $?
+    log_progress_msg "Starting $NAME daemon..."
     start-stop-daemon --start --quiet --pidfile /var/run/$NAME.pid \
        --exec $DAEMON -- $watchdog_options
     log_end_msg $?
--- /root/wd_keepalive.old      2010-01-08 15:57:15.000000000 +0000
+++ /root/wd_keepalive  2010-01-08 15:54:36.000000000 +0000
@@ -32,6 +32,8 @@
 NAME=wd_keepalive
 DAEMON=/usr/sbin/wd_keepalive
 
+STOP_RETRY_SCHEDULE='TERM/10/forever/KILL/1'
+
 case "$1" in
   start)
     if [ $run_watchdog = 1 ]
@@ -52,8 +54,8 @@
     if [ $run_watchdog = 1 ]
     then
        echo -n "Stopping watchdog keepalive daemon: "
-       if start-stop-daemon --stop --quiet \
-           --pidfile /var/run/$NAME.pid
+       if start-stop-daemon --stop --quiet --pidfile /var/run/$NAME.pid \
+           --retry $STOP_RETRY_SCHEDULE
        then
            echo wd_keepalive.
        else
@@ -69,9 +71,8 @@
   force-reload)
     if [ $run_watchdog = 0 ]; then exit 0; fi
     echo -n "Restarting $NAME daemon."
-    start-stop-daemon --stop --quiet --pidfile /var/run/$NAME.pid
-    echo -n "."
-    sleep 5
+    start-stop-daemon --stop --quiet --pidfile /var/run/$NAME.pid \
+       --retry $STOP_RETRY_SCHEDULE
     echo -n "."
     if start-stop-daemon --start --quiet --pidfile /var/run/$NAME.pid \
        --exec $DAEMON -- $watchdog_options

Reply via email to