On Tue,  6 Dec 2016 11:17:13 -0500 Don Zickus <[email protected]> wrote:

> On an overloaded system, it is possible that a change in the watchdog 
> threshold
> can be delayed long enough to trigger a false positive.
> 
> This can easily be achieved by having a cpu spinning indefinitely on a task,
> while another cpu updates watchdog threshold.
> 
> What happens is while trying to park the watchdog threads, the hrtimers on the
> other cpus trigger and reprogram themselves with the new slower watchdog
> threshold.  Meanwhile, the nmi watchdog is still programmed with the old 
> faster
> threshold.
> 
> Because the one cpu is blocked, it prevents the thread parking on the other
> cpus from completing, which is needed to shutdown the nmi watchdog and
> reprogram it correctly.  As a result, a false positive from the nmi watchdog 
> is
> reported.
> 
> Fix this by setting a park_in_progress flag to block all lockups
> until the parking is complete.
> 
> Fix provided by Ulrich Obergfell.
> 
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -111,6 +111,7 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
>  extern unsigned long watchdog_enabled;
>  extern DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
>  extern unsigned long *watchdog_cpumask_bits;
> +extern atomic_t park_in_progress;
>  extern int sysctl_softlockup_all_cpu_backtrace;
>  extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;

A kernel-wide identifier should really identify which subsystem it
belongs to...


--- 
a/include/linux/nmi.h~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/include/linux/nmi.h
@@ -110,7 +110,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern unsigned long *watchdog_cpumask_bits;
-extern atomic_t park_in_progress;
+extern atomic_t watchdog_park_in_progress;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
--- 
a/kernel/watchdog.c~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/kernel/watchdog.c
@@ -49,7 +49,7 @@ unsigned long *watchdog_cpumask_bits = c
 #define for_each_watchdog_cpu(cpu) \
        for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-atomic_t park_in_progress = ATOMIC_INIT(0);
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
 
 /*
  * The 'watchdog_running' variable is set to 1 when the watchdog threads
@@ -262,7 +262,7 @@ static enum hrtimer_restart watchdog_tim
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-       if (atomic_read(&park_in_progress) != 0)
+       if (atomic_read(&watchdog_park_in_progress) != 0)
                return HRTIMER_NORESTART;
 
        /* kick the hardlockup detector */
@@ -472,7 +472,7 @@ static int watchdog_park_threads(void)
 {
        int cpu, ret = 0;
 
-       atomic_set(&park_in_progress, 1);
+       atomic_set(&watchdog_park_in_progress, 1);
 
        for_each_watchdog_cpu(cpu) {
                ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
@@ -480,7 +480,7 @@ static int watchdog_park_threads(void)
                        break;
        }
 
-       atomic_set(&park_in_progress, 0);
+       atomic_set(&watchdog_park_in_progress, 0);
 
        return ret;
 }
--- 
a/kernel/watchdog_hld.c~kernel-watchdog-prevent-false-hardlockup-on-overloaded-system-fix
+++ a/kernel/watchdog_hld.c
@@ -84,7 +84,7 @@ static void watchdog_overflow_callback(s
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;
 
-       if (atomic_read(&park_in_progress) != 0)
+       if (atomic_read(&watchdog_park_in_progress) != 0)
                return;
 
        if (__this_cpu_read(watchdog_nmi_touch) == true) {
_


Reply via email to