On Mon, Jan 04, 2021 at 04:20:55PM +0100, Frederic Weisbecker wrote:
> Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP
> kthread (rcuog) to be serviced.
> 
> Usually a wake up happening while running the idle task is spotted in
> one of the need_resched() checks carefully placed within the idle loop
> that can break to the scheduler.

Urgh, this is horrific and fragile :/ You having had to audit and fix a
number of rcu_idle_enter() callers should've made you realize that
making rcu_idle_enter() return something would've been saner.

Also, I might hope that when RCU does do that wakeup, it will not have
put RCU in idle mode? So it is a natural 'fail' state for
rcu_idle_enter(), *sigh* it continues to put RCU to sleep, so that needs
fixing too.

I'm thinking that rcu_user_enter() will have the exact same problem? Did
you audit that?

Something like the below, combined with a fixup for all callers (which
the compiler will help us find thanks to __must_check).

---

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de0826411311..612f66c16078 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -95,10 +95,10 @@ static inline void rcu_sysrq_end(void) { }
 #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
-void rcu_user_enter(void);
+bool __must_check rcu_user_enter(void);
 void rcu_user_exit(void);
 #else
-static inline void rcu_user_enter(void) { }
+static inline bool __must_check rcu_user_enter(void) { return true; }
 static inline void rcu_user_exit(void) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index df578b73960f..9ba0c5d9e99e 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -43,7 +43,7 @@ bool rcu_gp_might_be_stalled(void);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
 
-void rcu_idle_enter(void);
+bool __must_check rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
 void rcu_irq_exit(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40e5e3dd253e..13e19e5db0b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -625,7 +625,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
  * the possibility of usermode upcalls having messed up our count
  * of interrupt nesting level during the prior busy period.
  */
-static noinstr void rcu_eqs_enter(bool user)
+static noinstr bool rcu_eqs_enter(bool user)
 {
        struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -636,7 +636,7 @@ static noinstr void rcu_eqs_enter(bool user)
        if (rdp->dynticks_nesting != 1) {
                // RCU will still be watching, so just do accounting and leave.
                rdp->dynticks_nesting--;
-               return;
+               return true;
        }
 
        lockdep_assert_irqs_disabled();
@@ -644,7 +644,14 @@ static noinstr void rcu_eqs_enter(bool user)
        trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, 
atomic_read(&rdp->dynticks));
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && 
!is_idle_task(current));
        rdp = this_cpu_ptr(&rcu_data);
-       do_nocb_deferred_wakeup(rdp);
+       if (do_nocb_deferred_wakeup(rdp)) {
+               /*
+                * We did the wakeup, don't enter EQS, we'll need to abort idle
+                * and schedule.
+                */
+               return false;
+       }
+
        rcu_prepare_for_idle();
        rcu_preempt_deferred_qs(current);
 
@@ -657,6 +664,8 @@ static noinstr void rcu_eqs_enter(bool user)
        rcu_dynticks_eqs_enter();
        // ... but is no longer watching here.
        rcu_dynticks_task_enter();
+
+       return true;
 }
 
 /**
@@ -670,10 +679,10 @@ static noinstr void rcu_eqs_enter(bool user)
  * If you add or remove a call to rcu_idle_enter(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_idle_enter(void)
+bool rcu_idle_enter(void)
 {
        lockdep_assert_irqs_disabled();
-       rcu_eqs_enter(false);
+       return rcu_eqs_enter(false);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
@@ -689,10 +698,10 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
  * If you add or remove a call to rcu_user_enter(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-noinstr void rcu_user_enter(void)
+noinstr bool rcu_user_enter(void)
 {
        lockdep_assert_irqs_disabled();
-       rcu_eqs_enter(true);
+       return rcu_eqs_enter(true);
 }
 #endif /* CONFIG_NO_HZ_FULL */
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..9226f4021a36 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, 
struct rcu_head *rhp,
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
                                 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_cpu_nocb_kthread(int cpu);
 static void __init rcu_spawn_nocb_kthreads(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..8ca41b3fe4f9 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1631,7 +1631,7 @@ bool rcu_is_nocb_cpu(int cpu)
  * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
  * and this function releases it.
  */
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
                           unsigned long flags)
        __releases(rdp->nocb_lock)
 {
@@ -1654,8 +1654,11 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool 
force,
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
        }
        raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-       if (needwake)
+       if (needwake) {
                wake_up_process(rdp_gp->nocb_gp_kthread);
+               return true;
+       }
+       return false;
 }
 
 /*
@@ -2155,17 +2158,19 @@ static int rcu_nocb_need_deferred_wakeup(struct 
rcu_data *rdp)
 static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 {
        unsigned long flags;
+       bool ret;
        int ndw;
 
        rcu_nocb_lock_irqsave(rdp, flags);
        if (!rcu_nocb_need_deferred_wakeup(rdp)) {
                rcu_nocb_unlock_irqrestore(rdp, flags);
-               return;
+               return false;
        }
        ndw = READ_ONCE(rdp->nocb_defer_wakeup);
        WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-       wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+       ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+       return ret;
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@ -2181,10 +2186,12 @@ static void do_nocb_deferred_wakeup_timer(struct 
timer_list *t)
  * This means we do an inexact common-case check.  Note that if
  * we miss, ->nocb_timer will eventually clean things up.
  */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 {
        if (rcu_nocb_need_deferred_wakeup(rdp))
-               do_nocb_deferred_wakeup_common(rdp);
+               return do_nocb_deferred_wakeup_common(rdp);
+
+       return false;
 }
 
 void __init rcu_init_nohz(void)
@@ -2518,8 +2525,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data 
*rdp)
        return false;
 }
 
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 {
+       return false
 }
 
 static void rcu_spawn_cpu_nocb_kthread(int cpu)

Reply via email to