* Paul E. McKenney <[email protected]> wrote:

> There is some confusion as to which of cond_resched() or
> cond_resched_rcu_qs() should be added to long in-kernel loops.
> This commit therefore eliminates the decision by adding RCU quiescent
> states to cond_resched().  This commit also simplifies the code that
> used to interact with cond_resched_rcu_qs(), and that now interacts with
> cond_resched(), to reduce its overhead.  This reduction is necessary to
> allow the heavier-weight cond_resched_rcu_qs() mechanism to be invoked
> everywhere that cond_resched() is invoked.
> 
> Part of that reduction in overhead converts the jiffies_till_sched_qs
> kernel parameter to read-only at runtime, thus eliminating the need for
> bounds checking.
> 
> Reported-by: Michal Hocko <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> [ paulmck: Tuning for performance issues reported by 0day Test Robot. ]
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8337e2db0bb2..d2f291a3a44a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1522,10 +1522,11 @@ static inline int test_tsk_need_resched(struct 
> task_struct *tsk)
>   * cond_resched_lock() will drop the spinlock before scheduling,
>   * cond_resched_softirq() will enable bhs before scheduling.
>   */
> +void rcu_all_qs(void);
>  #ifndef CONFIG_PREEMPT
>  extern int _cond_resched(void);
>  #else
> -static inline int _cond_resched(void) { return 0; }
> +static inline int _cond_resched(void) { rcu_all_qs(); return 0; }
>  #endif
>  
>  #define cond_resched() ({                    \
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 51d4c3acf32d..e40cb5190783 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -537,8 +537,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
>   * How long the grace period must be before we start recruiting
>   * quiescent-state help from rcu_note_context_switch().
>   */
> -static ulong jiffies_till_sched_qs = HZ / 20;
> -module_param(jiffies_till_sched_qs, ulong, 0644);
> +static ulong jiffies_till_sched_qs = HZ / 10;
> +module_param(jiffies_till_sched_qs, ulong, 0444);
>  
>  static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node 
> *rnp,
>                                 struct rcu_data *rdp);
> @@ -1230,7 +1230,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data 
> *rdp)
>       unsigned long jtsq;
>       bool *rnhqp;
>       bool *ruqp;
> -     unsigned long rjtsc;
>       struct rcu_node *rnp;
>  
>       /*
> @@ -1247,23 +1246,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data 
> *rdp)
>               return 1;
>       }
>  
> -     /* Compute and saturate jiffies_till_sched_qs. */
> -     jtsq = jiffies_till_sched_qs;
> -     rjtsc = rcu_jiffies_till_stall_check();
> -     if (jtsq > rjtsc / 2) {
> -             WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
> -             jtsq = rjtsc / 2;
> -     } else if (jtsq < 1) {
> -             WRITE_ONCE(jiffies_till_sched_qs, 1);
> -             jtsq = 1;
> -     }
> -
>       /*
>        * Has this CPU encountered a cond_resched_rcu_qs() since the
>        * beginning of the grace period?  For this to be the case,
>        * the CPU has to have noticed the current grace period.  This
>        * might not be the case for nohz_full CPUs looping in the kernel.
>        */
> +     jtsq = jiffies_till_sched_qs;
>       rnp = rdp->mynode;
>       ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
>       if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
> @@ -1271,7 +1260,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data 
> *rdp)
>           READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
>               trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
>               return 1;
> -     } else {
> +     } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
>               /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
>               smp_store_release(ruqp, true);
>       }
> @@ -1299,10 +1288,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data 
> *rdp)
>        * updates are only once every few jiffies, the probability of
>        * lossage (and thus of slight grace-period extension) is
>        * quite low.
> -      *
> -      * Note that if the jiffies_till_sched_qs boot/sysfs parameter
> -      * is set too high, we override with half of the RCU CPU stall
> -      * warning delay.
>        */
>       rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
>       if (!READ_ONCE(*rnhqp) &&
> @@ -1311,7 +1296,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data 
> *rdp)
>               WRITE_ONCE(*rnhqp, true);
>               /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
>               smp_store_release(ruqp, true);
> -             rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
> +             rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
>       }
>  
>       /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 17c667b427b4..9433633012ba 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4808,6 +4808,7 @@ int __sched _cond_resched(void)
>               preempt_schedule_common();
>               return 1;
>       }
> +     rcu_all_qs();
>       return 0;
>  }
>  EXPORT_SYMBOL(_cond_resched);

So I'm a bit uneasy about this change:

- There are hundreds of uses of cond_resched(), some of them in commonly inlined
  functions.

- cond_resched() typically gets called in functions that _might_ take a long
  time to execute, but that's not a given.

- It's definitely getting called opportunistically as well, under
  PREEMPT_VOLUNTARY, from common lightweight helpers that we know are in
  schedulable contexts. We risk adding significant overhead here.

So what we risk here is turning a known-to-be-super-simple function into
something much slower - and exporting slowdowns to literally thousands of
explicit and implicit usage sites.

Thanks,

        Ingo

Reply via email to