On Tue, May 26, 2026 at 08:21:37AM -0700, Boqun Feng wrote:
> From: Joel Fernandes <[email protected]>
> 
> Move NMI nesting tracking from the preempt_count bits to a separate per-CPU
> counter (nmi_nesting). This is to free up the NMI bits in the preempt_count,
> allowing those bits to be repurposed for other uses.
> 
> Reduce NMI_BITS from 4 to 1, using it only to detect if we're in an NMI.
> The per-CPU counter currently caps nesting at 15.
> 
> [boqun: Solve Steven Rostedt's comment on the BUG_ON() condition]
> 
> Suggested-by: Boqun Feng <[email protected]>
> Signed-off-by: Joel Fernandes <[email protected]>
> Signed-off-by: Lyude Paul <[email protected]>
> Signed-off-by: Boqun Feng <[email protected]>
> Link: https://patch.msgid.link/[email protected]
> ---
>  include/linux/hardirq.h                        | 17 +++++++++++++----
>  include/linux/preempt.h                        |  9 +++++++--
>  kernel/softirq.c                               |  2 ++
>  tools/testing/selftests/bpf/bpf_experimental.h |  2 +-
>  4 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index d57cab4d4c06..1a0360a1000f 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -10,6 +10,8 @@
>  #include <linux/vtime.h>
>  #include <asm/hardirq.h>
>  
> +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> +
>  extern void synchronize_irq(unsigned int irq);
>  extern bool synchronize_hardirq(unsigned int irq);
>  
> @@ -102,14 +104,17 @@ void irq_exit_rcu(void);
>   */
>  
>  /*
> - * nmi_enter() can nest up to 15 times; see NMI_BITS.
> + * nmi_enter() can nest - nesting is tracked in a per-CPU counter.
>   */
>  #define __nmi_enter()                                                \
>       do {                                                    \
>               lockdep_off();                                  \
>               arch_nmi_enter();                               \
> -             BUG_ON(in_nmi() == NMI_MASK);                   \
> -             __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);       \
> +             /* Maximum NMI nesting is 15. */                \
> +             BUG_ON(__this_cpu_read(nmi_nesting) >= 15);     \
> +             __this_cpu_inc(nmi_nesting);                    \
> +             __preempt_count_add(HARDIRQ_OFFSET);            \
> +             preempt_count_set(preempt_count() | NMI_MASK);  \
>       } while (0)
>  
>  #define nmi_enter()                                          \
> @@ -124,8 +129,12 @@ void irq_exit_rcu(void);
>  
>  #define __nmi_exit()                                         \
>       do {                                                    \
> +             unsigned int nesting;                           \
>               BUG_ON(!in_nmi());                              \
> -             __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);       \
> +             __preempt_count_sub(HARDIRQ_OFFSET);            \
> +             nesting = __this_cpu_dec_return(nmi_nesting);   \
> +             if (!nesting)                                   \
> +                     __preempt_count_sub(NMI_OFFSET);        \

We have an issue here in the following case:

  // nmi_nesting == 1
  __nmi_exit():
    ..
    nesting = __this_cpu_dec_return(nmi_nesting); // <- nesting == 0
    <another NMI comes>
      __nmi_enter()
      // nmi_nesting becomes 1
      __nmi_exit():
        nesting = __this_cpu_dec_return(nmi_nesting); // <- nesting == 0
        if (!nesting)
          __preempt_count_sub(NMI_OFFSET);
        // NMI_OFFSET bit is 0
    if (!nesting)
       __preempt_count_sub(NMI_OFFSET); // underflow!

I think we need to do:
      
#define __nmi_exit()                                            \
        do {                                                    \
                unsigned int nesting;                           \
                BUG_ON(!in_nmi());                              \
                __preempt_count_sub(HARDIRQ_OFFSET);            \
                nesting = __this_cpu_dec_return(nmi_nesting);   \
                if (!nesting)                                   \
                        preempt_count_set(preempt_count() & ~NMI_MASK); \
                arch_nmi_exit();                                \
                lockdep_on();                                   \
        } while (0)

@Joel, thoughts?

Similarly, we have this issue in patch #10 as well.

Regards,
Boqun

>               arch_nmi_exit();                                \
>               lockdep_on();                                   \
>       } while (0)
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index d964f965c8ff..586f96688325 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -17,6 +17,8 @@
>   *
>   * - bits 0-7 are the preemption count (max preemption depth: 256)
>   * - bits 8-15 are the softirq count (max # of softirqs: 256)
> + * - bits 16-19 are the hardirq count (max # of hardirqs: 16)
> + * - bit 20 is the NMI flag (no nesting count, tracked separately)
>   *
>   * The hardirq count could in theory be the same as the number of
>   * interrupts in the system, but we run all interrupt handlers with
> @@ -24,16 +26,19 @@
>   * there are a few palaeontologic drivers which reenable interrupts in
>   * the handler, so we need more than one bit here.
>   *
> + * NMI nesting depth is tracked in a separate per-CPU variable
> + * (nmi_nesting) to save bits in preempt_count.
> + *
>   *         PREEMPT_MASK:     0x000000ff
>   *         SOFTIRQ_MASK:     0x0000ff00
>   *         HARDIRQ_MASK:     0x000f0000
> - *             NMI_MASK:     0x00f00000
> + *             NMI_MASK:     0x00100000
>   * PREEMPT_NEED_RESCHED:     0x80000000
>   */
>  #define PREEMPT_BITS 8
>  #define SOFTIRQ_BITS 8
>  #define HARDIRQ_BITS 4
> -#define NMI_BITS     4
> +#define NMI_BITS     1
>  
>  #define PREEMPT_SHIFT        0
>  #define SOFTIRQ_SHIFT        (PREEMPT_SHIFT + PREEMPT_BITS)
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 4425d8dce44b..10af5ed859e7 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
>  EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
>  #endif
>  
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> +
>  /*
>   * SOFTIRQ_OFFSET usage:
>   *
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h 
> b/tools/testing/selftests/bpf/bpf_experimental.h
> index 2234bd6bc9d3..2d4256ff471f 100644
> --- a/tools/testing/selftests/bpf/bpf_experimental.h
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -449,7 +449,7 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, 
> const char *name__str,
>  #define PREEMPT_BITS 8
>  #define SOFTIRQ_BITS 8
>  #define HARDIRQ_BITS 4
> -#define NMI_BITS     4
> +#define NMI_BITS     1
>  
>  #define PREEMPT_SHIFT        0
>  #define SOFTIRQ_SHIFT        (PREEMPT_SHIFT + PREEMPT_BITS)
> -- 
> 2.50.1 (Apple Git-155)
> 

Reply via email to