On Tue, May 26, 2026 at 08:21:37AM -0700, Boqun Feng wrote:
> From: Joel Fernandes <[email protected]>
>
> Move NMI nesting tracking from the preempt_count bits to a separate per-CPU
> counter (nmi_nesting). This is to free up the NMI bits in the preempt_count,
> allowing those bits to be repurposed for other uses.
>
> Reduce NMI_BITS from 4 to 1, using it only to detect if we're in an NMI.
> The per-CPU counter currently caps nesting at 15.
>
> [boqun: Solve Steven Rostedt's comment on the BUG_ON() condition]
>
> Suggested-by: Boqun Feng <[email protected]>
> Signed-off-by: Joel Fernandes <[email protected]>
> Signed-off-by: Lyude Paul <[email protected]>
> Signed-off-by: Boqun Feng <[email protected]>
> Link: https://patch.msgid.link/[email protected]
> ---
> include/linux/hardirq.h | 17 +++++++++++++----
> include/linux/preempt.h | 9 +++++++--
> kernel/softirq.c | 2 ++
> tools/testing/selftests/bpf/bpf_experimental.h | 2 +-
> 4 files changed, 23 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index d57cab4d4c06..1a0360a1000f 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -10,6 +10,8 @@
> #include <linux/vtime.h>
> #include <asm/hardirq.h>
>
> +DECLARE_PER_CPU(unsigned int, nmi_nesting);
> +
> extern void synchronize_irq(unsigned int irq);
> extern bool synchronize_hardirq(unsigned int irq);
>
> @@ -102,14 +104,17 @@ void irq_exit_rcu(void);
> */
>
> /*
> - * nmi_enter() can nest up to 15 times; see NMI_BITS.
> + * nmi_enter() can nest - nesting is tracked in a per-CPU counter.
> */
> #define __nmi_enter() \
> do { \
> lockdep_off(); \
> arch_nmi_enter(); \
> - BUG_ON(in_nmi() == NMI_MASK); \
> - __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
> + /* Maximum NMI nesting is 15. */ \
> + BUG_ON(__this_cpu_read(nmi_nesting) >= 15); \
> + __this_cpu_inc(nmi_nesting); \
> + __preempt_count_add(HARDIRQ_OFFSET); \
> + preempt_count_set(preempt_count() | NMI_MASK); \
> } while (0)
>
> #define nmi_enter() \
> @@ -124,8 +129,12 @@ void irq_exit_rcu(void);
>
> #define __nmi_exit() \
> do { \
> + unsigned int nesting; \
> BUG_ON(!in_nmi()); \
> - __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
> + __preempt_count_sub(HARDIRQ_OFFSET); \
> + nesting = __this_cpu_dec_return(nmi_nesting); \
> + if (!nesting) \
> + __preempt_count_sub(NMI_OFFSET); \
We have an issue here in the following case:
// nmi_nesting == 1
__nmi_exit():
..
nesting = __this_cpu_dec_return(nmi_nesting); // <- nesting == 0
<another NMI comes>
__nmi_enter()
// nmi_nesting becomes 1
__nmi_exit():
nesting = __this_cpu_dec_return(nmi_nesting); // <- nesting == 0
if (!nesting)
__preempt_count_sub(NMI_OFFSET);
// NMI_OFFSET bit is 0
if (!nesting)
__preempt_count_sub(NMI_OFFSET); // underflow!
I think we need to do:
#define __nmi_exit() \
do { \
unsigned int nesting; \
BUG_ON(!in_nmi()); \
__preempt_count_sub(HARDIRQ_OFFSET); \
nesting = __this_cpu_dec_return(nmi_nesting); \
if (!nesting) \
preempt_count_set(preempt_count() & ~NMI_MASK); \
arch_nmi_exit(); \
lockdep_on(); \
} while (0)
@Joel, thoughts?
Similarly, we have this issue in patch #10 as well.
Regards,
Boqun
> arch_nmi_exit(); \
> lockdep_on(); \
> } while (0)
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index d964f965c8ff..586f96688325 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -17,6 +17,8 @@
> *
> * - bits 0-7 are the preemption count (max preemption depth: 256)
> * - bits 8-15 are the softirq count (max # of softirqs: 256)
> + * - bits 16-19 are the hardirq count (max # of hardirqs: 16)
> + * - bit 20 is the NMI flag (no nesting count, tracked separately)
> *
> * The hardirq count could in theory be the same as the number of
> * interrupts in the system, but we run all interrupt handlers with
> @@ -24,16 +26,19 @@
> * there are a few palaeontologic drivers which reenable interrupts in
> * the handler, so we need more than one bit here.
> *
> + * NMI nesting depth is tracked in a separate per-CPU variable
> + * (nmi_nesting) to save bits in preempt_count.
> + *
> * PREEMPT_MASK: 0x000000ff
> * SOFTIRQ_MASK: 0x0000ff00
> * HARDIRQ_MASK: 0x000f0000
> - * NMI_MASK: 0x00f00000
> + * NMI_MASK: 0x00100000
> * PREEMPT_NEED_RESCHED: 0x80000000
> */
> #define PREEMPT_BITS 8
> #define SOFTIRQ_BITS 8
> #define HARDIRQ_BITS 4
> -#define NMI_BITS 4
> +#define NMI_BITS 1
>
> #define PREEMPT_SHIFT 0
> #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 4425d8dce44b..10af5ed859e7 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
> EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
> #endif
>
> +DEFINE_PER_CPU(unsigned int, nmi_nesting);
> +
> /*
> * SOFTIRQ_OFFSET usage:
> *
> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h
> b/tools/testing/selftests/bpf/bpf_experimental.h
> index 2234bd6bc9d3..2d4256ff471f 100644
> --- a/tools/testing/selftests/bpf/bpf_experimental.h
> +++ b/tools/testing/selftests/bpf/bpf_experimental.h
> @@ -449,7 +449,7 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup,
> const char *name__str,
> #define PREEMPT_BITS 8
> #define SOFTIRQ_BITS 8
> #define HARDIRQ_BITS 4
> -#define NMI_BITS 4
> +#define NMI_BITS 1
>
> #define PREEMPT_SHIFT 0
> #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
> --
> 2.50.1 (Apple Git-155)
>