On Mon, Mar 30, 2026 at 06:10:44PM -0400, Aaron Tomlin wrote:
> +static bool blk_mq_validate(struct blk_mq_queue_map *qmap,
> +                         const struct cpumask *active_hctx)
> +{
> +     /*
> +      * Verify if the mapping is usable when housekeeping
> +      * configuration is enabled
> +      */
> +
> +     for (int queue = 0; queue < qmap->nr_queues; queue++) {
> +             int cpu;
> +
> +             if (cpumask_test_cpu(queue, active_hctx)) {
> +                     /*
> +                      * This htcx has at least one online CPU thus it

Typo, should say "hctx".

> +                      * is able to serve any assigned isolated CPU.
> +                      */
> +                     continue;
> +             }
> +
> +             /*
> +              * There is no housekeeping online CPU for this hctx, all
> +              * good as long as all non houskeeping CPUs are also

Typo, "housekeeping".

...

>  void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>  {
> -     const struct cpumask *masks;
> +     struct cpumask *masks __free(kfree) = NULL;
> +     const struct cpumask *constraint;
>       unsigned int queue, cpu, nr_masks;
> +     cpumask_var_t active_hctx;
>  
> -     masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
> -     if (!masks) {
> -             for_each_possible_cpu(cpu)
> -                     qmap->mq_map[cpu] = qmap->queue_offset;
> -             return;
> -     }
> +     if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
> +             goto fallback;
> +
> +     if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +             constraint = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +     else
> +             constraint = cpu_possible_mask;
> +
> +     /* Map CPUs to the hardware contexts (hctx) */
> +     masks = group_mask_cpus_evenly(qmap->nr_queues, constraint, &nr_masks);
> +     if (!masks)
> +             goto free_fallback;
>  
>       for (queue = 0; queue < qmap->nr_queues; queue++) {
> -             for_each_cpu(cpu, &masks[queue % nr_masks])
> -                     qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +             unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
> +
> +             for_each_cpu(cpu, &masks[idx]) {
> +                     qmap->mq_map[cpu] = idx;

I think there's something off with this when we have multiple queue maps. The
old code stored "qmap->queue_offset + queue", but now the modulo wraps idx
before it is stored, so the offset is lost when we've isolated CPUs, and I
think the index would end up incorrect.

Trying this series out with "nvme.poll_queues=2" and isolcpus set, I get a
kernel panic:

 nvme nvme0: 8/0/2 default/read/poll queues
 BUG: unable to handle page fault for address: ffff889101898da0
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 4e01067 P4D 4e01067 PUD 0
 Oops: Oops: 0000 [#1] SMP PTI
 CPU: 11 UID: 0 PID: 201 Comm: kworker/u64:19 Not tainted 7.0.0-rc4-00222-g065cad526374 #1586 PREEMPT
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014
 Workqueue: async async_run_entry_fn
 RIP: 0010:nvme_init_hctx_common+0x6f/0x190 [nvme]
 Code: 85 78 01 00 00 0f 85 86 00 00 00 45 8b b5 88 01 00 00 4c 89 f0 4d 89 f1 48 c1 e0 04 49 89 c7 4c 8d 94 03 38 0b 00 00 49 01 df <49> 83 bf 40 0b 00 00 00 74 64 44 89 d0 49 81 fa 00 f0 ff ff 77 27
 RSP: 0018:ffffc9000083ba90 EFLAGS: 00010286
 RAX: 0000000ffffffff0 RBX: ffff888101898270 RCX: ffffffffa008bd40
 RDX: 0000000000000008 RSI: ffff888101898270 RDI: ffff888101900800
 RBP: ffffc9000083bac8 R08: 0000000000000060 R09: 00000000ffffffff
 R10: ffff889101898d98 R11: ffff888101ddf000 R12: ffff8881087f36c0
 R13: ffff888101900800 R14: 00000000ffffffff R15: ffff889101898260
 FS:  0000000000000000(0000) GS:ffff8890bb50a000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffff889101898da0 CR3: 0000000101fe8001 CR4: 0000000000770ef0
 PKRU: 55555554
 Call Trace:
  <TASK>
  blk_mq_alloc_and_init_hctx+0x11e/0x3a0
  __blk_mq_realloc_hw_ctxs+0x185/0x220
  blk_mq_init_allocated_queue+0xeb/0x3b0
  ? percpu_ref_init+0x6a/0x130
  blk_mq_alloc_queue+0x7a/0xd0
  __blk_mq_alloc_disk+0x14/0x60
  nvme_alloc_ns+0xac/0xb30 [nvme_core]
  ? blk_mq_run_hw_queue+0x117/0x270
  nvme_scan_ns+0x279/0x350 [nvme_core]
  async_run_entry_fn+0x2e/0x130
  process_one_work+0x16c/0x3a0
  worker_thread+0x173/0x2e0
  ? __pfx_worker_thread+0x10/0x10
  kthread+0xe0/0x120
  ? __pfx_kthread+0x10/0x10
  ret_from_fork+0x207/0x270
  ? __pfx_kthread+0x10/0x10
  ret_from_fork_asm+0x1a/0x30
  </TASK>

Reply via email to