On Mon, Sep 29, 2025 at 04:37:37PM +0200, Maciej Fijalkowski wrote: > On Mon, Sep 29, 2025 at 10:57:45AM +0200, Bastien Curutchet wrote: > > On 9/27/25 1:19 PM, Alexei Starovoitov wrote: > > > On Fri, Sep 26, 2025 at 12:47 PM Maciej Fijalkowski > > > <[email protected]> wrote: > > > > > > > > On Fri, Sep 26, 2025 at 08:39:28AM +0200, Bastien Curutchet wrote: > > > > > Hi Maciej, > > > > > > > > > > On 9/25/25 3:32 PM, Maciej Fijalkowski wrote: > > > > > > On Wed, Sep 24, 2025 at 04:49:39PM +0200, Bastien Curutchet (eBPF > > > > > > Foundation) wrote: > > > > > > > testapp_stats_rx_dropped() generates pkt_stream twice. The last > > > > > > > generated is released by pkt_stream_restore_default() at the end > > > > > > > of the > > > > > > > test but we lose the pointer of the first pkt_stream. > > > > > > > > > > > > > > Release the 'middle' pkt_stream when it's getting replaced to > > > > > > > prevent > > > > > > > memory leaks. > > > > > > > > > > > > > > Signed-off-by: Bastien Curutchet (eBPF Foundation) > > > > > > > <[email protected]> > > > > > > > --- > > > > > > > tools/testing/selftests/bpf/test_xsk.c | 7 +++++++ > > > > > > > 1 file changed, 7 insertions(+) > > > > > > > > > > > > > > diff --git a/tools/testing/selftests/bpf/test_xsk.c > > > > > > > b/tools/testing/selftests/bpf/test_xsk.c > > > > > > > index > > > > > > > 8d7c38eb32ca3537cb019f120c3350ebd9f8c6bc..eb18288ea1e4aa1c9337d16333b7174ecaed0999 > > > > > > > 100644 > > > > > > > --- a/tools/testing/selftests/bpf/test_xsk.c > > > > > > > +++ b/tools/testing/selftests/bpf/test_xsk.c > > > > > > > @@ -536,6 +536,13 @@ static void pkt_stream_receive_half(struct > > > > > > > test_spec *test) > > > > > > > struct pkt_stream *pkt_stream = > > > > > > > test->ifobj_tx->xsk->pkt_stream; > > > > > > > u32 i; > > > > > > > + if (test->ifobj_rx->xsk->pkt_stream != > > > > > > > test->rx_pkt_stream_default) > > > > > > > + /* Packet stream has already been replaced so we have > > > > > > > to release this one. > > > > > > > + * The newly created one will be freed by the > > > > > > > restore_default() at the > > > > > > > + * end of the test > > > > > > > + */ > > > > > > > + pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream); > > > > > > > > > > > > I don't see why this one is not addressed within test case > > > > > > (testapp_stats_rx_dropped()) and other fix is > > > > > > (testapp_xdp_shared_umem()). > > > > > > > > > > > > > > > > pkt_stream_receive_half() can be used by other tests. I thought it > > > > > would be > > > > > > > > So is pkt_stream_replace_half() and other routines that eventually call > > > > pkt_stream_generate() and overwrite the pkt_stream, right? > > > > > > > > It just feels odd to have a special treatment in one function and other > > > > are left as-is just because currently we don't have another abusive test > > > > case. > > > > > > > > Maybe it's enough of bike-shedding here, just wanted to clarify on my > > > > POV. > > > > > > > > In the end don't get me wrong here, this interface is a bit PITA for me > > > > and thanks for whole effort! > > > > > > My reading of this discussion that it doesn't block the series > > > and can be done in the follow up if necessary. > > > > > > So I was planning to apply it, but it found real bugs: > > > > > > ./test_progs -t xsk > > > [ 18.066989] bpf_testmod: loading out-of-tree module taints kernel. > > > [ 32.204881] BUG: Bad page state in process test_progs pfn:11c98b > > > [ 32.207167] page: refcount:0 mapcount:0 mapping:0000000000000000 > > > index:0x0 pfn:0x11c98b > > > [ 32.210084] flags: 0x1fffe0000000000(node=0|zone=1|lastcpupid=0x7fff) > > > [ 32.212493] raw: 01fffe0000000000 dead000000000040 ff11000123c9b000 > > > 0000000000000000 > > > [ 32.218056] raw: 0000000000000000 0000000000000001 00000000ffffffff > > > 0000000000000000 > > > [ 32.220900] page dumped because: page_pool leak > > > [ 32.222636] Modules linked in: bpf_testmod(O) bpf_preload > > > [ 32.224632] CPU: 6 UID: 0 PID: 3612 Comm: test_progs Tainted: G > > > O 6.17.0-rc5-gfec474d29325 #6969 PREEMPT > > > [ 32.224638] Tainted: [O]=OOT_MODULE > > > [ 32.224639] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), > > > BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 > > > [ 32.224641] Call Trace: > > > [ 32.224644] <IRQ> > > > [ 32.224646] dump_stack_lvl+0x4b/0x70 > > > [ 32.224653] bad_page.cold+0xbd/0xe0 > > > [ 32.224657] __free_frozen_pages+0x838/0x10b0 > > > [ 32.224660] ? skb_pp_cow_data+0x782/0xc30 > > > [ 32.224665] bpf_xdp_shrink_data+0x221/0x530 > > > [ 32.224668] ? skb_pp_cow_data+0x6d1/0xc30 > > > [ 32.224671] bpf_xdp_adjust_tail+0x598/0x810 > > > [ 32.224673] ? xsk_destruct_skb+0x321/0x800 > > > [ 32.224678] bpf_prog_004ac6bb21de57a7_xsk_xdp_adjust_tail+0x52/0xd6 > > > [ 32.224681] veth_xdp_rcv_skb+0x45d/0x15a0 > > > [ 32.224684] ? get_stack_info_noinstr+0x16/0xe0 > > > [ 32.224688] ? veth_set_channels+0x920/0x920 > > > [ 32.224691] ? get_stack_info+0x2f/0x80 > > > [ 32.224693] ? unwind_next_frame+0x3af/0x1df0 > > > [ 32.224697] veth_xdp_rcv.constprop.0+0x38a/0xbe0 > > > [ 32.224700] ? common_startup_64+0x13e/0x148 > > > [ 32.224703] ? veth_xdp_rcv_one+0xcd0/0xcd0 > > > [ 32.224706] ? stack_trace_save+0x84/0xa0 > > > [ 32.224709] ? stack_depot_save_flags+0x28/0x820 > > > [ 32.224713] ? __resched_curr.constprop.0+0x332/0x3b0 > > > [ 32.224716] ? timerqueue_add+0x217/0x320 > > > [ 32.224719] veth_poll+0x115/0x5e0 > > > [ 32.224722] ? veth_xdp_rcv.constprop.0+0xbe0/0xbe0 > > > [ 32.224726] ? update_load_avg+0x1cb/0x12d0 > > > [ 32.224730] ? update_cfs_group+0x121/0x2c0 > > > [ 32.224733] __napi_poll+0xa0/0x420 > > > [ 32.224736] net_rx_action+0x901/0xe90 > > > [ 32.224740] ? run_backlog_napi+0x50/0x50 > > > [ 32.224743] ? clockevents_program_event+0x1cc/0x280 > > > [ 32.224746] ? hrtimer_interrupt+0x31e/0x7c0 > > > [ 32.224749] handle_softirqs+0x151/0x430 > > > [ 32.224752] do_softirq+0x3f/0x60 > > > [ 32.224755] </IRQ> > > > [ 32.224756] <TASK> > > > [ 32.224757] __local_bh_enable_ip+0x58/0x60 > > > [ 32.224759] __dev_direct_xmit+0x295/0x540 > > > [ 32.224762] __xsk_generic_xmit+0x180a/0x2df0 > > > [ 32.224764] ? ___kmalloc_large_node+0xdf/0x130 > > > [ 32.224767] ? __mutex_unlock_slowpath.isra.0+0x330/0x330 > > > [ 32.224770] ? __rtnl_unlock+0x65/0xd0 > > > [ 32.224773] ? xsk_create+0x700/0x700 > > > [ 32.224774] ? netdev_run_todo+0xce/0xbe0 > > > [ 32.224777] ? _raw_spin_lock_irqsave+0x7b/0xc0 > > > [ 32.224780] xsk_sendmsg+0x365/0x770 > > > [ 32.224782] ? xsk_poll+0x640/0x640 > > > [ 32.224783] __sock_sendmsg+0xc1/0x150 > > > [ 32.224787] __sys_sendto+0x1d0/0x260 > > > [ 32.224790] ? __ia32_sys_getpeername+0xb0/0xb0 > > > [ 32.224793] ? fput+0x29/0x80 > > > [ 32.224796] ? __sys_bind+0x187/0x1c0 > > > [ 32.224798] ? __sys_bind_socket+0x90/0x90 > > > [ 32.224801] ? randomize_page+0x60/0x60 > > > [ 32.224804] ? fget+0x18e/0x230 > > > [ 32.224807] __x64_sys_sendto+0xe0/0x1b0 > > > [ 32.224810] ? fpregs_assert_state_consistent+0x57/0xe0 > > > [ 32.224812] do_syscall_64+0x46/0x180 > > > [ 32.224815] entry_SYSCALL_64_after_hwframe+0x4b/0x53 > > > > > > and at the end: > > > > > > # ERROR: [receive_pkts] Receive loop timed out > > > test_xsk:FAIL:Run test unexpected error: -1 (errno 12) > > > #251/32 ns_xsk_drv/XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF:FAIL > > > #251 ns_xsk_drv:FAIL > > > Summary: 1/67 PASSED, 0 SKIPPED, 1 FAILED > > > > > > [ 99.308243] page_pool_release_retry() stalled pool shutdown: id > > > 185, 48 inflight 60 sec > > > [ 159.724173] page_pool_release_retry() stalled pool shutdown: id > > > 185, 48 inflight 120 sec > > > > > > > > > The test is great and the work to make it run as part of test_progs > > > paid off big time. > > > > > > But we cannot enable it by default, since it will be crashing CI VMs. > > > > > > Please reproduce the above issue. > > > You might need > > > CONFIG_DEBUG_VM=y > > > and other mm debug flags. > > > > > > > I did reproduce the issue with CONFIG_DEBUG_VM=y > > > > > If the fix can be done quickly let's land the fix first. > > > If not, please respin the series, but disable the test by default > > > until the bug is fixed. > > > > I won't have much time this week to investigate this further, so I'll respin > > the series with this test in the 'flaky table'. > > I'll take a look at the splat then.
Fix is two-fold and needs to be done in generic xdp hook and in veth itself, it is aligned with what we discussed with Jakub at https://lore.kernel.org/netdev/[email protected]/ When skb_pp_cow_data() gives us skb backed by system page pool, this needs to be reflected on rxq.mem.type so that helpers when releasing a frag will use correct arguments in __xdp_return(). Below is a quick and dirty diff that silenced the splat on my side, let me think a bit how this could be polished. diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e..cabd2dda8e58 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -791,6 +791,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct veth_stats *stats) { void *orig_data, *orig_data_end; + struct xdp_rxq_info rxq = {}; struct bpf_prog *xdp_prog; struct veth_xdp_buff vxbuf; struct xdp_buff *xdp = &vxbuf.xdp; @@ -811,6 +812,12 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, goto drop; vxbuf.skb = skb; + if (skb->pp_recycle) { + rxq.dev = skb->dev; + rxq.mem.type = MEM_TYPE_PAGE_POOL; + xdp->rxq = &rxq; + } + orig_data = xdp->data; orig_data_end = xdp->data_end; diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..233b2fc424db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5206,34 +5206,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, return NET_RX_DROP; } -static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct netdev_rx_queue *rxqueue; - - rxqueue = dev->_rx; - - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - - if (unlikely(index >= dev->real_num_rx_queues)) { - WARN_ONCE(dev->real_num_rx_queues > 1, - "%s received packet on queue %u, but number " - "of RX queues is %u\n", - dev->name, index, dev->real_num_rx_queues); - - return rxqueue; /* Return first rxqueue */ - } - rxqueue += index; - } - return rxqueue; -} - u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; - struct netdev_rx_queue *rxqueue; + struct xdp_rxq_info rxq = {}; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; @@ -5251,8 +5228,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, frame_sz = (void *)skb_end_pointer(skb) - hard_start; frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - rxqueue = netif_get_rxqueue(skb); - xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + rxq.dev = skb->dev; + rxq.mem.type = skb->pp_recycle ? MEM_TYPE_PAGE_POOL : + MEM_TYPE_PAGE_SHARED; + + xdp_init_buff(xdp, frame_sz, &rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); if (skb_is_nonlinear(skb)) { > > > > > Best regards, > > -- > > Bastien Curutchet, Bootlin > > Embedded Linux and Kernel engineering > > https://bootlin.com > >

