On Mon, Sep 29, 2025 at 04:37:37PM +0200, Maciej Fijalkowski wrote:
> On Mon, Sep 29, 2025 at 10:57:45AM +0200, Bastien Curutchet wrote:
> > On 9/27/25 1:19 PM, Alexei Starovoitov wrote:
> > > On Fri, Sep 26, 2025 at 12:47 PM Maciej Fijalkowski
> > > <[email protected]> wrote:
> > > > 
> > > > On Fri, Sep 26, 2025 at 08:39:28AM +0200, Bastien Curutchet wrote:
> > > > > Hi Maciej,
> > > > > 
> > > > > On 9/25/25 3:32 PM, Maciej Fijalkowski wrote:
> > > > > > On Wed, Sep 24, 2025 at 04:49:39PM +0200, Bastien Curutchet (eBPF 
> > > > > > Foundation) wrote:
> > > > > > > testapp_stats_rx_dropped() generates pkt_stream twice. The last
> > > > > > > generated is released by pkt_stream_restore_default() at the end 
> > > > > > > of the
> > > > > > > test but we lose the pointer of the first pkt_stream.
> > > > > > > 
> > > > > > > Release the 'middle' pkt_stream when it's getting replaced to 
> > > > > > > prevent
> > > > > > > memory leaks.
> > > > > > > 
> > > > > > > Signed-off-by: Bastien Curutchet (eBPF Foundation) 
> > > > > > > <[email protected]>
> > > > > > > ---
> > > > > > >    tools/testing/selftests/bpf/test_xsk.c | 7 +++++++
> > > > > > >    1 file changed, 7 insertions(+)
> > > > > > > 
> > > > > > > diff --git a/tools/testing/selftests/bpf/test_xsk.c 
> > > > > > > b/tools/testing/selftests/bpf/test_xsk.c
> > > > > > > index 
> > > > > > > 8d7c38eb32ca3537cb019f120c3350ebd9f8c6bc..eb18288ea1e4aa1c9337d16333b7174ecaed0999
> > > > > > >  100644
> > > > > > > --- a/tools/testing/selftests/bpf/test_xsk.c
> > > > > > > +++ b/tools/testing/selftests/bpf/test_xsk.c
> > > > > > > @@ -536,6 +536,13 @@ static void pkt_stream_receive_half(struct 
> > > > > > > test_spec *test)
> > > > > > >            struct pkt_stream *pkt_stream = 
> > > > > > > test->ifobj_tx->xsk->pkt_stream;
> > > > > > >            u32 i;
> > > > > > > + if (test->ifobj_rx->xsk->pkt_stream != 
> > > > > > > test->rx_pkt_stream_default)
> > > > > > > +         /* Packet stream has already been replaced so we have 
> > > > > > > to release this one.
> > > > > > > +          * The newly created one will be freed by the 
> > > > > > > restore_default() at the
> > > > > > > +          * end of the test
> > > > > > > +          */
> > > > > > > +         pkt_stream_delete(test->ifobj_rx->xsk->pkt_stream);
> > > > > > 
> > > > > > I don't see why this one is not addressed within test case
> > > > > > (testapp_stats_rx_dropped()) and other fix is 
> > > > > > (testapp_xdp_shared_umem()).
> > > > > > 
> > > > > 
> > > > > pkt_stream_receive_half() can be used by other tests. I thought it 
> > > > > would be
> > > > 
> > > > So is pkt_stream_replace_half() and other routines that eventually call
> > > > pkt_stream_generate() and overwrite the pkt_stream, right?
> > > > 
> > > > It just feels odd to have a special treatment in one function and other
> > > > are left as-is just because currently we don't have another abusive test
> > > > case.
> > > > 
> > > > Maybe it's enough of bike-shedding here, just wanted to clarify on my 
> > > > POV.
> > > > 
> > > > In the end don't get me wrong here, this interface is a bit PITA for me
> > > > and thanks for whole effort!
> > > 
> > > My reading of this discussion that it doesn't block the series
> > > and can be done in the follow up if necessary.
> > > 
> > > So I was planning to apply it, but it found real bugs:
> > > 
> > > ./test_progs -t xsk
> > > [   18.066989] bpf_testmod: loading out-of-tree module taints kernel.
> > > [   32.204881] BUG: Bad page state in process test_progs  pfn:11c98b
> > > [   32.207167] page: refcount:0 mapcount:0 mapping:0000000000000000
> > > index:0x0 pfn:0x11c98b
> > > [   32.210084] flags: 0x1fffe0000000000(node=0|zone=1|lastcpupid=0x7fff)
> > > [   32.212493] raw: 01fffe0000000000 dead000000000040 ff11000123c9b000
> > > 0000000000000000
> > > [   32.218056] raw: 0000000000000000 0000000000000001 00000000ffffffff
> > > 0000000000000000
> > > [   32.220900] page dumped because: page_pool leak
> > > [   32.222636] Modules linked in: bpf_testmod(O) bpf_preload
> > > [   32.224632] CPU: 6 UID: 0 PID: 3612 Comm: test_progs Tainted: G
> > >        O        6.17.0-rc5-gfec474d29325 #6969 PREEMPT
> > > [   32.224638] Tainted: [O]=OOT_MODULE
> > > [   32.224639] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> > > BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
> > > [   32.224641] Call Trace:
> > > [   32.224644]  <IRQ>
> > > [   32.224646]  dump_stack_lvl+0x4b/0x70
> > > [   32.224653]  bad_page.cold+0xbd/0xe0
> > > [   32.224657]  __free_frozen_pages+0x838/0x10b0
> > > [   32.224660]  ? skb_pp_cow_data+0x782/0xc30
> > > [   32.224665]  bpf_xdp_shrink_data+0x221/0x530
> > > [   32.224668]  ? skb_pp_cow_data+0x6d1/0xc30
> > > [   32.224671]  bpf_xdp_adjust_tail+0x598/0x810
> > > [   32.224673]  ? xsk_destruct_skb+0x321/0x800
> > > [   32.224678]  bpf_prog_004ac6bb21de57a7_xsk_xdp_adjust_tail+0x52/0xd6
> > > [   32.224681]  veth_xdp_rcv_skb+0x45d/0x15a0
> > > [   32.224684]  ? get_stack_info_noinstr+0x16/0xe0
> > > [   32.224688]  ? veth_set_channels+0x920/0x920
> > > [   32.224691]  ? get_stack_info+0x2f/0x80
> > > [   32.224693]  ? unwind_next_frame+0x3af/0x1df0
> > > [   32.224697]  veth_xdp_rcv.constprop.0+0x38a/0xbe0
> > > [   32.224700]  ? common_startup_64+0x13e/0x148
> > > [   32.224703]  ? veth_xdp_rcv_one+0xcd0/0xcd0
> > > [   32.224706]  ? stack_trace_save+0x84/0xa0
> > > [   32.224709]  ? stack_depot_save_flags+0x28/0x820
> > > [   32.224713]  ? __resched_curr.constprop.0+0x332/0x3b0
> > > [   32.224716]  ? timerqueue_add+0x217/0x320
> > > [   32.224719]  veth_poll+0x115/0x5e0
> > > [   32.224722]  ? veth_xdp_rcv.constprop.0+0xbe0/0xbe0
> > > [   32.224726]  ? update_load_avg+0x1cb/0x12d0
> > > [   32.224730]  ? update_cfs_group+0x121/0x2c0
> > > [   32.224733]  __napi_poll+0xa0/0x420
> > > [   32.224736]  net_rx_action+0x901/0xe90
> > > [   32.224740]  ? run_backlog_napi+0x50/0x50
> > > [   32.224743]  ? clockevents_program_event+0x1cc/0x280
> > > [   32.224746]  ? hrtimer_interrupt+0x31e/0x7c0
> > > [   32.224749]  handle_softirqs+0x151/0x430
> > > [   32.224752]  do_softirq+0x3f/0x60
> > > [   32.224755]  </IRQ>
> > > [   32.224756]  <TASK>
> > > [   32.224757]  __local_bh_enable_ip+0x58/0x60
> > > [   32.224759]  __dev_direct_xmit+0x295/0x540
> > > [   32.224762]  __xsk_generic_xmit+0x180a/0x2df0
> > > [   32.224764]  ? ___kmalloc_large_node+0xdf/0x130
> > > [   32.224767]  ? __mutex_unlock_slowpath.isra.0+0x330/0x330
> > > [   32.224770]  ? __rtnl_unlock+0x65/0xd0
> > > [   32.224773]  ? xsk_create+0x700/0x700
> > > [   32.224774]  ? netdev_run_todo+0xce/0xbe0
> > > [   32.224777]  ? _raw_spin_lock_irqsave+0x7b/0xc0
> > > [   32.224780]  xsk_sendmsg+0x365/0x770
> > > [   32.224782]  ? xsk_poll+0x640/0x640
> > > [   32.224783]  __sock_sendmsg+0xc1/0x150
> > > [   32.224787]  __sys_sendto+0x1d0/0x260
> > > [   32.224790]  ? __ia32_sys_getpeername+0xb0/0xb0
> > > [   32.224793]  ? fput+0x29/0x80
> > > [   32.224796]  ? __sys_bind+0x187/0x1c0
> > > [   32.224798]  ? __sys_bind_socket+0x90/0x90
> > > [   32.224801]  ? randomize_page+0x60/0x60
> > > [   32.224804]  ? fget+0x18e/0x230
> > > [   32.224807]  __x64_sys_sendto+0xe0/0x1b0
> > > [   32.224810]  ? fpregs_assert_state_consistent+0x57/0xe0
> > > [   32.224812]  do_syscall_64+0x46/0x180
> > > [   32.224815]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
> > > 
> > > and at the end:
> > > 
> > > # ERROR: [receive_pkts] Receive loop timed out
> > > test_xsk:FAIL:Run test unexpected error: -1 (errno 12)
> > > #251/32  ns_xsk_drv/XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF:FAIL
> > > #251     ns_xsk_drv:FAIL
> > > Summary: 1/67 PASSED, 0 SKIPPED, 1 FAILED
> > > 
> > > [   99.308243] page_pool_release_retry() stalled pool shutdown: id
> > > 185, 48 inflight 60 sec
> > > [  159.724173] page_pool_release_retry() stalled pool shutdown: id
> > > 185, 48 inflight 120 sec
> > > 
> > > 
> > > The test is great and the work to make it run as part of test_progs
> > > paid off big time.
> > > 
> > > But we cannot enable it by default, since it will be crashing CI VMs.
> > > 
> > > Please reproduce the above issue.
> > > You might need
> > > CONFIG_DEBUG_VM=y
> > > and other mm debug flags.
> > > 
> > 
> > I did reproduce the issue with CONFIG_DEBUG_VM=y
> > 
> > > If the fix can be done quickly let's land the fix first.
> > > If not, please respin the series, but disable the test by default
> > > until the bug is fixed.
> > 
> > I won't have much time this week to investigate this further, so I'll respin
> > the series with this test in the 'flaky table'.
> 
> I'll take a look at the splat then.

Fix is two-fold and needs to be done in generic xdp hook and in veth
itself, it is aligned with what we discussed with Jakub at
https://lore.kernel.org/netdev/[email protected]/

When skb_pp_cow_data() gives us skb backed by system page pool, this needs
to be reflected on rxq.mem.type so that helpers when releasing a frag will
use correct arguments in __xdp_return().

Below is a quick and dirty diff that silenced the splat on my side, let me
think a bit how this could be polished.

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a3046142cb8e..cabd2dda8e58 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -791,6 +791,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
                                        struct veth_stats *stats)
 {
        void *orig_data, *orig_data_end;
+       struct xdp_rxq_info rxq = {};
        struct bpf_prog *xdp_prog;
        struct veth_xdp_buff vxbuf;
        struct xdp_buff *xdp = &vxbuf.xdp;
@@ -811,6 +812,12 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
                goto drop;
        vxbuf.skb = skb;
 
+       if (skb->pp_recycle) {
+               rxq.dev = skb->dev;
+               rxq.mem.type = MEM_TYPE_PAGE_POOL;
+               xdp->rxq = &rxq;
+       }
+
        orig_data = xdp->data;
        orig_data_end = xdp->data_end;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 93a25d87b86b..233b2fc424db 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5206,34 +5206,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int 
cpu,
        return NET_RX_DROP;
 }
 
-static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
-{
-       struct net_device *dev = skb->dev;
-       struct netdev_rx_queue *rxqueue;
-
-       rxqueue = dev->_rx;
-
-       if (skb_rx_queue_recorded(skb)) {
-               u16 index = skb_get_rx_queue(skb);
-
-               if (unlikely(index >= dev->real_num_rx_queues)) {
-                       WARN_ONCE(dev->real_num_rx_queues > 1,
-                                 "%s received packet on queue %u, but number "
-                                 "of RX queues is %u\n",
-                                 dev->name, index, dev->real_num_rx_queues);
-
-                       return rxqueue; /* Return first rxqueue */
-               }
-               rxqueue += index;
-       }
-       return rxqueue;
-}
-
 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             const struct bpf_prog *xdp_prog)
 {
        void *orig_data, *orig_data_end, *hard_start;
-       struct netdev_rx_queue *rxqueue;
+       struct xdp_rxq_info rxq = {};
        bool orig_bcast, orig_host;
        u32 mac_len, frame_sz;
        __be16 orig_eth_type;
@@ -5251,8 +5228,11 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct 
xdp_buff *xdp,
        frame_sz = (void *)skb_end_pointer(skb) - hard_start;
        frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-       rxqueue = netif_get_rxqueue(skb);
-       xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
+       rxq.dev = skb->dev;
+       rxq.mem.type = skb->pp_recycle ? MEM_TYPE_PAGE_POOL :
+                                        MEM_TYPE_PAGE_SHARED;
+
+       xdp_init_buff(xdp, frame_sz, &rxq);
        xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                         skb_headlen(skb) + mac_len, true);
        if (skb_is_nonlinear(skb)) {

> 
> > 
> > Best regards,
> > -- 
> > Bastien Curutchet, Bootlin
> > Embedded Linux and Kernel engineering
> > https://bootlin.com
> > 

Reply via email to