On Fri, Mar 23, 2018 at 5:19 AM, Jesper Dangaard Brouer <bro...@redhat.com> wrote:
> Changing the API xdp_return_frame() to take struct xdp_frame as
> argument seems like a natural choice. But there are some subtle
> performance details here that need extra care, and the handling
> below is a deliberate choice.
>
> De-referencing the xdp_frame on a remote CPU during DMA-TX
> completion results in its cache line changing to "Shared" state.
> Later, when the page is reused for RX, this xdp_frame cache line is
> written, which changes the state to "Modified".
>
> This situation already happens (naturally) for virtio_net, tun and
> cpumap, as the xdp_frame pointer is the queued object. In tun and
> cpumap, the ptr_ring is used for efficiently transferring cache
> lines (with pointers) between CPUs. Thus, the only option is to
> de-reference the xdp_frame.
>
> Only the ixgbe driver had an optimization that could avoid this
> de-reference of the xdp_frame. The driver already has a TX-ring
> queue, which (in case of remote DMA-TX completion) has to be
> transferred between CPUs anyhow. In this data area, we stored a
> struct xdp_mem_info and a data pointer, which allowed us to avoid
> de-referencing the xdp_frame.
>
> To compensate for this, a prefetchw is used to tell the cache
> coherency protocol about our access pattern. My benchmarks show
> that this prefetchw is enough to compensate in the ixgbe driver.
>
> Signed-off-by: Jesper Dangaard Brouer <bro...@redhat.com>
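For readers without the tree handy, the changelog's cache-line
argument hinges on where the frame metadata lives: convert_to_xdp_frame()
stores the struct xdp_frame in the top of the packet headroom, so the
cache line at data_hard_start is exactly the one written when the page
is recycled for RX. A simplified sketch (metasize handling dropped;
see include/net/xdp.h in this series for the authoritative version):

    struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
    {
            struct xdp_frame *xdp_frame;
            int headroom = xdp->data - xdp->data_hard_start;

            /* The metadata must fit in the packet headroom */
            if (unlikely(headroom < sizeof(*xdp_frame)))
                    return NULL;

            /* Store info in the top of the packet, i.e. the cache
             * line that the new prefetchw calls warm up.
             */
            xdp_frame = xdp->data_hard_start;
            xdp_frame->data = xdp->data;
            xdp_frame->len  = xdp->data_end - xdp->data;
            xdp_frame->headroom = headroom - sizeof(*xdp_frame);
            /* rxq is only valid during NAPI; copy mem info into
             * the frame so completion can run on a remote CPU.
             */
            xdp_frame->mem = xdp->rxq->mem;

            return xdp_frame;
    }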
I'm really not a fan of this patch. It seems like it is adding a ton
of overhead for one case that is going to penalize all of the other
use cases for XDP. Just the extra prefetch is going to have a
significant impact on things like the XDP_DROP case.

> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h        |  4 +---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 17 +++++++++++------
>  drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  1 +
>  drivers/net/tun.c                               |  4 ++--
>  drivers/net/virtio_net.c                        |  2 +-
>  include/net/xdp.h                               |  2 +-
>  kernel/bpf/cpumap.c                             |  6 +++---
>  net/core/xdp.c                                  |  4 +++-
>  8 files changed, 23 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> index cbc20f199364..dfbc15a45cb4 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
> @@ -240,8 +240,7 @@ struct ixgbe_tx_buffer {
>          unsigned long time_stamp;
>          union {
>                  struct sk_buff *skb;
> -                /* XDP uses address ptr on irq_clean */
> -                void *data;
> +                struct xdp_frame *xdpf;
>          };
>          unsigned int bytecount;
>          unsigned short gso_segs;
> @@ -249,7 +248,6 @@ struct ixgbe_tx_buffer {
>          DEFINE_DMA_UNMAP_ADDR(dma);
>          DEFINE_DMA_UNMAP_LEN(len);
>          u32 tx_flags;
> -        struct xdp_mem_info xdp_mem;
>  };
>
>  struct ixgbe_rx_buffer {

I suppose this makes most of my earlier suggestions pointless, though
I would be interested in seeing what extra cost we are taking for
having to initialize one more cache line than we were before to
support the xdp_frame format.

> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index ff069597fccf..e6e9b28ecfba 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -1207,7 +1207,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
>
>                  /* free the skb */
>                  if (ring_is_xdp(tx_ring))
> -                        xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
> +                        xdp_return_frame(tx_buffer->xdpf);
>                  else
>                          napi_consume_skb(tx_buffer->skb, napi_budget);
>
> @@ -2376,6 +2376,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
>                          xdp.data_hard_start = xdp.data -
>                                                ixgbe_rx_offset(rx_ring);
>                          xdp.data_end = xdp.data + size;
> +                        prefetchw(xdp.data_hard_start); /* xdp_frame write */
>
>                          skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
>                  }

Do we really need to be prefetching yet another cache line? This is
going to hurt general performance since for most cases you are now
prefetching a cache line that will never be used.
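To make that concrete, here is a condensed view of the RX fast path
as modified above (simplified from ixgbe_clean_rx_irq(), not the full
driver code):

    /* Executed per received frame, before the verdict is known: */
    xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring);
    xdp.data_end = xdp.data + size;
    prefetchw(xdp.data_hard_start);   /* warms the xdp_frame line */

    skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
    /* The prefetched line is only written if the verdict is XDP_TX
     * or XDP_REDIRECT; for XDP_DROP the prefetchw was wasted work.
     */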
> @@ -5787,7 +5788,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
>
>                  /* Free all the Tx ring sk_buffs */
>                  if (ring_is_xdp(tx_ring))
> -                        xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
> +                        xdp_return_frame(tx_buffer->xdpf);
>                  else
>                          dev_kfree_skb_any(tx_buffer->skb);
>
> @@ -8333,16 +8334,21 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
>          struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
>          struct ixgbe_tx_buffer *tx_buffer;
>          union ixgbe_adv_tx_desc *tx_desc;
> +        struct xdp_frame *xdpf;
>          u32 len, cmd_type;
>          dma_addr_t dma;
>          u16 i;
>
> -        len = xdp->data_end - xdp->data;
> +        xdpf = convert_to_xdp_frame(xdp);
> +        if (unlikely(!xdpf))
> +                return -EOVERFLOW;
> +
> +        len = xdpf->len;
>
>          if (unlikely(!ixgbe_desc_unused(ring)))
>                  return IXGBE_XDP_CONSUMED;
>
> -        dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
> +        dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
>          if (dma_mapping_error(ring->dev, dma))
>                  return IXGBE_XDP_CONSUMED;
>
> @@ -8357,8 +8363,7 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
>
>          dma_unmap_len_set(tx_buffer, len, len);
>          dma_unmap_addr_set(tx_buffer, dma, dma);
> -        tx_buffer->data = xdp->data;
> -        tx_buffer->xdp_mem = xdp->rxq->mem;
> +        tx_buffer->xdpf = xdpf;
>
>          tx_desc->read.buffer_addr = cpu_to_le64(dma);
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index 2ac78b88fc3d..00b9b13d9fea 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -896,6 +896,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
>                                        di->addr + wi->offset,
>                                        0, frag_size,
>                                        DMA_FROM_DEVICE);
> +        prefetchw(va); /* xdp_frame data area */
>          prefetch(data);
>          wi->offset += frag_size;
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 81fddf9cc58f..a7e42ae1b220 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -663,7 +663,7 @@ static void tun_ptr_free(void *ptr)
>          if (tun_is_xdp_frame(ptr)) {
>                  struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
>
> -                xdp_return_frame(xdpf->data, &xdpf->mem);
> +                xdp_return_frame(xdpf);
>          } else {
>                  __skb_array_destroy_skb(ptr);
>          }
> @@ -2188,7 +2188,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>                  struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
>
>                  ret = tun_put_user_xdp(tun, tfile, xdpf, to);
> -                xdp_return_frame(xdpf->data, &xdpf->mem);
> +                xdp_return_frame(xdpf);
>          } else {
>                  struct sk_buff *skb = ptr;
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 48c86accd3b8..479a80339fad 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -430,7 +430,7 @@ static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
>
>          /* Free up any pending old buffers before queueing new ones.
>           */
>          while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> -                xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
> +                xdp_return_frame(xdpf_sent);
>
>          xdpf = convert_to_xdp_frame(xdp);
>          if (unlikely(!xdpf))
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 98b55eaf8fd7..35aa9825fdd0 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
>          return xdp_frame;
>  }
>
> -void xdp_return_frame(void *data, struct xdp_mem_info *mem);
> +void xdp_return_frame(struct xdp_frame *xdpf);
>
>  int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
>                       struct net_device *dev, u32 queue_index);
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index bcdc4dea5ce7..c95b04ec103e 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
>
>          while ((xdpf = ptr_ring_consume(ring)))
>                  if (WARN_ON_ONCE(xdpf))
> -                        xdp_return_frame(xdpf->data, &xdpf->mem);
> +                        xdp_return_frame(xdpf);
>  }
>
>  static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
> @@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data)
>
>                          skb = cpu_map_build_skb(rcpu, xdpf);
>                          if (!skb) {
> -                                xdp_return_frame(xdpf->data, &xdpf->mem);
> +                                xdp_return_frame(xdpf);
>                                  continue;
>                          }
>
> @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
>                  err = __ptr_ring_produce(q, xdpf);
>                  if (err) {
>                          drops++;
> -                        xdp_return_frame(xdpf->data, &xdpf->mem);
> +                        xdp_return_frame(xdpf);
>                  }
>                  processed++;
>          }
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index fe8e87abc266..6ed3d73a73be 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -294,9 +294,11 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
>  }
>  EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
>
> -void xdp_return_frame(void *data, struct xdp_mem_info *mem)
> +void xdp_return_frame(struct xdp_frame *xdpf)
>  {
>          struct xdp_mem_allocator *xa = NULL;
> +        struct xdp_mem_info *mem = &xdpf->mem;
> +        void *data = xdpf->data;
>
>          rcu_read_lock();
>          if (mem->id)
>
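For reference, a driver consuming the new API only stashes the
xdp_frame pointer at xmit time and hands it back on completion; the
mem info travels inside the frame. A hypothetical sketch (my_ring and
my_tx_buffer are made-up names, not from this patch):

    static void my_clean_xdp_tx(struct my_ring *ring)
    {
            while (ring->next_to_clean != ring->next_to_use) {
                    struct my_tx_buffer *buf =
                            &ring->buf[ring->next_to_clean];

                    dma_unmap_single(ring->dev,
                                     dma_unmap_addr(buf, dma),
                                     dma_unmap_len(buf, len),
                                     DMA_TO_DEVICE);
                    /* Reading xdpf->mem inside xdp_return_frame() is
                     * the remote cache-line access the changelog
                     * describes (line goes to "Shared" state here).
                     */
                    xdp_return_frame(buf->xdpf);

                    ring->next_to_clean =
                            (ring->next_to_clean + 1) & ring->mask;
            }
    }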