The ci/iol-broadcom-Functional failure reported on this patch appears to be unrelated to the change itself.
> -----Original Message-----
> From: Kathleen Capella <kathleen.cape...@arm.com>
> Sent: Thursday, March 24, 2022 5:12 PM
> To: Jingjing Wu <jingjing...@intel.com>; Beilei Xing <beilei.x...@intel.com>
> Cc: dev@dpdk.org; nd <n...@arm.com>; Honnappa Nagarahalli
> <honnappa.nagaraha...@arm.com>; Dharmik Thakkar
> <dharmik.thak...@arm.com>; Kathleen Capella
> <kathleen.cape...@arm.com>
> Subject: [PATCH] net/iavf: remove extra copy step in Rx bulk path
>
> In the Rx bulk path, packets which are taken from the HW ring, are first
> copied to the stage data structure and then later copied from the stage to the
> rx_pkts array. For the number of packets requested immediately by the
> receiving function, this two-step process adds extra overhead that is not
> necessary.
>
> Instead, put requested number of packets directly into the rx_pkts array and
> only stage excess packets. On N1SDP with 1 core/port, l3fwd saw up to 4%
> performance improvement. On x86, no difference in performance was
> observed.
>
> Signed-off-by: Kathleen Capella <kathleen.cape...@arm.com>
> Suggested-by: Dharmik Thakkar <dharmik.thak...@arm.com>
> ---
>  drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
>  1 file changed, 49 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
> index 16e8d021f9..245dd225fd 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -1813,7 +1813,9 @@ iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
>
>  #define IAVF_LOOK_AHEAD 8
>  static inline int
> -iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
> +iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
> +			      struct rte_mbuf **rx_pkts,
> +			      uint16_t nb_pkts)
>  {
>  	volatile union iavf_rx_flex_desc *rxdp;
>  	struct rte_mbuf **rxep;
> @@ -1822,6 +1824,7 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
>  	uint16_t pkt_len;
>  	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
>  	int32_t i, j, nb_rx = 0;
> +	int32_t nb_staged = 0;
>  	uint64_t pkt_flags;
>  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
>
> @@ -1867,8 +1870,6 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
>  #endif
>  		}
>
> -		nb_rx += nb_dd;
> -
>  		/* Translate descriptor info to mbuf parameters */
>  		for (j = 0; j < nb_dd; j++) {
>  			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
> @@ -1892,24 +1893,34 @@ iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
>  			pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
>
>  			mb->ol_flags |= pkt_flags;
> -		}
>
> -		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
> -			rxq->rx_stage[i + j] = rxep[j];
> +			/* Put up to nb_pkts directly into buffers */
> +			if ((i + j) < nb_pkts) {
> +				rx_pkts[i + j] = rxep[j];
> +				nb_rx++;
> +			} else {
> +				/* Stage excess pkts received */
> +				rxq->rx_stage[nb_staged] = rxep[j];
> +				nb_staged++;
> +			}
> +		}
>
>  		if (nb_dd != IAVF_LOOK_AHEAD)
>  			break;
>  	}
>
> +	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
> +	rxq->rx_nb_avail = nb_staged;
> +
>  	/* Clear software ring entries */
> -	for (i = 0; i < nb_rx; i++)
> +	for (i = 0; i < (nb_rx + nb_staged); i++)
>  		rxq->sw_ring[rxq->rx_tail + i] = NULL;
>
>  	return nb_rx;
>  }
>
>  static inline int
> -iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
> +iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> +		     uint16_t nb_pkts)
>  {
>  	volatile union iavf_rx_desc *rxdp;
>  	struct rte_mbuf **rxep;
> @@ -1919,6 +1930,7 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
>  	uint32_t rx_status;
>  	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
>  	int32_t i, j, nb_rx = 0;
> +	int32_t nb_staged = 0;
>  	uint64_t pkt_flags;
>  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
>
> @@ -1970,8 +1982,6 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
>  #endif
>  		}
>
> -		nb_rx += nb_dd;
> -
>  		/* Translate descriptor info to mbuf parameters */
>  		for (j = 0; j < nb_dd; j++) {
>  			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
> @@ -2000,17 +2010,26 @@ iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
>  				pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
>
>  			mb->ol_flags |= pkt_flags;
> -		}
>
> -		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
> -			rxq->rx_stage[i + j] = rxep[j];
> +			/* Put up to nb_pkts directly into buffers */
> +			if ((i + j) < nb_pkts) {
> +				rx_pkts[i + j] = rxep[j];
> +				nb_rx++;
> +			} else { /* Stage excess pkts received */
> +				rxq->rx_stage[nb_staged] = rxep[j];
> +				nb_staged++;
> +			}
> +		}
>
>  		if (nb_dd != IAVF_LOOK_AHEAD)
>  			break;
>  	}
>
> +	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
> +	rxq->rx_nb_avail = nb_staged;
> +
>  	/* Clear software ring entries */
> -	for (i = 0; i < nb_rx; i++)
> +	for (i = 0; i < (nb_rx + nb_staged); i++)
>  		rxq->sw_ring[rxq->rx_tail + i] = NULL;
>
>  	return nb_rx;
> @@ -2098,23 +2117,31 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>  		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
>
>  	if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
> -		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
> +		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts,
> +								nb_pkts);
>  	else
> -		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
> +		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
> +
>  	rxq->rx_next_avail = 0;
> -	rxq->rx_nb_avail = nb_rx;
> -	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
> +	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
>
>  	if (rxq->rx_tail > rxq->rx_free_trigger) {
>  		if (iavf_rx_alloc_bufs(rxq) != 0) {
> -			uint16_t i, j;
> +			uint16_t i, j, nb_staged;
>
>  			/* TODO: count rx_mbuf_alloc_failed here */
>
> +			nb_staged = rxq->rx_nb_avail;
>  			rxq->rx_nb_avail = 0;
> -			rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
> -			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
> +
> +			rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
> +			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
> +				rxq->sw_ring[j] = rx_pkts[i];
> +				rx_pkts[i] = NULL;
> +			}
> +			for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++) {
>  				rxq->sw_ring[j] = rxq->rx_stage[i];
> +				rx_pkts[i] = NULL;
> +			}
>
>  			return 0;
>  		}
> @@ -2127,10 +2154,7 @@ rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
>  			   rxq->port_id, rxq->queue_id,
>  			   rxq->rx_tail, nb_rx);
>
> -	if (rxq->rx_nb_avail)
> -		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
> -
> -	return 0;
> +	return nb_rx;
>  }
>
>  static uint16_t
> --
> 2.31.1
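
For anyone skimming the diff: the scan functions now fill the caller's rx_pkts array directly for up to nb_pkts packets and only push the overflow into rxq->rx_stage, and they record that overflow in rxq->rx_nb_avail so the existing fill-from-stage path drains it on the next call. A minimal, self-contained sketch of that split, just for illustration (scan_ring, struct pkt and STAGE_SIZE are stand-ins, not the iavf driver's own symbols):

/* Illustration only: simplified model of the direct-fill + staging split.
 * Names (scan_ring, struct pkt, STAGE_SIZE) are stand-ins, not the iavf
 * driver's own symbols.
 */
#include <stdio.h>

#define STAGE_SIZE 64

struct pkt { int id; };                 /* stand-in for struct rte_mbuf */

/* Take 'avail' packets off the ring: the first nb_pkts go straight into
 * the caller's rx_pkts array, the rest are staged for the next call.
 * Returns the number placed in rx_pkts; *nb_staged reports the overflow.
 */
static int
scan_ring(struct pkt **ring, int avail, struct pkt **rx_pkts, int nb_pkts,
	  struct pkt **stage, int *nb_staged)
{
	int i, nb_rx = 0;

	*nb_staged = 0;
	for (i = 0; i < avail; i++) {
		if (i < nb_pkts)
			rx_pkts[nb_rx++] = ring[i];      /* direct fill */
		else
			stage[(*nb_staged)++] = ring[i]; /* stage excess */
	}
	return nb_rx;
}

int main(void)
{
	struct pkt p[12], *ring[12], *rx[8], *stage[STAGE_SIZE];
	int i, nb_rx, nb_staged;

	for (i = 0; i < 12; i++) {
		p[i].id = i;
		ring[i] = &p[i];
	}
	/* 12 descriptors ready, caller asked for 8: 8 direct, 4 staged */
	nb_rx = scan_ring(ring, 12, rx, 8, stage, &nb_staged);
	printf("filled %d, staged %d\n", nb_rx, nb_staged);
	return 0;
}

This keeps the common case (request fully satisfied from the current scan) to a single copy, which is where the reported N1SDP gain comes from.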