On Fri, Jun 06, 2025 at 06:17:07PM +0100, Anatoly Burakov wrote:
> Currently, for 32-byte descriptor format, only SSE instruction set is
> supported. Add implementation for AVX2 and AVX512 instruction sets. Since
> we are using Rx descriptor definitions from common code, we can just use
> the generic descriptor definition, as we only ever write the first 16 bytes
> of it, and the layout is always the same for that part.
>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---

Acked-by: Bruce Richardson <bruce.richard...@intel.com>
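
To expand on the commit message point for anyone else reviewing: both
descriptor sizes start with the same two address words in their read
(rearm) format, roughly along these lines (a sketch from memory, not the
exact common-code definition):

	struct {
		uint64_t pkt_addr; /* packet buffer address, written on rearm */
		uint64_t hdr_addr; /* header buffer address, zeroed on rearm */
	} read;

so a rearm path that writes only those first 16 bytes covers both the
16-byte and 32-byte formats.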
Two small comments inline below.

>  drivers/net/intel/common/rx_vec_x86.h | 365 ++++++++++++++------------
>  1 file changed, 198 insertions(+), 167 deletions(-)
>
> diff --git a/drivers/net/intel/common/rx_vec_x86.h b/drivers/net/intel/common/rx_vec_x86.h
> index ecab8b30a6..86c599cda1 100644
> --- a/drivers/net/intel/common/rx_vec_x86.h
> +++ b/drivers/net/intel/common/rx_vec_x86.h
> @@ -43,206 +43,248 @@ _ci_rxq_rearm_get_bufs(struct ci_rx_queue *rxq)
>  	return 0;
>  }
>
> -/*
> - * SSE code path can handle both 16-byte and 32-byte descriptors with one code
> - * path, as we only ever write 16 bytes at a time.
> +/**
> + * Reformat data from mbuf to descriptor for one RX descriptor, using SSE instruction set.
> + *
> + * @param mhdr pointer to first 16 bytes of mbuf header
> + * @return 16-byte register in descriptor format.
>   */
> -static __rte_always_inline void
> -_ci_rxq_rearm_sse(struct ci_rx_queue *rxq)
> +static __rte_always_inline __m128i
> +_ci_rxq_rearm_desc_sse(const __m128i *mhdr)
>  {
>  	const __m128i hdroom = _mm_set1_epi64x(RTE_PKTMBUF_HEADROOM);
>  	const __m128i zero = _mm_setzero_si128();
> +
> +	/* add headroom to address values */
> +	__m128i reg = _mm_add_epi64(*mhdr, hdroom);
> +
> +#if RTE_IOVA_IN_MBUF
> +	/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */

Comment doesn't seem right here - we are not doing a load op. Perhaps
reword.
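
Maybe something like the following would read better (just a suggestion,
exact wording up to you):

	/* buf_addr is in the low 64 bits of the register, buf_iova in the high 64 bits */

i.e. describe the register contents, since the actual load now happens
when dereferencing mhdr above.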

> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +			offsetof(struct rte_mbuf, buf_addr) + 8);
> +	/* move IOVA to Packet Buffer Address, erase Header Buffer Address */
> +	reg = _mm_unpackhi_epi64(reg, zero);
> +#else
> +	/* erase Header Buffer Address */
> +	reg = _mm_unpacklo_epi64(reg, zero);
> +#endif
> +	return reg;
> +}
> +
> +static __rte_always_inline void
> +_ci_rxq_rearm_sse(struct ci_rx_queue *rxq)
> +{
>  	const uint16_t rearm_thresh = CI_VPMD_RX_REARM_THRESH;
>  	struct ci_rx_entry *rxp = &rxq->sw_ring[rxq->rxrearm_start];
> +	/* SSE writes 16-bytes regardless of descriptor size */
> +	const uint8_t desc_per_reg = 1;
> +	const uint8_t desc_per_iter = desc_per_reg * 2;
>  	volatile union ci_rx_desc *rxdp;
>  	int i;
>
>  	rxdp = &rxq->rx_ring[rxq->rxrearm_start];
>
>  	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> -	for (i = 0; i < rearm_thresh; i += 2, rxp += 2, rxdp += 2) {
> -		struct rte_mbuf *mb0 = rxp[0].mbuf;
> -		struct rte_mbuf *mb1 = rxp[1].mbuf;
> -
> -#if RTE_IOVA_IN_MBUF
> -		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> -		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> -				offsetof(struct rte_mbuf, buf_addr) + 8);
> -#endif
> -		__m128i addr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> -		__m128i addr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> -
> -		/* add headroom to address values */
> -		addr0 = _mm_add_epi64(addr0, hdroom);
> -		addr1 = _mm_add_epi64(addr1, hdroom);
> -
> -#if RTE_IOVA_IN_MBUF
> -		/* move IOVA to Packet Buffer Address, erase Header Buffer Address */
> -		addr0 = _mm_unpackhi_epi64(addr0, zero);
> -		addr0 = _mm_unpackhi_epi64(addr1, zero);
> -#else
> -		/* erase Header Buffer Address */
> -		addr0 = _mm_unpacklo_epi64(addr0, zero);
> -		addr1 = _mm_unpacklo_epi64(addr1, zero);
> -#endif
> -
> -		/* flush desc with pa dma_addr */
> -		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[0]), addr0);
> -		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[1]), addr1);
> +	for (i = 0; i < rearm_thresh;
> +			i += desc_per_iter,
> +			rxp += desc_per_iter,
> +			rxdp += desc_per_iter) {
> +		const __m128i reg0 = _ci_rxq_rearm_desc_sse(
> +				RTE_CAST_PTR(const __m128i *, rxp[0].mbuf));
> +		const __m128i reg1 = _ci_rxq_rearm_desc_sse(
> +				RTE_CAST_PTR(const __m128i *, rxp[1].mbuf));
> +
> +		/* flush descriptors */
> +		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[0]), reg0);
> +		_mm_store_si128(RTE_CAST_PTR(__m128i *, &rxdp[desc_per_reg]), reg1);
>  	}
>  }
>
> -#ifdef RTE_NET_INTEL_USE_16BYTE_DESC
>  #ifdef __AVX2__
> -/* AVX2 version for 16-byte descriptors, handles 4 buffers at a time */
> -static __rte_always_inline void
> -_ci_rxq_rearm_avx2(struct ci_rx_queue *rxq)
> +/**
> + * Reformat data from mbuf to descriptor for one RX descriptor, using AVX2 instruction set.
> + *
> + * Note that for 32-byte descriptors, the second parameter must be zeroed out.

Don't need this note any more, since this function is not used for
32-byte descriptors.

> + *
> + * @param mhdr0 pointer to first 16-bytes of 1st mbuf header.
> + * @param mhdr1 pointer to first 16-bytes of 2nd mbuf header.
> + *
> + * @return 32-byte register with two 16-byte descriptors in it.
> + */

<snip>
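
One more aside for anyone reading along without the rest of the patch:
given the AVX2 helper produces one 16-byte descriptor per 128-bit half, I
would expect combining the two halves into the 32-byte register for the
store to be a simple cast plus insert, something like (untested sketch,
desc0/desc1 are placeholder names, not the patch's actual variables):

	__m256i reg = _mm256_inserti128_si256(
			_mm256_castsi128_si256(desc0), desc1, 1);

with desc1 landing in the upper 128 bits of the register.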