Add a separate routine to handle the no-fast-free offload in the vector Tx path for multi-segmented packets.
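With the no-fast-free (NIX_TX_OFFLOAD_MBUF_NOFF_F) offload, the hardware must not unconditionally free transmitted buffers; the driver instead sets per-segment invert-DF bits (bit 55 onwards of the SG descriptor word) from cnxk_nix_prefree_seg(). Until now that bit handling was duplicated across the vector single-seg and multi-seg paths; the new cn10k_nix_prepare_mseg_vec_noff() routine routes both through the scalar cn10k_nix_prepare_mseg() instead. The standalone sketch below (illustration only, not driver code; demo_prefree_seg() is a hypothetical stand-in for cnxk_nix_prefree_seg()) shows the bit layout being consolidated:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for cnxk_nix_prefree_seg(): returns 1 when the
 * segment must NOT be freed by hardware (e.g. its refcount is > 1),
 * 0 when hardware may free it back to the pool.
 */
static uint64_t demo_prefree_seg(int hw_must_not_free)
{
	return hw_must_not_free ? 1 : 0;
}

int main(void)
{
	/* SG descriptor word: the driver ORs the per-segment invert-DF
	 * ("don't free") flags into bits 55, 56, 57, ... exactly as the
	 * removed inline code did with sg_u |= (... << (i + 55)).
	 */
	uint64_t sg_u = 0;
	int keep[3] = { 1, 0, 1 }; /* example: segments 0 and 2 stay with SW */
	int i;

	for (i = 0; i < 3; i++)
		sg_u |= demo_prefree_seg(keep[i]) << (i + 55);

	printf("sg_u invert-DF bits = 0x%016llx\n",
	       (unsigned long long)sg_u);
	return 0;
}

Funnelling all no-fast-free descriptor preparation through one routine also keeps the RTE_LIBRTE_MEMPOOL_DEBUG cookie handling in a single place, inside cn10k_nix_prepare_mseg().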
Signed-off-by: Ashwin Sekhar T K <[email protected]>
---
 drivers/net/cnxk/cn10k_tx.h | 124 +++++++++++++++++-------------------
 1 file changed, 59 insertions(+), 65 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 815cd2ff1f..a4c578354c 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -956,6 +956,14 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 		rte_io_wmb();
 #endif
 	m->next = NULL;
+
+	/* Quickly handle single segmented packets. With this if-condition,
+	 * the compiler will completely optimize out the below do-while loop
+	 * from the Tx handler when the NIX_TX_MULTI_SEG_F offload is not set.
+	 */
+	if (!(flags & NIX_TX_MULTI_SEG_F))
+		goto done;
+
 	m = m_next;
 	if (!m)
 		goto done;
@@ -1360,6 +1368,30 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline uint16_t
+cn10k_nix_prepare_mseg_vec_noff(struct rte_mbuf *m, uint64_t *cmd,
+				uint64x2_t *cmd0, uint64x2_t *cmd1,
+				uint64x2_t *cmd2, uint64x2_t *cmd3,
+				const uint32_t flags)
+{
+	uint16_t segdw;
+
+	vst1q_u64(cmd, *cmd0); /* Send hdr */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		vst1q_u64(cmd + 2, *cmd2); /* ext hdr */
+		vst1q_u64(cmd + 4, *cmd1); /* sg */
+	} else {
+		vst1q_u64(cmd + 2, *cmd1); /* sg */
+	}
+
+	segdw = cn10k_nix_prepare_mseg(m, cmd, flags);
+
+	if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
+		vst1q_u64(cmd + segdw * 2 - 2, *cmd3);
+
+	return segdw;
+}
+
 static __rte_always_inline void
 cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 				union nix_send_hdr_w0_u *sh,
@@ -1389,17 +1421,6 @@ cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 
 	nb_segs = m->nb_segs - 1;
 	m_next = m->next;
-
-	/* Set invert df if buffer is not to be freed by H/W */
-	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
-	/* Mark mempool object as "put" since it is freed by NIX */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-	if (!(sg_u & (1ULL << 55)))
-		RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
-	rte_io_wmb();
-#endif
-
 	m->next = NULL;
 	m = m_next;
 	/* Fill mbuf segments */
@@ -1409,16 +1430,6 @@ cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
 		len -= dlen;
 		sg_u = sg_u | ((uint64_t)dlen << (i << 4));
 		*slist = rte_mbuf_data_iova(m);
-		/* Set invert df if buffer is not to be freed by H/W */
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
-		/* Mark mempool object as "put" since it is freed by NIX
-		 */
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-		if (!(sg_u & (1ULL << (i + 55))))
-			RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
-		rte_io_wmb();
-#endif
 		slist++;
 		i++;
 		nb_segs--;
@@ -1456,21 +1467,8 @@ cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
 	union nix_send_hdr_w0_u sh;
 	union nix_send_sg_s sg;
 
-	if (m->nb_segs == 1) {
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-			sg.u = vgetq_lane_u64(cmd1[0], 0);
-			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
-			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
-		}
-
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-		sg.u = vgetq_lane_u64(cmd1[0], 0);
-		if (!(sg.u & (1ULL << 55)))
-			RTE_MEMPOOL_CHECK_COOKIES(m->pool, (void **)&m, 1, 0);
-		rte_io_wmb();
-#endif
+	if (m->nb_segs == 1)
 		return;
-	}
 
 	sh.u = vgetq_lane_u64(cmd0[0], 0);
 	sg.u = vgetq_lane_u64(cmd1[0], 0);
@@ -1491,16 +1489,32 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 			       uint64_t *lmt_addr, __uint128_t *data128,
 			       uint8_t *shift, const uint16_t flags)
 {
-	uint8_t j, off, lmt_used;
+	uint8_t j, off, lmt_used = 0;
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		off = 0;
+		for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
+			if (off + segdw[j] > 8) {
+				*data128 |= ((__uint128_t)off - 1) << *shift;
+				*shift += 3;
+				lmt_used++;
+				lmt_addr += 16;
+				off = 0;
+			}
+			off += cn10k_nix_prepare_mseg_vec_noff(mbufs[j],
+					lmt_addr + off * 2, &cmd0[j], &cmd1[j],
+					&cmd2[j], &cmd3[j], flags);
+		}
+		*data128 |= ((__uint128_t)off - 1) << *shift;
+		*shift += 3;
+		lmt_used++;
+		return lmt_used;
+	}
 
 	if (!(flags & NIX_TX_NEED_EXT_HDR) &&
 	    !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
 		/* No segments in 4 consecutive packets. */
 		if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
-			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
-				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
-							   &cmd0[j], &cmd1[j],
-							   segdw[j], flags);
 			vst1q_u64(lmt_addr, cmd0[0]);
 			vst1q_u64(lmt_addr + 2, cmd1[0]);
 			vst1q_u64(lmt_addr + 4, cmd0[1]);
@@ -1517,18 +1531,10 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 		}
 	}
 
-	lmt_used = 0;
 	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
 		/* Fit consecutive packets in same LMTLINE. */
 		if ((segdw[j] + segdw[j + 1]) <= 8) {
 			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
-				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
-							   &cmd0[j], &cmd1[j],
-							   segdw[j], flags);
-				cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
-							   &cmd0[j + 1],
-							   &cmd1[j + 1],
-							   segdw[j + 1], flags);
 				/* TSTAMP takes 4 each, no segs. */
 				vst1q_u64(lmt_addr, cmd0[j]);
 				vst1q_u64(lmt_addr + 2, cmd2[j]);
@@ -1643,23 +1649,11 @@ cn10k_nix_xmit_store(struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
 {
 	uint8_t off;
 
-	/* Handle no fast free when security is enabled without mseg */
-	if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
-	    (flags & NIX_TX_OFFLOAD_SECURITY_F) &&
-	    !(flags & NIX_TX_MULTI_SEG_F)) {
-		union nix_send_sg_s sg;
-
-		sg.u = vgetq_lane_u64(cmd1, 0);
-		sg.u |= (cnxk_nix_prefree_seg(mbuf) << 55);
-		cmd1 = vsetq_lane_u64(sg.u, cmd1, 0);
-
-#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
-		sg.u = vgetq_lane_u64(cmd1, 0);
-		if (!(sg.u & (1ULL << 55)))
-			RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf, 1,
-						  0);
-		rte_io_wmb();
-#endif
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		cn10k_nix_prepare_mseg_vec_noff(mbuf, LMT_OFF(laddr, 0, 0),
+						&cmd0, &cmd1, &cmd2, &cmd3,
+						flags);
+		return;
 	}
 	if (flags & NIX_TX_MULTI_SEG_F) {
 		if ((flags & NIX_TX_NEED_EXT_HDR) &&
-- 
2.25.1

