Anatoly, I noticed you are consolidating the Intel NIC drivers into common code, which is good.
While you are at it, please also consider replacing some ancient code with functions doing the same:
https://git.dpdk.org/dpdk/tree/drivers/net/intel/common/tx.h#n157

Something like (untested):

 static __rte_always_inline int
 ci_tx_free_bufs_vec(struct ci_tx_queue *txq, ci_desc_done_fn desc_done, bool ctx_descs)
 {
 	int nb_free = 0;
 	struct rte_mbuf *free[IETH_VPMD_TX_MAX_FREE_BUF];
 	struct rte_mbuf *m;
 
 	/* check DD bits on threshold descriptor */
 	if (!desc_done(txq, txq->tx_next_dd))
 		return 0;
 
 	const uint32_t n = txq->tx_rs_thresh >> ctx_descs;
 
 	/* first buffer to free from S/W ring is at index
 	 * tx_next_dd - (tx_rs_thresh - 1)
 	 */
 	struct ci_tx_entry_vec *txep = txq->sw_ring_vec;
 	txep += (txq->tx_next_dd >> ctx_descs) - (n - 1);
 
-	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
-		struct rte_mempool *mp = txep[0].mbuf->pool;
-		void **cache_objs;
-		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp, rte_lcore_id());
-
-		if (cache == NULL)
-			goto normal;
-
-		cache_objs = &cache->objs[cache->len];
-
-		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
-			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
-			goto done;
-		}
-
-		/* The cache follows the following algorithm
-		 *   1. Add the objects to the cache
-		 *   2. Anything greater than the cache min value (if it
-		 *   crosses the cache flush threshold) is flushed to the ring.
-		 */
-		/* Add elements back into the cache */
-		uint32_t copied = 0;
-		/* n is multiple of 32 */
-		while (copied < n) {
-			memcpy(&cache_objs[copied], &txep[copied], 32 * sizeof(void *));
-			copied += 32;
-		}
-		cache->len += n;
-
-		if (cache->len >= cache->flushthresh) {
-			rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
-					cache->len - cache->size);
-			cache->len = cache->size;
-		}
-		goto done;
-	}
-
-normal:
-	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
-	if (likely(m)) {
-		free[0] = m;
-		nb_free = 1;
-		for (uint32_t i = 1; i < n; i++) {
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (likely(m)) {
-				if (likely(m->pool == free[0]->pool)) {
-					free[nb_free++] = m;
-				} else {
-					rte_mempool_put_bulk(free[0]->pool, (void *)free, nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (uint32_t i = 1; i < n; i++) {
-			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
-			if (m)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-done:
+	if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
+		rte_mempool_put_bulk(txep[0].mbuf->pool, (void **)txep, n);
+	else
+		rte_pktmbuf_free_bulk((void **)txep, n);
+
 	/* buffers were freed, update counters */
 	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
 	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
 	if (txq->tx_next_dd >= txq->nb_tx_desc)
 		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
 
 	return txq->tx_rs_thresh;
 }

Note: My suggestion relies on the ci_tx_entry_vec structure effectively being the same as an mbuf pointer. The existing code path for RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE also relies on this.

-Morten