On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment allocation in the RX refill path can cause a 15-20% throughput regression under high connection counts (>16 TCP streams).
Add an ethtool private flag "full-page-rx" that allows the user to force one RX buffer per page, bypassing the page_pool fragment path. This restores line-rate(180+ Gbps) performance on affected platforms. Usage: ethtool --set-priv-flags eth0 full-page-rx on There is no behavioral change by default. The flag must be explicitly enabled by the user or udev rule. The existing single-buffer-per-page logic for XDP and jumbo frames is consolidated into a new helper mana_use_single_rxbuf_per_page(). Signed-off-by: Dipayaan Roy <[email protected]> --- Changes in v4: - Dropping the smbios string parsing and add ethtool priv flag to reconfigure the queues with full page rx buffers. Changes in v3: - changed u8* to char* Changes in v2: - separate reading string index and the string, remove inline. --- drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++- .../ethernet/microsoft/mana/mana_ethtool.c | 159 +++++++++++++++--- include/net/mana/mana.h | 8 + 3 files changed, 159 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 49c65cc1697c..59a1626c2be1 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da) return va; } +static bool +mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu) +{ + /* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation + * in the RX refill path (~2kB buffer) can cause significant throughput + * regression under high connection counts. Allow user to force one RX + * buffer per page via ethtool private flag to bypass the fragment + * path. + */ + if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) + return true; + + /* For xdp and jumbo frames make sure only one packet fits per page. 
*/ + if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) + return true; + + return false; +} + /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */ static void mana_get_rxbuf_cfg(struct mana_port_context *apc, int mtu, u32 *datasize, u32 *alloc_size, @@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc, /* Calculate datasize first (consistent across all cases) */ *datasize = mtu + ETH_HLEN; - /* For xdp and jumbo frames make sure only one packet fits per page */ - if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) { + if (mana_use_single_rxbuf_per_page(apc, mtu)) { if (mana_xdp_get(apc)) { *headroom = XDP_PACKET_HEADROOM; *alloc_size = PAGE_SIZE; diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index 6a4b42fe0944..9f7393b71a34 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -133,58 +133,91 @@ static const struct mana_stats_desc mana_phy_stats[] = { { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) }, }; +static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = { + [MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx" +}; + static int mana_get_sset_count(struct net_device *ndev, int stringset) { struct mana_port_context *apc = netdev_priv(ndev); unsigned int num_queues = apc->num_queues; - if (stringset != ETH_SS_STATS) + switch (stringset) { + case ETH_SS_STATS: + return ARRAY_SIZE(mana_eth_stats) + + ARRAY_SIZE(mana_phy_stats) + + ARRAY_SIZE(mana_hc_stats) + + num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT); + case ETH_SS_PRIV_FLAGS: + return MANA_PRIV_FLAG_MAX; + default: return -EINVAL; + } +} + +static void mana_get_strings_priv_flags(u8 **data) +{ + int i; - return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) + - num_queues * (MANA_STATS_RX_COUNT + 
MANA_STATS_TX_COUNT); + for (i = 0; i < MANA_PRIV_FLAG_MAX; i++) + ethtool_puts(data, mana_priv_flags[i]); } -static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) +static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data) { - struct mana_port_context *apc = netdev_priv(ndev); unsigned int num_queues = apc->num_queues; int i, j; - if (stringset != ETH_SS_STATS) - return; for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++) - ethtool_puts(&data, mana_eth_stats[i].name); + ethtool_puts(data, mana_eth_stats[i].name); for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++) - ethtool_puts(&data, mana_hc_stats[i].name); + ethtool_puts(data, mana_hc_stats[i].name); for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++) - ethtool_puts(&data, mana_phy_stats[i].name); + ethtool_puts(data, mana_phy_stats[i].name); for (i = 0; i < num_queues; i++) { - ethtool_sprintf(&data, "rx_%d_packets", i); - ethtool_sprintf(&data, "rx_%d_bytes", i); - ethtool_sprintf(&data, "rx_%d_xdp_drop", i); - ethtool_sprintf(&data, "rx_%d_xdp_tx", i); - ethtool_sprintf(&data, "rx_%d_xdp_redirect", i); - ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i); + ethtool_sprintf(data, "rx_%d_packets", i); + ethtool_sprintf(data, "rx_%d_bytes", i); + ethtool_sprintf(data, "rx_%d_xdp_drop", i); + ethtool_sprintf(data, "rx_%d_xdp_tx", i); + ethtool_sprintf(data, "rx_%d_xdp_redirect", i); + ethtool_sprintf(data, "rx_%d_pkt_len0_err", i); for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++) - ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2); + ethtool_sprintf(data, + "rx_%d_coalesced_cqe_%d", + i, + j + 2); } for (i = 0; i < num_queues; i++) { - ethtool_sprintf(&data, "tx_%d_packets", i); - ethtool_sprintf(&data, "tx_%d_bytes", i); - ethtool_sprintf(&data, "tx_%d_xdp_xmit", i); - ethtool_sprintf(&data, "tx_%d_tso_packets", i); - ethtool_sprintf(&data, "tx_%d_tso_bytes", i); - ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i); - ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i); 
- ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i); - ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i); - ethtool_sprintf(&data, "tx_%d_csum_partial", i); - ethtool_sprintf(&data, "tx_%d_mana_map_err", i); + ethtool_sprintf(data, "tx_%d_packets", i); + ethtool_sprintf(data, "tx_%d_bytes", i); + ethtool_sprintf(data, "tx_%d_xdp_xmit", i); + ethtool_sprintf(data, "tx_%d_tso_packets", i); + ethtool_sprintf(data, "tx_%d_tso_bytes", i); + ethtool_sprintf(data, "tx_%d_tso_inner_packets", i); + ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i); + ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i); + ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i); + ethtool_sprintf(data, "tx_%d_csum_partial", i); + ethtool_sprintf(data, "tx_%d_mana_map_err", i); + } +} + +static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) +{ + struct mana_port_context *apc = netdev_priv(ndev); + + switch (stringset) { + case ETH_SS_PRIV_FLAGS: + mana_get_strings_priv_flags(&data); + break; + + case ETH_SS_STATS: + mana_get_strings_stats(apc, &data); + break; } } @@ -573,6 +606,74 @@ static int mana_get_link_ksettings(struct net_device *ndev, return 0; } +static u32 mana_get_priv_flags(struct net_device *ndev) +{ + struct mana_port_context *apc = netdev_priv(ndev); + + return apc->priv_flags; +} + +static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags) +{ + struct mana_port_context *apc = netdev_priv(ndev); + u32 changed = apc->priv_flags ^ priv_flags; + u32 old_priv_flags = apc->priv_flags; + bool schedule_port_reset = false; + int err = 0; + + if (!changed) + return 0; + + /* Reject unknown bits */ + if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0)) + return -EINVAL; + + if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) { + apc->priv_flags = priv_flags; + + if (!apc->port_is_up) { + /* Port is down, flag updated to apply on next up + * so just return. 
+ */ + return 0; + } + + /* Pre-allocate buffers to prevent failure in mana_attach + * later + */ + err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues); + if (err) { + netdev_err(ndev, + "Insufficient memory for new allocations\n"); + apc->priv_flags = old_priv_flags; + return err; + } + + err = mana_detach(ndev, false); + if (err) { + netdev_err(ndev, "mana_detach failed: %d\n", err); + apc->priv_flags = old_priv_flags; + goto out; + } + + err = mana_attach(ndev); + if (err) { + netdev_err(ndev, "mana_attach failed: %d\n", err); + apc->priv_flags = old_priv_flags; + schedule_port_reset = true; + } + } + +out: + mana_pre_dealloc_rxbufs(apc); + + if (err && schedule_port_reset) + queue_work(apc->ac->per_port_queue_reset_wq, + &apc->queue_reset_work); + + return err; +} + const struct ethtool_ops mana_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES, .get_ethtool_stats = mana_get_ethtool_stats, @@ -591,4 +692,6 @@ const struct ethtool_ops mana_ethtool_ops = { .set_ringparam = mana_set_ringparam, .get_link_ksettings = mana_get_link_ksettings, .get_link = ethtool_op_get_link, + .get_priv_flags = mana_get_priv_flags, + .set_priv_flags = mana_set_priv_flags, }; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 3336688fed5e..fd87e3d6c1f4 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -30,6 +30,12 @@ enum TRI_STATE { TRI_STATE_TRUE = 1 }; +/* MANA ethtool private flag bit positions */ +enum mana_priv_flag_bits { + MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0, + MANA_PRIV_FLAG_MAX, +}; + /* Number of entries for hardware indirection table must be in power of 2 */ #define MANA_INDIRECT_TABLE_MAX_SIZE 512 #define MANA_INDIRECT_TABLE_DEF_SIZE 64 @@ -531,6 +537,8 @@ struct mana_port_context { u32 rxbpre_headroom; u32 rxbpre_frag_count; + u32 priv_flags; + struct bpf_prog *bpf_prog; /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */ -- 2.43.0

