On certain systems with a 4K PAGE_SIZE, using page_pool fragments for
RX buffers causes a significant throughput regression. Profiling shows
that the regression correlates with high overhead in the fragment
allocation and reference-counting paths on these platforms, making the
multi-buffer-per-page strategy counterproductive.
To mitigate this, bypass the page_pool fragment path and force a single
RX packet per page allocation when all the following conditions are met:

1. The system is configured with a 4K PAGE_SIZE.
2. A processor-specific quirk is detected via SMBIOS Type 4 data.

This approach restores expected line-rate performance by ensuring
predictable RX refill behavior on affected hardware. There is no
behavioral change for systems using larger page sizes (16K/64K) or for
platforms where the processor-specific quirk does not apply.

Signed-off-by: Dipayaan Roy <[email protected]>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 120 ++++++++++++++++++
 drivers/net/ethernet/microsoft/mana/mana_en.c |  23 +++-
 include/net/mana/gdma.h                       |  10 ++
 3 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 0055c231acf6..26bbe736a770 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -9,6 +9,7 @@
 #include <linux/msi.h>
 #include <linux/irqdomain.h>
 #include <linux/export.h>
+#include <linux/dmi.h>
 
 #include <net/mana/mana.h>
 #include <net/mana/hw_channel.h>
@@ -1955,6 +1956,115 @@ static bool mana_is_pf(unsigned short dev_id)
 	return dev_id == MANA_PF_DEVICE_ID;
 }
 
+/* Table of Processor Version strings from SMBIOS Type 4 information, for
+ * processors that need the single-RX-buffer-per-page quirk to meet
+ * line-rate performance on ARM64 with 4K pages.
+ * Note: these strings are matched exactly against the version string
+ * fetched from SMBIOS.
+ */
+static const char * const mana_single_rxbuf_per_page_quirk_tbl[] = {
+	"Cobalt 200",
+};
+
+static const char *smbios_get_string(const struct dmi_header *hdr, u8 idx)
+{
+	const u8 *start, *end;
+	u8 i;
+
+	/* SMBIOS string indexing starts from 1. */
+	if (!idx)
+		return NULL;
+
+	start = (const u8 *)hdr + hdr->length;
+	end = start + SMBIOS_STR_AREA_MAX;
+
+	for (i = 1; i < idx; i++) {
+		/* Skip the current string and its NUL terminator. */
+		while (start < end && *start)
+			start++;
+		if (start < end)
+			start++;
+		/* A second consecutive NUL marks the end of the string-set. */
+		if (start < end && *start == 0)
+			return NULL;
+	}
+
+	if (start >= end || *start == 0)
+		return NULL;
+
+	return (const char *)start;
+}
+
+/* On some systems with 4K PAGE_SIZE, page_pool RX fragments can trigger a
+ * throughput regression. Hence identify the affected processors from the
+ * extracted SMBIOS data and apply the quirk that forces one RX buffer per
+ * page, to avoid the fragment allocation/refcounting overhead in the RX
+ * refill path on those processors only.
+ */
+static bool mana_needs_single_rxbuf_per_page(struct gdma_context *gc)
+{
+	const char *ver = gc->processor_version;
+	int i;
+
+	if (!ver)
+		return false;
+
+	if (PAGE_SIZE != SZ_4K)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(mana_single_rxbuf_per_page_quirk_tbl); i++) {
+		if (!strcmp(ver, mana_single_rxbuf_per_page_quirk_tbl[i]))
+			return true;
+	}
+
+	return false;
+}
+
+static void mana_get_proc_ver_from_smbios(const struct dmi_header *hdr,
+					  void *data)
+{
+	struct gdma_context *gc = data;
+	const char *ver_str;
+	u8 idx;
+
+	/* We are only looking for Type 4: Processor Information. */
+	if (hdr->type != SMBIOS_TYPE_4_PROCESSOR_INFO)
+		return;
+
+	/* Ensure the record is long enough to contain the Processor Version
+	 * field.
+	 */
+	if (hdr->length <= SMBIOS_TYPE4_PROC_VERSION_OFFSET)
+		return;
+
+	/* The 'Processor Version' string number is stored at byte offset
+	 * SMBIOS_TYPE4_PROC_VERSION_OFFSET. If the string is found, make a
+	 * copy of it. There can be multiple Type 4 tables, so keep only the
+	 * first processor version found.
+	 */
+	idx = ((const u8 *)hdr)[SMBIOS_TYPE4_PROC_VERSION_OFFSET];
+	ver_str = smbios_get_string(hdr, idx);
+	if (ver_str && !gc->processor_version)
+		gc->processor_version = kstrdup(ver_str, GFP_KERNEL);
+}
+
+/* Check and initialize all processor-specific optimizations/quirks here. */
+static bool mana_init_processor_optimization(struct gdma_context *gc)
+{
+	bool opt_initialized = false;
+
+	gc->processor_version = NULL;
+	dmi_walk(mana_get_proc_ver_from_smbios, gc);
+	if (!gc->processor_version)
+		return false;
+
+	if (mana_needs_single_rxbuf_per_page(gc)) {
+		gc->force_full_page_rx_buffer = true;
+		opt_initialized = true;
+	}
+
+	return opt_initialized;
+}
+
 static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct gdma_context *gc;
@@ -2009,6 +2119,11 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	gc->mana_pci_debugfs = debugfs_create_dir(pci_slot_name(pdev->slot),
 						  mana_debugfs_root);
 
+	if (mana_init_processor_optimization(gc))
+		dev_info(&pdev->dev,
+			 "Processor-specific optimization initialized on: %s\n",
+			 gc->processor_version);
+
 	err = mana_gd_setup(pdev);
 	if (err)
 		goto unmap_bar;
@@ -2051,6 +2166,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_iounmap(pdev, bar0_va);
 free_gc:
 	pci_set_drvdata(pdev, NULL);
+	kfree(gc->processor_version);
+	gc->processor_version = NULL;
 	vfree(gc);
 release_region:
 	pci_release_regions(pdev);
@@ -2106,6 +2223,9 @@ static void mana_gd_remove(struct pci_dev *pdev)
 
 	pci_iounmap(pdev, gc->bar0_va);
 
+	kfree(gc->processor_version);
+	gc->processor_version = NULL;
+
 	vfree(gc);
 
 	pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 91c418097284..a53a8921050b 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -748,6 +748,26 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
+
+	/* On some systems with 4K PAGE_SIZE, page_pool RX fragments can
+	 * trigger a throughput regression. Hence force one RX buffer per
+	 * page to avoid the fragment allocation/refcounting overhead in
+	 * the RX refill path on those processors only.
+	 */
+	if (gc->force_full_page_rx_buffer)
+		return true;
+
+	/* For XDP and jumbo frames, make sure only one packet fits per page.
+	 */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -758,8 +778,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index a59bd4035a99..0ef2d6ac5203 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -9,6 +9,14 @@
 
 #include "shm_channel.h"
 
+#define SMBIOS_STR_AREA_MAX 4096
+
+/* SMBIOS Type 4: Processor Information table */
+#define SMBIOS_TYPE_4_PROCESSOR_INFO 4
+
+/* Byte offset containing the Processor Version string number. */
+#define SMBIOS_TYPE4_PROC_VERSION_OFFSET 0x10
+
 #define GDMA_STATUS_MORE_ENTRIES 0x00000105
 #define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff
@@ -436,6 +444,8 @@ struct gdma_context {
 	struct workqueue_struct *service_wq;
 
 	unsigned long flags;
+	char *processor_version;
+	bool force_full_page_rx_buffer;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
-- 
2.43.0
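
P.S. For anyone wanting to sanity-check the smbios_get_string() walk in
isolation: below is a minimal userspace sketch, not part of the patch. It
mirrors the same string-set parsing (per the SMBIOS convention of
NUL-terminated strings with a double NUL ending the set) against a
fabricated Type 4 record. The record bytes, the local STR_AREA_MAX bound,
and the trimmed dmi_header layout are assumptions made for the test only.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the leading fields of the kernel's struct dmi_header. */
struct dmi_header {
	uint8_t type;
	uint8_t length;
	uint16_t handle;
};

#define STR_AREA_MAX 4096	/* assumed bound, as in the patch */

static const char *smbios_get_string(const struct dmi_header *hdr, uint8_t idx)
{
	const uint8_t *start, *end;
	uint8_t i;

	if (!idx)		/* SMBIOS string indexing starts from 1 */
		return NULL;

	start = (const uint8_t *)hdr + hdr->length;
	end = start + STR_AREA_MAX;

	for (i = 1; i < idx; i++) {
		while (start < end && *start)	/* skip current string */
			start++;
		if (start < end)
			start++;		/* skip its NUL terminator */
		if (start < end && *start == 0)	/* double NUL: end of set */
			return NULL;
	}

	if (start >= end || *start == 0)
		return NULL;

	return (const char *)start;
}

int main(void)
{
	/* Fabricated record: 4-byte header padded to length 8, followed by
	 * the string-set "SocketA\0Cobalt 200\0\0".
	 */
	static const uint8_t rec[] = {
		4, 8, 0, 0, 0, 0, 0, 0,
		'S', 'o', 'c', 'k', 'e', 't', 'A', 0,
		'C', 'o', 'b', 'a', 'l', 't', ' ', '2', '0', '0', 0,
		0,
	};
	const struct dmi_header *hdr = (const void *)rec;

	printf("idx 1: %s\n", smbios_get_string(hdr, 1));	/* SocketA */
	printf("idx 2: %s\n", smbios_get_string(hdr, 2));	/* Cobalt 200 */
	printf("idx 3: %s\n", smbios_get_string(hdr, 3) ?
	       "found" : "NULL (end of set)");			/* NULL */
	return 0;
}

Walking past a string and landing on another NUL means the set has ended,
which is why the in-loop check tests a single byte at the new position
rather than looking ahead for two more NULs.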

