From: Stewart Hildebrand <[email protected]> This code is expected to only be used by privileged domains; unprivileged domains should not get access to the SR-IOV capability.
Implement RW handlers for PCI_SRIOV_CTRL register to dynamically map/unmap VF BARS. Recalculate BAR sizes before mapping VFs to account for possible changes in the system page size register. Allow forcing vpci_modify_bars to not defer the actual mapping changes, which is needed to fix the sequential calls to vpci_modify_bars when enabling VFs from Dom0. Signed-off-by: Roger Pau Monné <[email protected]> Signed-off-by: Stewart Hildebrand <[email protected]> Signed-off-by: Mykyta Poturai <[email protected]> --- v1->v2: * switch to VF discovery by Xen * fix sequential vpci_modify_bars calls * move documentation changes to a separate commit --- xen/drivers/vpci/Makefile | 2 +- xen/drivers/vpci/header.c | 17 +- xen/drivers/vpci/sriov.c | 363 ++++++++++++++++++++++++++++++++++++++ xen/include/xen/vpci.h | 12 +- 4 files changed, 385 insertions(+), 9 deletions(-) create mode 100644 xen/drivers/vpci/sriov.c diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile index a7c8a30a89..fe1e57b64d 100644 --- a/xen/drivers/vpci/Makefile +++ b/xen/drivers/vpci/Makefile @@ -1,2 +1,2 @@ -obj-y += vpci.o header.o rebar.o +obj-y += vpci.o header.o rebar.o sriov.o obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c index 284964f0d4..c55c3380d4 100644 --- a/xen/drivers/vpci/header.c +++ b/xen/drivers/vpci/header.c @@ -264,7 +264,7 @@ bool vpci_process_pending(struct vcpu *v) return false; } -static int __init apply_map(struct domain *d, const struct pci_dev *pdev, +static int apply_map(struct domain *d, const struct pci_dev *pdev, uint16_t cmd) { struct vpci_header *header = &pdev->vpci->header; @@ -323,7 +323,8 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) raise_softirq(SCHEDULE_SOFTIRQ); } -int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) +int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only, + bool no_defer) { struct vpci_header 
*header = &pdev->vpci->header; struct pci_dev *tmp; @@ -519,7 +520,7 @@ int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only) d = dom_xen; } - if ( system_state < SYS_STATE_active ) + if ( system_state < SYS_STATE_active || no_defer ) { /* * Mappings might be created when building Dom0 if the memory decoding @@ -566,7 +567,7 @@ static void cf_check cmd_write( * memory decoding bit has not been changed, so leave everything as-is, * hoping the guest will realize and try again. */ - vpci_modify_bars(pdev, cmd, false); + vpci_modify_bars(pdev, cmd, false, false); else pci_conf_write16(pdev->sbdf, reg, cmd); } @@ -736,7 +737,7 @@ static void cf_check rom_write( */ else if ( vpci_modify_bars(pdev, new_enabled ? PCI_COMMAND_MEMORY : 0, - true) ) + true, false) ) /* * No memory has been added or removed from the p2m (because the actual * p2m changes are deferred in defer_map) and the ROM enable bit has @@ -954,6 +955,9 @@ int vpci_init_header(struct pci_dev *pdev) header->guest_cmd = cmd; + if ( pdev->info.is_virtfn ) + return vf_init_header(pdev); + /* Disable memory decoding before sizing. */ if ( !is_hwdom || (cmd & PCI_COMMAND_MEMORY) ) pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY); @@ -1062,7 +1066,8 @@ int vpci_init_header(struct pci_dev *pdev) goto fail; } - return (cmd & PCI_COMMAND_MEMORY) ? vpci_modify_bars(pdev, cmd, false) : 0; + return (cmd & PCI_COMMAND_MEMORY) + ? vpci_modify_bars(pdev, cmd, false, false) : 0; fail: pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd); diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c new file mode 100644 index 0000000000..6f691149e9 --- /dev/null +++ b/xen/drivers/vpci/sriov.c @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Handlers for accesses to the SR-IOV capability structure. 
+ * + * Copyright (C) 2026 Citrix Systems R&D + */ + +#include <xen/sched.h> +#include <xen/vpci.h> +#include <xsm/xsm.h> + +static int vf_init_bars(const struct pci_dev *vf_pdev) +{ + int vf_idx; + unsigned int i; + const struct pci_dev *pf_pdev = vf_pdev->pf_pdev; + struct vpci_bar *bars = vf_pdev->vpci->header.bars; + struct vpci_bar *physfn_vf_bars = pf_pdev->vpci->sriov->vf_bars; + unsigned int sriov_pos = pci_find_ext_capability(pf_pdev, + PCI_EXT_CAP_ID_SRIOV); + uint16_t offset = pci_conf_read16(pf_pdev->sbdf, + sriov_pos + PCI_SRIOV_VF_OFFSET); + uint16_t stride = pci_conf_read16(pf_pdev->sbdf, + sriov_pos + PCI_SRIOV_VF_STRIDE); + + vf_idx = vf_pdev->sbdf.sbdf - (pf_pdev->sbdf.sbdf + offset); + if ( vf_idx < 0 ) + return -EINVAL; + + if ( stride ) + { + if ( vf_idx % stride ) + return -EINVAL; + vf_idx /= stride; + } + + /* + * Set up BARs for this VF out of PF's VF BARs taking into account + * the index of the VF. + */ + for ( i = 0; i < PCI_SRIOV_NUM_BARS; i++ ) + { + bars[i].addr = physfn_vf_bars[i].addr + vf_idx * physfn_vf_bars[i].size; + bars[i].guest_addr = bars[i].addr; + bars[i].size = physfn_vf_bars[i].size; + bars[i].type = physfn_vf_bars[i].type; + bars[i].prefetchable = physfn_vf_bars[i].prefetchable; + } + + return 0; +} + +/* Must be called from vpci_process_pending context */ +static int map_vfs(const struct pci_dev *pf_pdev, uint16_t cmd) +{ + struct pci_dev *vf_pdev; + int rc; + + ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock)); + + list_for_each_entry(vf_pdev, &pf_pdev->vf_list, vf_list) { + rc = vpci_modify_bars(vf_pdev, cmd, false, true); + if ( rc ) + { + gprintk(XENLOG_ERR, "failed to %s VF %pp: %d\n", + (cmd & PCI_COMMAND_MEMORY) ? "map" : "unmap", + &vf_pdev->sbdf, rc); + return rc; + } + } + + return 0; +} + + +static int size_vf_bars(struct pci_dev *pf_pdev, unsigned int sriov_pos) +{ + /* + * NB: a non-const pci_dev of the PF is needed in order to update + * vf_rlen. 
+ */ + struct vpci_bar *bars; + unsigned int i; + int rc = 0; + + ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock)); + ASSERT(!pf_pdev->info.is_virtfn); + ASSERT(pf_pdev->vpci->sriov); + + /* Read BARs for VFs out of PF's SR-IOV extended capability. */ + bars = pf_pdev->vpci->sriov->vf_bars; + /* Set the BARs addresses and size. */ + for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc ) + { + unsigned int idx = sriov_pos + PCI_SRIOV_BAR + i * 4; + uint32_t bar; + uint64_t addr, size; + + bar = pci_conf_read32(pf_pdev->sbdf, idx); + + rc = pci_size_mem_bar(pf_pdev->sbdf, idx, &addr, &size, + PCI_BAR_VF | + ((i == PCI_SRIOV_NUM_BARS - 1) ? PCI_BAR_LAST + : 0)); + + /* + * Update vf_rlen on the PF. According to the spec the size of + * the BARs can change if the system page size register is + * modified, so always update rlen when enabling VFs. + */ + pf_pdev->physfn.vf_rlen[i] = size; + + if ( !size ) + { + bars[i].type = VPCI_BAR_EMPTY; + continue; + } + + bars[i].addr = addr; + bars[i].guest_addr = addr; + bars[i].size = size; + bars[i].prefetchable = bar & PCI_BASE_ADDRESS_MEM_PREFETCH; + + switch ( rc ) + { + case 1: + bars[i].type = VPCI_BAR_MEM32; + break; + + case 2: + bars[i].type = VPCI_BAR_MEM64_LO; + bars[i + 1].type = VPCI_BAR_MEM64_HI; + break; + + default: + ASSERT_UNREACHABLE(); + } + } + + rc = rc > 0 ? 
0 : rc; + + return rc; +} + +struct callback_data { + const struct pci_dev *pdev; + unsigned int pos; + uint32_t value; + bool enable : 1; + bool disable : 1; + bool map : 1; + bool unmap : 1; +}; + +static void cf_check control_write_cb(void *data) +{ + struct callback_data *cb = data; + const struct pci_dev *pdev = cb->pdev; + uint16_t offset = pci_conf_read16(pdev->sbdf, cb->pos + PCI_SRIOV_VF_OFFSET); + uint16_t stride = pci_conf_read16(pdev->sbdf, cb->pos + PCI_SRIOV_VF_STRIDE); + struct vpci_sriov *sriov = pdev->vpci->sriov; + int rc = 0; + unsigned int i; + + if ( cb->unmap ) + { + write_lock(&pdev->domain->pci_lock); + map_vfs(pdev, 0); + write_unlock(&pdev->domain->pci_lock); + } + + if ( cb->enable || cb->disable ) + { + for ( i = 0; i < sriov->num_vfs; i++ ) + { + const pci_sbdf_t vf_sbdf = { + .sbdf = pdev->sbdf.sbdf + offset + stride * i, + }; + + if ( cb->enable ) + { + const struct pci_dev_info info = { + .is_virtfn = true, + .is_extfn = false, + .physfn.bus = pdev->sbdf.bus, + .physfn.devfn = pdev->sbdf.devfn, + }; + rc = pci_add_device(vf_sbdf.seg, vf_sbdf.bus, vf_sbdf.devfn, + &info, pdev->node); + } + if ( cb->disable ) + rc = pci_remove_device(vf_sbdf.seg, vf_sbdf.bus, vf_sbdf.devfn); + + if ( rc && rc != -ENODEV) + gprintk(XENLOG_ERR, "failed to %s VF %pp: %d\n", + cb->enable ? 
"add" : "remove", &vf_sbdf, rc); + } + } + + if ( cb->map ) + { + write_lock(&pdev->domain->pci_lock); + rc = map_vfs(pdev, PCI_COMMAND_MEMORY); + + if ( rc ) + map_vfs(pdev, 0); + write_unlock(&pdev->domain->pci_lock); + } + + pci_conf_write16(pdev->sbdf, cb->pos + PCI_SRIOV_CTRL, cb->value); + xfree(cb); +} + +static void cf_check control_write(const struct pci_dev *pdev, unsigned int reg, + uint32_t val, void *data) +{ + unsigned int sriov_pos = reg - PCI_SRIOV_CTRL; + struct vpci_sriov *sriov = pdev->vpci->sriov; + struct callback_data *cb = NULL; + uint16_t control = pci_conf_read16(pdev->sbdf, reg); + bool mem_enabled = control & PCI_SRIOV_CTRL_MSE; + bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE; + bool enabled = control & PCI_SRIOV_CTRL_VFE; + bool new_enabled = val & PCI_SRIOV_CTRL_VFE; + + ASSERT(!pdev->info.is_virtfn); + + if ( new_enabled == enabled && new_mem_enabled == mem_enabled ) + { + pci_conf_write16(pdev->sbdf, reg, val); + return; + } + + cb = xzalloc(struct callback_data); + + if ( !cb ) + { + gprintk(XENLOG_ERR, + "%pp: Unable to allocate memory for SR-IOV enable\n", + pdev); + return; + } + + cb->pdev = pdev; + cb->pos = sriov_pos; + cb->value = val; + cb->map = new_mem_enabled && !mem_enabled; + cb->unmap = !new_mem_enabled && mem_enabled; + cb->enable = new_enabled && !enabled; + cb->disable = !new_enabled && enabled; + + current->vpci.task = WAIT; + current->vpci.wait.callback = control_write_cb; + current->vpci.wait.data = cb; + current->vpci.wait.end = NOW(); + + if ( cb->enable ) + { + size_vf_bars((struct pci_dev *)pdev, sriov_pos); + + /* + * Only update the number of active VFs when enabling, when + * disabling use the cached value in order to always remove the same + * number of VFs that were active. + */ + sriov->num_vfs = pci_conf_read16(pdev->sbdf, + sriov_pos + PCI_SRIOV_NUM_VF); + /* + * NB: VFE needs to be enabled before calling pci_add_device so Xen + * can access the config space of VFs. 
FIXME casting away const-ness + * to modify vf_rlen + */ + pci_conf_write16(pdev->sbdf, reg, control | PCI_SRIOV_CTRL_VFE); + /* + * The spec states that the software must wait at least 100ms before + * attempting to access VF registers when enabling virtual functions + * on the PF. + */ + + current->vpci.wait.end = NOW() + MILLISECS(100); + } +} + +int vf_init_header(struct pci_dev *vf_pdev) +{ + const struct pci_dev *pf_pdev; + unsigned int sriov_pos; + int rc = 0; + uint16_t ctrl; + + ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock)); + + if ( !vf_pdev->info.is_virtfn ) + return 0; + + pf_pdev = vf_pdev->pf_pdev; + ASSERT(pf_pdev); + + rc = vf_init_bars(vf_pdev); + if ( rc ) + return rc; + + sriov_pos = pci_find_ext_capability(pf_pdev, PCI_EXT_CAP_ID_SRIOV); + ctrl = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_CTRL); + + if ( (pf_pdev->domain == vf_pdev->domain) && (ctrl & PCI_SRIOV_CTRL_MSE) ) + { + rc = vpci_modify_bars(vf_pdev, PCI_COMMAND_MEMORY, false, false); + if ( rc ) + return rc; + } + + return rc; +} + +static int cf_check init_sriov(struct pci_dev *pdev) +{ + unsigned int pos; + + ASSERT(!pdev->info.is_virtfn); + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + + if ( !pos ) + return 0; + + if ( xsm_resource_setup_pci(XSM_PRIV, pdev->sbdf.bdf) ) + { + printk(XENLOG_ERR + "%pp: SR-IOV configuration unsupported for unpriv %pd\n", + &pdev->sbdf, pdev->domain); + return 0; + } + + pdev->vpci->sriov = xzalloc(struct vpci_sriov); + if ( !pdev->vpci->sriov ) + return -ENOMEM; + + return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write, + pos + PCI_SRIOV_CTRL, 2, NULL); +} + +static int cf_check cleanup_sriov(const struct pci_dev *pdev, bool hide) +{ + if ( hide ) + return 0; + + XFREE(pdev->vpci->sriov); + + return 0; +} + +REGISTER_VPCI_EXTCAP(SRIOV, init_sriov, cleanup_sriov); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ 
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h index 47cdb54d42..ae5f3b7274 100644 --- a/xen/include/xen/vpci.h +++ b/xen/include/xen/vpci.h @@ -45,6 +45,7 @@ typedef struct { REGISTER_VPCI_CAPABILITY(PCI_EXT_CAP_ID_##name, name, finit, fclean, true) int __must_check vpci_init_header(struct pci_dev *pdev); +int __must_check vf_init_header(struct pci_dev *pdev); /* Assign vPCI to device by adding handlers. */ int __must_check vpci_assign_device(struct pci_dev *pdev); @@ -146,7 +147,6 @@ struct vpci { * upon to know whether BARs are mapped into the guest p2m. */ bool bars_mapped : 1; - /* FIXME: currently there's no support for SR-IOV. */ } header; /* MSI data. */ @@ -200,6 +200,13 @@ struct vpci { struct vpci_arch_msix_entry arch; } entries[]; } *msix; + + struct vpci_sriov { + /* PF only */ + struct vpci_bar vf_bars[PCI_SRIOV_NUM_BARS]; + uint16_t num_vfs; + } *sriov; + #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT /* Guest SBDF of the device. */ #define INVALID_GUEST_SBDF ((pci_sbdf_t){ .sbdf = ~0U }) @@ -323,7 +330,8 @@ bool vpci_ecam_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int len, unsigned long *data); /* Map/unmap the BARs of a vPCI device. */ -int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only); +int vpci_modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only, + bool no_defer); #endif /* __XEN__ */ -- 2.51.2
