On 7/25/25 20:39, Teddy Astie wrote:
> Le 25/07/2025 à 16:26, Mykyta Poturai a écrit :
>> From: Stewart Hildebrand <[email protected]>
>>
>> This code is expected to only be used by privileged domains,
>> unprivileged domains should not get access to the SR-IOV capability.
>>
>> Implement RW handlers for PCI_SRIOV_CTRL register to dynamically
>> map/unmap VF BARS. Recalculate BAR sizes before mapping VFs to account
>> for possible changes in the system page size register.
>>
>> Relies on dom0 to enable SR-IOV and PHYSDEVOP to inform Xen about
>> addition/removal of VFs.
>>
>> Signed-off-by: Stewart Hildebrand <[email protected]>
>> Signed-off-by: Mykyta Poturai <[email protected]>
>> ---
>> CHANGELOG.md | 3 +-
>> SUPPORT.md | 2 -
>> xen/drivers/vpci/Makefile | 2 +-
>> xen/drivers/vpci/header.c | 3 +
>> xen/drivers/vpci/sriov.c | 235 ++++++++++++++++++++++++++++++++++++++
>> xen/drivers/vpci/vpci.c | 1 +
>> xen/include/xen/vpci.h | 7 +-
>> 7 files changed, 247 insertions(+), 6 deletions(-)
>> create mode 100644 xen/drivers/vpci/sriov.c
>>
>> diff --git a/CHANGELOG.md b/CHANGELOG.md
>> index 5f31ca08fe..7b0e8beb76 100644
>> --- a/CHANGELOG.md
>> +++ b/CHANGELOG.md
>> @@ -23,8 +23,7 @@ The format is based on [Keep a
>> Changelog](https://keepachangelog.com/en/1.0.0/
>> - On x86:
>> - Option to attempt to fixup p2m page-faults on PVH dom0.
>> - Resizable BARs is supported for PVH dom0.
>> - - Support PCI passthrough for HVM domUs when dom0 is PVH (note SR-IOV
>> - capability usage is not yet supported on PVH dom0).
>> + - Support PCI passthrough for HVM domUs when dom0 is PVH.
>> - Smoke tests for the FreeBSD Xen builds in Cirrus CI.
>>
>> - On Arm:
>> diff --git a/SUPPORT.md b/SUPPORT.md
>> index 6a82a92189..830b598714 100644
>> --- a/SUPPORT.md
>> +++ b/SUPPORT.md
>> @@ -170,8 +170,6 @@ unexpected behavior or issues on some hardware.
>>
>> At least the following features are missing on a PVH dom0:
>>
>> - * PCI SR-IOV.
>> -
>> * Native NMI forwarding (nmi=dom0 command line option).
>>
>> * MCE handling.
>
> I would prefer changelog/support changes to be a separate patch or
> alternatively at the last patch as I don't think SR-IOV is fully working
> without the 2 remaining ones.
>
>> diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
>> index a7c8a30a89..fe1e57b64d 100644
>> --- a/xen/drivers/vpci/Makefile
>> +++ b/xen/drivers/vpci/Makefile
>> @@ -1,2 +1,2 @@
>> -obj-y += vpci.o header.o rebar.o
>> +obj-y += vpci.o header.o rebar.o sriov.o
>> obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
>> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
>> index f947f652cd..0a840c6dcc 100644
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -839,6 +839,9 @@ static int cf_check init_header(struct pci_dev *pdev)
>>
>> ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
>>
>> + if ( pdev->info.is_virtfn )
>> + return 0;
>> +
>> switch ( pci_conf_read8(pdev->sbdf, PCI_HEADER_TYPE) & 0x7f )
>> {
>> case PCI_HEADER_TYPE_NORMAL:
>> diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
>> new file mode 100644
>> index 0000000000..640430e3e9
>> --- /dev/null
>> +++ b/xen/drivers/vpci/sriov.c
>> @@ -0,0 +1,235 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Handlers for accesses to the SR-IOV capability structure.
>> + *
>> + * Copyright (C) 2018 Citrix Systems R&D
>> + */
>> +
>> +#include <xen/sched.h>
>> +#include <xen/vpci.h>
>> +
>> +static int vf_init_bars(const struct pci_dev *vf_pdev)
>> +{
>> + unsigned int i, sriov_pos;
>> + int vf_idx, rc;
>> + const struct pci_dev *pf_pdev = vf_pdev->pf_pdev;
>> + uint16_t offset, stride;
>> + struct vpci_bar *bars = vf_pdev->vpci->header.bars;
>> + struct vpci_bar *physfn_vf_bars = pf_pdev->vpci->sriov->vf_bars;
>> +
>> + sriov_pos = pci_find_ext_capability(pf_pdev->sbdf,
>> PCI_EXT_CAP_ID_SRIOV);
>> + offset = pci_conf_read16(pf_pdev->sbdf, sriov_pos +
>> PCI_SRIOV_VF_OFFSET);
>> + stride = pci_conf_read16(pf_pdev->sbdf, sriov_pos +
>> PCI_SRIOV_VF_STRIDE);
>> +
>> + vf_idx = vf_pdev->sbdf.sbdf;
>> + vf_idx -= pf_pdev->sbdf.sbdf + offset;
>
> We can probably rewrite it as
>
> vf_idx = vf_pdev->sbdf.sbdf - (pf_pdev->sbdf.sbdf + offset);
>
> especially with sbdf being potentially not representable as a int (even
> though very unlikely).
>
>> + if ( vf_idx < 0 )
>> + return -EINVAL;
>> + if ( stride )
>> + {
>> + if ( vf_idx % stride )
>> + return -EINVAL;
>> + vf_idx /= stride;
>> + }
>> +
>> + /*
>> + * Set up BARs for this VF out of PF's VF BARs taking into account
>> + * the index of the VF.
>> + */
>> + for ( i = 0; i < PCI_SRIOV_NUM_BARS; i++ )
>> + {
>> + bars[i].addr = physfn_vf_bars[i].addr + vf_idx *
>> physfn_vf_bars[i].size;
>> + bars[i].guest_addr = bars[i].addr;
>> + bars[i].size = physfn_vf_bars[i].size;
>> + bars[i].type = physfn_vf_bars[i].type;
>> + bars[i].prefetchable = physfn_vf_bars[i].prefetchable;
>> + rc = vpci_bar_add_rangeset(vf_pdev, &bars[i], i);
>> + if ( rc )
>> + return rc;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int map_vf(const struct pci_dev *vf_pdev, uint16_t cmd)
>> +{
>> + int rc;
>> +
>> + ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
>> +
>> + rc = vf_init_bars(vf_pdev);
>> + if ( rc )
>> + return rc;
>> +
>> + return vpci_modify_bars(vf_pdev, cmd, false);
>> +}
>> +
>> +static int size_vf_bars(struct pci_dev *pf_pdev, unsigned int sriov_pos)
>> +{
>> + /*
>> + * NB: a non-const pci_dev of the PF is needed in order to update
>> + * vf_rlen.
>> + */
>> + struct vpci_bar *bars;
>> + unsigned int i;
>> + int rc = 0;
>> +
>> + ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
>> + ASSERT(!pf_pdev->info.is_virtfn);
>> +
>> + if ( !pf_pdev->vpci->sriov )
>> + return -EINVAL;
>> +
>> + /* Read BARs for VFs out of PF's SR-IOV extended capability. */
>> + bars = pf_pdev->vpci->sriov->vf_bars;
>> + /* Set the BARs addresses and size. */
>> + for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
>> + {
>> + unsigned int idx = sriov_pos + PCI_SRIOV_BAR + i * 4;
>> + uint32_t bar;
>> + uint64_t addr, size;
>> +
>> + bar = pci_conf_read32(pf_pdev->sbdf, idx);
>> +
>> + rc = pci_size_mem_bar(pf_pdev->sbdf, idx, &addr, &size,
>> + PCI_BAR_VF |
>> + ((i == PCI_SRIOV_NUM_BARS - 1) ? PCI_BAR_LAST
>> + : 0));
>> +
>> + /*
>> + * Update vf_rlen on the PF. According to the spec the size of
>> + * the BARs can change if the system page size register is
>> + * modified, so always update rlen when enabling VFs.
>> + */
>> + pf_pdev->physfn.vf_rlen[i] = size;
>> +
>> + if ( !size )
>> + {
>> + bars[i].type = VPCI_BAR_EMPTY;
>> + continue;
>> + }
>> +
>> + bars[i].addr = addr;
>> + bars[i].guest_addr = addr;
>> + bars[i].size = size;
>> + bars[i].prefetchable = bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
>> +
>> + switch ( rc )
>> + {
>> + case 1:
>> + bars[i].type = VPCI_BAR_MEM32;
>> + break;
>> +
>> + case 2:
>> + bars[i].type = VPCI_BAR_MEM64_LO;
>> + bars[i + 1].type = VPCI_BAR_MEM64_HI;
>> + break;
>> +
>> + default:
>> + ASSERT_UNREACHABLE();
>> + }
>> + }
>> +
>> + rc = rc > 0 ? 0 : rc;
>> +
>> + return rc;
>> +}
>> +
>> +static void cf_check control_write(const struct pci_dev *pdev, unsigned int
>> reg,
>> + uint32_t val, void *data)
>> +{
>> + unsigned int sriov_pos = reg - PCI_SRIOV_CTRL;
>> + uint16_t control = pci_conf_read16(pdev->sbdf, reg);
>> + bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
>> + bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
>> +
>> + ASSERT(!pdev->info.is_virtfn);
>> +
>> + if ( new_mem_enabled != mem_enabled )
>> + {
>> + struct pci_dev *vf_pdev;
>> + if ( new_mem_enabled )
>> + {
>> + /* FIXME casting away const-ness to modify vf_rlen */
>> + size_vf_bars((struct pci_dev *)pdev, sriov_pos);
>> +
>> + list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
>> + map_vf(vf_pdev, PCI_COMMAND_MEMORY);
>> + }
>> + else
>> + {
>> + list_for_each_entry(vf_pdev, &pdev->vf_list, vf_list)
>> + map_vf(vf_pdev, 0);
>> + }
>> + }
>> +
>> + pci_conf_write16(pdev->sbdf, reg, val);
>> +}
>> +
>> +static int vf_init_header(struct pci_dev *vf_pdev)
>> +{
>> + const struct pci_dev *pf_pdev;
>> + unsigned int sriov_pos;
>> + int rc = 0;
>> + uint16_t ctrl;
>> +
>> + ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
>> +
>> + if ( !vf_pdev->info.is_virtfn )
>> + return 0;
>> +
>> + pf_pdev = vf_pdev->pf_pdev;
>> + ASSERT(pf_pdev);
>> +
>> + sriov_pos = pci_find_ext_capability(pf_pdev->sbdf,
>> PCI_EXT_CAP_ID_SRIOV);
>> + ctrl = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_CTRL);
>> +
>> + if ( (pf_pdev->domain == vf_pdev->domain) && (ctrl &
>> PCI_SRIOV_CTRL_MSE) )
>> + {
>> + rc = map_vf(vf_pdev, PCI_COMMAND_MEMORY);
>> + if ( rc )
>> + return rc;
>> + }
>> +
>> + return rc;
>> +}
>> +
>> +static int init_sriov(struct pci_dev *pdev)
>> +{
>> + unsigned int pos;
>> +
>> + if ( pdev->info.is_virtfn )
>> + return vf_init_header(pdev);
>> +
>> + pos = pci_find_ext_capability(pdev->sbdf, PCI_EXT_CAP_ID_SRIOV);
>> +
>> + if ( !pos )
>> + return 0;
>> +
>> + if ( !is_hardware_domain(pdev->domain) )
>> + {
>> + printk(XENLOG_ERR
>> + "%pp: SR-IOV configuration unsupported for unpriv %pd\n",
>> + &pdev->sbdf, pdev->domain);
>> + return 0;
>> + }
>
> Should it instead rely on xsm to know if it the operation is allowed or
> not for this domain ?
>
Hi,
Do you have a suggestion on which hook should be used here? Semantically
both resource_plug_pci and resource_setup_pci make sense here but I am
not sure which one to choose as I am not very familiar with XSM.
>> +
>> + pdev->vpci->sriov = xzalloc(struct vpci_sriov);
>> + if ( !pdev->vpci->sriov )
>> + return -ENOMEM;
>> +
>> + return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
>> + pos + PCI_SRIOV_CTRL, 2, NULL);
>> +}
>> +
>> +REGISTER_VPCI_INIT(init_sriov, VPCI_PRIORITY_LOW);
>> +
>> +/*
>> + * Local variables:
>> + * mode: C
>> + * c-file-style: "BSD"
>> + * c-basic-offset: 4
>> + * tab-width: 4
>> + * indent-tabs-mode: nil
>> + * End:
>> + */
>> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
>> index 09988f04c2..7af6651831 100644
>> --- a/xen/drivers/vpci/vpci.c
>> +++ b/xen/drivers/vpci/vpci.c
>> @@ -120,6 +120,7 @@ void vpci_deassign_device(struct pci_dev *pdev)
>> for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
>> rangeset_destroy(pdev->vpci->header.bars[i].mem);
>>
>> + xfree(pdev->vpci->sriov);
>> xfree(pdev->vpci->msix);
>> xfree(pdev->vpci->msi);
>> xfree(pdev->vpci);
>> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
>> index 06f7039f20..9e8dcab17e 100644
>> --- a/xen/include/xen/vpci.h
>> +++ b/xen/include/xen/vpci.h
>> @@ -138,7 +138,6 @@ struct vpci {
>> * upon to know whether BARs are mapped into the guest p2m.
>> */
>> bool bars_mapped : 1;
>> - /* FIXME: currently there's no support for SR-IOV. */
>> } header;
>>
>> /* MSI data. */
>> @@ -192,6 +191,12 @@ struct vpci {
>> struct vpci_arch_msix_entry arch;
>> } entries[];
>> } *msix;
>> +
>> + struct vpci_sriov {
>> + /* PF only */
>> + struct vpci_bar vf_bars[PCI_SRIOV_NUM_BARS];
>> + } *sriov;
>> +
>> #ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
>> /* Guest SBDF of the device. */
>> #define INVALID_GUEST_SBDF ((pci_sbdf_t){ .sbdf = ~0U })
>
>
>
> Teddy Astie | Vates XCP-ng Developer
>
> XCP-ng & Xen Orchestra - Vates solutions
>
> web: https://vates.tech/
>
--
Mykyta