On 11/18/25 5:15 AM, Sairaj Kodilkar wrote:
> AMD IOMMU supports up to 2048 MSIs for a single device function
> when NUM_INT_REMAP_SUP Extended-Feature-Register-2 bit is set to one.
> Software can enable this feature by writing one to NUM_INT_REMAP_MODE
> in the control register. MSI address destination mode (DM) bit decides
> how many MSI data bits are used by IOMMU to index into IRT. When DM = 0,
> IOMMU uses bits 8:0 (max 512) for the index, otherwise (DM = 1)
> IOMMU uses bits 10:0 (max 2048) for IRT index.
>
> This feature can be enabled with flag `numint2k=on`. In case of
> passthrough devices viommu uses control register provided by vendor
> capabilities to determine if host IOMMU has enabled 2048 MSIs. If host
> IOMMU has not enabled it then the guest feature is disabled.
>
> example command line
> '''
> -object iommufd,id=fd0 \
> -device amd_iommu,dma-remap=on,numint2k=on \
> -device vfio-pci,host=<DEVID>,iommufd=fd0 \
> '''
>
> NOTE: In case of legacy VFIO container the guest will always fall back
> to 512 MSIs.
>
> Signed-off-by: Sairaj Kodilkar <[email protected]>
> ---
> hw/i386/amd_iommu.c | 74 ++++++++++++++++++++++++++++++++++++++++-----
> hw/i386/amd_iommu.h | 12 ++++++++
> 2 files changed, 79 insertions(+), 7 deletions(-)
>
> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
> index 3221bf5a0303..4f62c4ee3671 100644
> --- a/hw/i386/amd_iommu.c
> +++ b/hw/i386/amd_iommu.c
> @@ -116,7 +116,12 @@ uint64_t amdvi_extended_feature_register(AMDVIState *s)
>
> uint64_t amdvi_extended_feature_register2(AMDVIState *s)
> {
> - return AMDVI_DEFAULT_EXT_FEATURES2;
> + uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES2;
> + if (s->num_int_sup_2k) {
> + feature |= AMDVI_FEATURE_NUM_INT_REMAP_SUP;
> + }
> +
> + return feature;
> }
>
> /* configure MMIO registers at startup/reset */
> @@ -1538,6 +1543,9 @@ static void amdvi_handle_control_write(AMDVIState *s)
> AMDVI_MMIO_CONTROL_CMDBUFLEN);
> s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);
>
> + s->num_int_enabled = (control >> AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT)
> &
> + AMDVI_MMIO_CONTROL_NUM_INT_REMAP_MASK;
> +
> /* update the flags depending on the control register */
> if (s->cmdbuf_enabled) {
> amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
> @@ -2119,6 +2127,25 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
> * (page 5)
> */
> delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
> + /*
> + * The MSI address register bit[2] is used to get the destination
> + * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
> + * and when IOMMU supports upto 2048 interrupts.
> + */
> + dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
> +
> + if (dest_mode &&
> + iommu->num_int_enabled == AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K) {
> +
> + trace_amdvi_ir_delivery_mode("2K interrupt mode");
> + ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq,
> sid);
> + if (ret < 0) {
> + goto remap_fail;
> + }
> + /* Translate IRQ to MSI messages */
> + x86_iommu_irq_to_msi_message(&irq, translated);
> + goto out;
> + }
>
> switch (delivery_mode) {
> case AMDVI_IOAPIC_INT_TYPE_FIXED:
> @@ -2159,12 +2186,6 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
> goto remap_fail;
> }
>
> - /*
> - * The MSI address register bit[2] is used to get the destination
> - * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
> - * only.
> - */
> - dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
> if (dest_mode) {
> trace_amdvi_ir_err("invalid dest_mode");
> ret = -AMDVI_IR_ERR;
> @@ -2322,6 +2343,30 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus,
> void *opaque, int devfn)
> return &iommu_as[devfn]->as;
> }
>
> +static void amdvi_refresh_efrs_hwinfo(struct AMDVIState *s,
> + struct iommu_hw_info_amd *hwinfo)
> +{
> + /* Check if host OS has enabled 2K interrupts */
> + bool hwinfo_ctrl_2k;
> +
> + if (s->num_int_sup_2k && !hwinfo) {
> + warn_report("AMDVI: Disabling 2048 MSI for guest, "
> + "use IOMMUFD for device passthrough to support it");
> + s->num_int_sup_2k = 0;
> + }
> +
> + hwinfo_ctrl_2k = ((hwinfo->control_register
We need to check that hwinfo is a valid pointer before attempting to access
any of its fields. The code in the line above causes a segfault in the
common case where we are just using the default VFIO legacy backend and no
new options.
Even when trying to use the new feature (numint2k=on) and iommufd backend
in QEMU, if the host kernel was built with CONFIG_AMD_IOMMU_IOMMUFD=n
(which is currently the default), the ioctl IOMMU_GET_HW_INFO will always
return NULL data and hwinfo is also NULL at this point, so we crash and burn.
> + >> AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT)
> + & AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K);
> +
> + if (s->num_int_sup_2k && !hwinfo_ctrl_2k) {
> + warn_report("AMDVI: Disabling 2048 MSIs for guest, "
> + "as host kernel does not support this feature");
> + s->num_int_sup_2k = 0;
> + }
> +
> + amdvi_refresh_efrs(s);
> +}
>
> static bool amdvi_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
> HostIOMMUDevice *hiod, Error **errp)
> @@ -2354,6 +2399,20 @@ static bool amdvi_set_iommu_device(PCIBus *bus, void
> *opaque, int devfn,
> object_ref(hiod);
> g_hash_table_insert(s->hiod_hash, new_key, hiod);
>
> + if (hiod->caps.type == IOMMU_HW_INFO_TYPE_AMD) {
> + /*
> + * Refresh the MMIO efr registers so that changes are visible to the
> + * guest.
> + */
> + amdvi_refresh_efrs_hwinfo(s, &hiod->caps.vendor_caps.amd);
> + } else {
> + /*
> + * Pass NULL hardware registers when we have non-IOMMUFD
> + * passthrough device
> + */
> + amdvi_refresh_efrs_hwinfo(s, NULL);
This call with hwinfo = NULL causes a segfault as I mentioned above. The
code in amdvi_refresh_efrs_hwinfo() needs to be hardened.
Thank you,
Alejandro
> + }
> +
> return true;
> }
>
> @@ -2641,6 +2700,7 @@ static const Property amdvi_properties[] = {
> DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
> DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id),
> DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false),
> + DEFINE_PROP_BOOL("numint2k", AMDVIState, num_int_sup_2k, false),
> };
>
> static const VMStateDescription vmstate_amdvi_sysbus = {
> diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
> index c8eaf229b50e..588725fe0c25 100644
> --- a/hw/i386/amd_iommu.h
> +++ b/hw/i386/amd_iommu.h
> @@ -107,6 +107,9 @@
> #define AMDVI_MMIO_CONTROL_COMWAITINTEN (1ULL << 4)
> #define AMDVI_MMIO_CONTROL_CMDBUFLEN (1ULL << 12)
> #define AMDVI_MMIO_CONTROL_GAEN (1ULL << 17)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_MASK (0x3)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT (43)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K (0x1)
>
> /* MMIO status register bits */
> #define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4)
> @@ -160,6 +163,7 @@
> #define AMDVI_PERM_READ (1 << 0)
> #define AMDVI_PERM_WRITE (1 << 1)
>
> +/* EFR */
> #define AMDVI_FEATURE_PREFETCH (1ULL << 0) /* page prefetch
> */
> #define AMDVI_FEATURE_PPR (1ULL << 1) /* PPR Support
> */
> #define AMDVI_FEATURE_XT (1ULL << 2) /* x2APIC Support
> */
> @@ -169,6 +173,9 @@
> #define AMDVI_FEATURE_HE (1ULL << 8) /* hardware error regs
> */
> #define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters
> */
>
> +/* EFR2 */
> +#define AMDVI_FEATURE_NUM_INT_REMAP_SUP (1ULL << 8) /* 2K int support
> */
> +
> /* reserved DTE bits */
> #define AMDVI_DTE_QUAD0_RESERVED (GENMASK64(6, 2) | GENMASK64(63, 63))
> #define AMDVI_DTE_QUAD1_RESERVED 0
> @@ -380,6 +387,8 @@ struct AMDVIState {
> bool evtlog_enabled; /* event log enabled */
> bool excl_enabled;
>
> + uint8_t num_int_enabled;
> +
> hwaddr devtab; /* base address device table */
> uint64_t devtab_len; /* device table length */
>
> @@ -433,6 +442,9 @@ struct AMDVIState {
>
> /* DMA address translation */
> bool dma_remap;
> +
> + /* upto 2048 interrupt support */
> + bool num_int_sup_2k;
> };
>
> uint64_t amdvi_extended_feature_register(AMDVIState *s);