On 11/18/25 5:15 AM, Sairaj Kodilkar wrote:
> AMD IOMMU supports up to 2048 MSIs for a single device function
> when NUM_INT_REMAP_SUP Extended-Feature-Register-2 bit is set to one.
> Software can enable this feature by writing one to NUM_INT_REMAP_MODE
> in the control register. MSI address destination mode (DM) bit decides
> how many MSI data bits are used by IOMMU to index into IRT. When DM = 0,
> IOMMU uses bits 8:0 (max 512) for the index, otherwise (DM = 1)
> IOMMU uses bits 10:0 (max 2048) for IRT index.
> 
> This feature can be enabled with flag `numint2k=on`. In case of
> passthrough devices, the vIOMMU uses the control register provided by vendor
> capabilities to determine if host IOMMU has enabled 2048 MSIs. If host
> IOMMU has not enabled it then the guest feature is disabled.
> 
> example command line
> '''
> -object iommufd,id=fd0 \
> -device amd_iommu,dma-remap=on,numint2k=on \
> -device vfio-host,host=<DEVID>,iommufd=fd0 \
> '''
> 
> NOTE: In case of legacy VFIO container the guest will always fall back
> to 512 MSIs.
> 
> Signed-off-by: Sairaj Kodilkar <[email protected]>
> ---
>  hw/i386/amd_iommu.c | 74 ++++++++++++++++++++++++++++++++++++++++-----
>  hw/i386/amd_iommu.h | 12 ++++++++
>  2 files changed, 79 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
> index 3221bf5a0303..4f62c4ee3671 100644
> --- a/hw/i386/amd_iommu.c
> +++ b/hw/i386/amd_iommu.c
> @@ -116,7 +116,12 @@ uint64_t amdvi_extended_feature_register(AMDVIState *s)
>  
>  uint64_t amdvi_extended_feature_register2(AMDVIState *s)
>  {
> -    return AMDVI_DEFAULT_EXT_FEATURES2;
> +    uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES2;
> +    if (s->num_int_sup_2k) {
> +        feature |= AMDVI_FEATURE_NUM_INT_REMAP_SUP;
> +    }
> +
> +    return feature;
>  }
>  
>  /* configure MMIO registers at startup/reset */
> @@ -1538,6 +1543,9 @@ static void amdvi_handle_control_write(AMDVIState *s)
>                          AMDVI_MMIO_CONTROL_CMDBUFLEN);
>      s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);
>  
> +    s->num_int_enabled = (control >> AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT) 
> &
> +                         AMDVI_MMIO_CONTROL_NUM_INT_REMAP_MASK;
> +
>      /* update the flags depending on the control register */
>      if (s->cmdbuf_enabled) {
>          amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
> @@ -2119,6 +2127,25 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
>       * (page 5)
>       */
>      delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
> +    /*
> +     * The MSI address register bit[2] is used to get the destination
> +     * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
> +     * and when IOMMU supports upto 2048 interrupts.
> +     */
> +    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
> +
> +    if (dest_mode &&
> +        iommu->num_int_enabled == AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K) {
> +
> +        trace_amdvi_ir_delivery_mode("2K interrupt mode");
> +        ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, 
> sid);
> +        if (ret < 0) {
> +            goto remap_fail;
> +        }
> +        /* Translate IRQ to MSI messages */
> +        x86_iommu_irq_to_msi_message(&irq, translated);
> +        goto out;
> +    }
>  
>      switch (delivery_mode) {
>      case AMDVI_IOAPIC_INT_TYPE_FIXED:
> @@ -2159,12 +2186,6 @@ static int amdvi_int_remap_msi(AMDVIState *iommu,
>          goto remap_fail;
>      }
>  
> -    /*
> -     * The MSI address register bit[2] is used to get the destination
> -     * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
> -     * only.
> -     */
> -    dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
>      if (dest_mode) {
>          trace_amdvi_ir_err("invalid dest_mode");
>          ret = -AMDVI_IR_ERR;
> @@ -2322,6 +2343,30 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, 
> void *opaque, int devfn)
>      return &iommu_as[devfn]->as;
>  }
>  
> +static void amdvi_refresh_efrs_hwinfo(struct AMDVIState *s,
> +                                      struct iommu_hw_info_amd *hwinfo)
> +{
> +    /* Check if host OS has enabled 2K interrupts */
> +    bool hwinfo_ctrl_2k;
> +
> +    if (s->num_int_sup_2k && !hwinfo) {
> +        warn_report("AMDVI: Disabling 2048 MSI for guest, "
> +                    "use IOMMUFD for device passthrough to support it");
> +        s->num_int_sup_2k = 0;
> +    }
> +
> +    hwinfo_ctrl_2k = ((hwinfo->control_register

We need to check that hwinfo is a valid pointer before attempting to access
any of its fields. The !hwinfo check earlier in this function only warns and
clears num_int_sup_2k; it does not return, so the line above still
dereferences a NULL hwinfo. This causes a segfault in the common case where
we are just using the default VFIO legacy backend and no new options.
Even when trying to use the new feature (numint2k=on) and the iommufd backend
in QEMU, if the host kernel was built with CONFIG_AMD_IOMMU_IOMMUFD=n
(which is currently the default), the ioctl IOMMU_GET_HW_INFO will always
return NULL data, so hwinfo is also NULL at this point and we crash and burn.

> +                       >> AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT)
> +                      & AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K);
> +
> +    if (s->num_int_sup_2k && !hwinfo_ctrl_2k) {
> +        warn_report("AMDVI: Disabling 2048 MSIs for guest, "
> +                    "as host kernel does not support this feature");
> +        s->num_int_sup_2k = 0;
> +    }
> +
> +    amdvi_refresh_efrs(s);
> +}
>  
>  static bool amdvi_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
>                                     HostIOMMUDevice *hiod, Error **errp)
> @@ -2354,6 +2399,20 @@ static bool amdvi_set_iommu_device(PCIBus *bus, void 
> *opaque, int devfn,
>      object_ref(hiod);
>      g_hash_table_insert(s->hiod_hash, new_key, hiod);
>  
> +    if (hiod->caps.type == IOMMU_HW_INFO_TYPE_AMD) {
> +        /*
> +         * Refresh the MMIO efr registers so that changes are visible to the
> +         * guest.
> +         */
> +        amdvi_refresh_efrs_hwinfo(s, &hiod->caps.vendor_caps.amd);
> +    } else {
> +        /*
> +         * Pass NULL hardware registers when we have non-IOMMUFD
> +         * passthrough device
> +         */
> +        amdvi_refresh_efrs_hwinfo(s, NULL);

This call with hwinfo = NULL causes a segfault as I mentioned above. The
code in amdvi_refresh_efrs_hwinfo() needs to be hardened.

Thank you,
Alejandro

> +    }
> +
>      return true;
>  }
>  
> @@ -2641,6 +2700,7 @@ static const Property amdvi_properties[] = {
>      DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
>      DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id),
>      DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false),
> +    DEFINE_PROP_BOOL("numint2k", AMDVIState, num_int_sup_2k, false),
>  };
>  
>  static const VMStateDescription vmstate_amdvi_sysbus = {
> diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
> index c8eaf229b50e..588725fe0c25 100644
> --- a/hw/i386/amd_iommu.h
> +++ b/hw/i386/amd_iommu.h
> @@ -107,6 +107,9 @@
>  #define AMDVI_MMIO_CONTROL_COMWAITINTEN   (1ULL << 4)
>  #define AMDVI_MMIO_CONTROL_CMDBUFLEN      (1ULL << 12)
>  #define AMDVI_MMIO_CONTROL_GAEN           (1ULL << 17)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_MASK        (0x3)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_SHIFT       (43)
> +#define AMDVI_MMIO_CONTROL_NUM_INT_REMAP_2K          (0x1)
>  
>  /* MMIO status register bits */
>  #define AMDVI_MMIO_STATUS_CMDBUF_RUN  (1 << 4)
> @@ -160,6 +163,7 @@
>  #define AMDVI_PERM_READ             (1 << 0)
>  #define AMDVI_PERM_WRITE            (1 << 1)
>  
> +/* EFR */
>  #define AMDVI_FEATURE_PREFETCH            (1ULL << 0) /* page prefetch       
> */
>  #define AMDVI_FEATURE_PPR                 (1ULL << 1) /* PPR Support         
> */
>  #define AMDVI_FEATURE_XT                  (1ULL << 2) /* x2APIC Support      
> */
> @@ -169,6 +173,9 @@
>  #define AMDVI_FEATURE_HE                  (1ULL << 8) /* hardware error regs 
> */
>  #define AMDVI_FEATURE_PC                  (1ULL << 9) /* Perf counters       
> */
>  
> +/* EFR2 */
> +#define AMDVI_FEATURE_NUM_INT_REMAP_SUP   (1ULL << 8) /* 2K int support      
> */
> +
>  /* reserved DTE bits */
>  #define AMDVI_DTE_QUAD0_RESERVED        (GENMASK64(6, 2) | GENMASK64(63, 63))
>  #define AMDVI_DTE_QUAD1_RESERVED        0
> @@ -380,6 +387,8 @@ struct AMDVIState {
>      bool evtlog_enabled;         /* event log enabled            */
>      bool excl_enabled;
>  
> +    uint8_t num_int_enabled;
> +
>      hwaddr devtab;               /* base address device table    */
>      uint64_t devtab_len;         /* device table length          */
>  
> @@ -433,6 +442,9 @@ struct AMDVIState {
>  
>      /* DMA address translation */
>      bool dma_remap;
> +
> +    /* upto 2048 interrupt support */
> +    bool num_int_sup_2k;
>  };
>  
>  uint64_t amdvi_extended_feature_register(AMDVIState *s);


Reply via email to