On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
> This captures the guest PASID table entry modifications and
> propagates the changes to host to attach a hwpt with type determined
> per guest IOMMU mdoe and PGTT configuration.
>
> When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
> page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
> vIOMMU reuse hwpt(GPA->HPA) provided by VFIO as nested parent to
> construct nested page table.
>
> When guest decides to use legacy mode then vIOMMU switches the MRs of
> the device's AS, hence the IOAS created by VFIO container would be
> switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
> switched to IOMMU MR. So it is able to support shadowing the guest IO
> page table.
>
> Co-Authored-by: Yi Liu <yi.l....@intel.com>
> Signed-off-by: Yi Liu <yi.l....@intel.com>
> Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
> ---
>  hw/i386/intel_iommu_internal.h |  11 ++
>  hw/i386/intel_iommu.c          | 244 +++++++++++++++++++++++++++++++--
>  hw/i386/trace-events           |   3 +
>  3 files changed, 243 insertions(+), 15 deletions(-)
>
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 5ed76864be..92a533db54 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -563,6 +563,13 @@ typedef struct VTDRootEntry VTDRootEntry;
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL | ~VTD_HAW_MASK(aw))
>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1      0xffffffffffe00000ULL
>  
> +typedef enum VTDPASIDOp {
> +    VTD_PASID_BIND,
> +    VTD_PASID_UPDATE,
> +    VTD_PASID_UNBIND,
> +    VTD_OP_NUM
> +} VTDPASIDOp;
> +
>  typedef enum VTDPCInvType {
>      /* Force reset all */
>      VTD_PASID_CACHE_FORCE_RESET = 0,
> @@ -607,6 +614,9 @@ typedef struct VTDPASIDCacheInfo {
>  
>  #define VTD_SM_PASID_ENTRY_FLPM          3ULL
>  #define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
> +#define VTD_SM_PASID_ENTRY_SRE_BIT(val)  (!!((val) & 1ULL))
> +#define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
> +#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
>  
>  /* First Level Paging Structure */
>  /* Masks for First Level Paging Entry */
> @@ -644,5 +654,6 @@ typedef struct VTDHostIOMMUDevice {
>      PCIBus *bus;
>      uint8_t devfn;
>      HostIOMMUDevice *hiod;
> +    uint32_t s1_hwpt;
>  } VTDHostIOMMUDevice;
>  #endif
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index be01f8885f..1c94a0033c 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -20,6 +20,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
>  #include "qemu/error-report.h"
>  #include "qemu/main-loop.h"
>  #include "qapi/error.h"
> @@ -41,6 +42,9 @@
>  #include "migration/vmstate.h"
>  #include "trace.h"
>  #include "system/iommufd.h"
> +#ifdef CONFIG_IOMMUFD
> +#include <linux/iommufd.h>
> +#endif
>  
>  /* context entry operations */
>  #define VTD_CE_GET_RID2PASID(ce) \
> @@ -839,6 +843,27 @@ static inline uint16_t vtd_pe_get_did(VTDPASIDEntry *pe)
>      return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
>  }
>  
> +static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
> +{
> +    return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
Isn'it called FSPTPTR in the spec. In the positive I would use the same
terminology.
> +}
> +
> +static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
> +{
> +    return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
here again I am bit lost as you seem to look at 3d 64b FSPM while there
is an AW field in the first 64b, please add a comment.
Also it isnot clear where this computation come from. Can you quote the
spec?
> +}
> +
> +static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
> +{
> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
> +}
> +
> +/* check if pgtt is first stage translation */
> +static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
> +{
> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
> +}
> +
>  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>  {
>      return pdire->val & 1;
> @@ -2431,6 +2456,188 @@ static void 
> vtd_context_global_invalidate(IntelIOMMUState *s)
>      vtd_iommu_replay_all(s);
>  }
>  
> +#ifdef CONFIG_IOMMUFD
> +static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
> +                                  VTDPASIDEntry *pe)
> +{
> +    memset(vtd, 0, sizeof(*vtd));
> +
> +    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
> +                                        IOMMU_VTD_S1_SRE : 0) |
> +                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe->val[2]) ?
> +                                        IOMMU_VTD_S1_WPE : 0) |
> +                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
> +                                        IOMMU_VTD_S1_EAFE : 0);
> +    vtd->addr_width = vtd_pe_get_fl_aw(pe);
> +    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
> +}
> +
> +static int vtd_create_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod,
> +                              VTDPASIDEntry *pe, Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    struct iommu_hwpt_vtd_s1 vtd;
> +    uint32_t s1_hwpt;
> +
> +    vtd_init_s1_hwpt_data(&vtd, pe);
> +
> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
> +                                    idev->hwpt_id, 0, IOMMU_HWPT_DATA_VTD_S1,
> +                                    sizeof(vtd), &vtd, &s1_hwpt, errp)) {
> +        return -EINVAL;
> +    }
> +
> +    vtd_hiod->s1_hwpt = s1_hwpt;
> +
> +    return 0;
> +}
> +
> +static void vtd_destroy_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +
> +    iommufd_backend_free_id(idev->iommufd, vtd_hiod->s1_hwpt);
> +    vtd_hiod->s1_hwpt = 0;
> +}
> +
> +static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> +                                     uint32_t pasid, VTDPASIDEntry *pe,
> +                                     Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    uint32_t hwpt_id;
> +    int ret;
> +
> +    if (vtd_pe_pgtt_is_flt(pe)) {
> +        ret = vtd_create_s1_hwpt(vtd_hiod, pe, errp);
> +        if (ret) {
> +            return ret;
> +        }
> +        hwpt_id = vtd_hiod->s1_hwpt;
> +    } else {
> +        hwpt_id = idev->hwpt_id;
> +    }
> +
> +    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id, errp);
> +    trace_vtd_device_attach_hwpt(idev->devid, pasid, hwpt_id, ret);
> +    if (ret && vtd_pe_pgtt_is_flt(pe)) {
> +        vtd_destroy_s1_hwpt(vtd_hiod);
> +    }
> +
> +    return ret;
> +}
> +
> +static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
> +                                     uint32_t pasid, VTDPASIDEntry *pe,
> +                                     Error **errp)
> +{
> +    HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    int ret;
> +
> +    if (vtd_hiod->iommu_state->dmar_enabled) {
> +        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
> +        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
> +    } else {
> +        ret = !host_iommu_device_iommufd_attach_hwpt(idev, idev->hwpt_id, 
> errp);
> +        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid, idev->hwpt_id,
> +                                           ret);
> +    }
> +
> +    if (vtd_pe_pgtt_is_flt(pe)) {
> +        vtd_destroy_s1_hwpt(vtd_hiod);
> +    }
> +
> +    return ret;
> +}
> +
> +static int vtd_device_attach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
> +                                   VTDAddressSpace *vtd_as, VTDPASIDEntry 
> *pe)
> +{
> +    /*
> +     * If pe->gptt == FLT, should be go ahead to do bind as host only
PGTT. The rest of the sentence is difficult to parse.
> +     * accepts guest FLT under nesting. If pe->pgtt==PT, should setup
> +     * the pasid with GPA page table. Otherwise should return failure.
> +     */
> +    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
> +        return -EINVAL;
> +    }
> +
> +    /* Should fail if the FLPT base is 0 */
> +    if (vtd_pe_pgtt_is_flt(pe) && !vtd_pe_get_flpt_base(pe)) {
> +        return -EINVAL;
> +    }
> +
> +    return vtd_device_attach_iommufd(vtd_hiod, vtd_as->pasid, pe, 
> &error_abort);
> +}
> +
> +static int vtd_device_detach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
> +                                   VTDAddressSpace *vtd_as)
> +{
> +    VTDPASIDEntry *cached_pe = vtd_as->pasid_cache_entry.cache_filled ?
> +                       &vtd_as->pasid_cache_entry.pasid_entry : NULL;
> +
> +    if (!cached_pe ||
> +        (!vtd_pe_pgtt_is_flt(cached_pe) && !vtd_pe_pgtt_is_pt(cached_pe))) {
> +        return 0;
> +    }
> +
> +    return vtd_device_detach_iommufd(vtd_hiod, vtd_as->pasid, cached_pe,
> +                                     &error_abort);
> +}
> +
> +/**
> + * Caller should hold iommu_lock.
> + */
> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
> +                                VTDPASIDEntry *pe, VTDPASIDOp op)
> +{
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    VTDHostIOMMUDevice *vtd_hiod;
> +    int devfn = vtd_as->devfn;
> +    int ret = -EINVAL;
> +    struct vtd_as_key key = {
> +        .bus = vtd_as->bus,
> +        .devfn = devfn,
> +    };
> +
> +    vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
> +    if (!vtd_hiod || !vtd_hiod->hiod) {
> +        /* means no need to go further, e.g. for emulated devices */
don't you want to check

            object_dynamic_cast(OBJECT(vtd_hiod->hiod),
                                TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
as well.
In the positive you may introduce a helper that returns the vtd_hiod or NULL. 
It could also be used in previous patch and maybe at other locations as well.

> +        return 0;
> +    }
> +
> +    if (vtd_as->pasid != PCI_NO_PASID) {
> +        error_report("Non-rid_pasid %d not supported yet", vtd_as->pasid);
> +        return ret;
> +    }
> +
> +    switch (op) {
> +    case VTD_PASID_UPDATE:
> +    case VTD_PASID_BIND:
> +    {
> +        ret = vtd_device_attach_pgtbl(vtd_hiod, vtd_as, pe);
> +        break;
> +    }
> +    case VTD_PASID_UNBIND:
> +    {
> +        ret = vtd_device_detach_pgtbl(vtd_hiod, vtd_as);
> +        break;
> +    }
> +    default:
> +        error_report_once("Unknown VTDPASIDOp!!!\n");
> +        break;
> +    }
> +
> +    return ret;
> +}
> +#else
> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
> +                                VTDPASIDEntry *pe, VTDPASIDOp op)
> +{
> +    return 0;
> +}
> +#endif
> +
>  /* Do a context-cache device-selective invalidation.
>   * @func_mask: FM field after shifting
>   */
> @@ -3181,20 +3388,23 @@ static int vtd_fill_pe_in_cache(IntelIOMMUState *s, 
> VTDAddressSpace *vtd_as,
>                                  VTDPASIDEntry *pe)
>  {
>      VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry;
> +    int ret;
>  
> -    if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
> -        /* No need to go further as cached pasid entry is latest */
> -        return 0;
> +    if (pc_entry->cache_filled) {
> +        if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) {
> +            /* No need to go further as cached pasid entry is latest */
> +            return 0;
> +        }
> +        ret = vtd_bind_guest_pasid(vtd_as, pe, VTD_PASID_UPDATE);
> +    } else {
> +        ret = vtd_bind_guest_pasid(vtd_as, pe, VTD_PASID_BIND);
>      }
>  
> -    pc_entry->pasid_entry = *pe;
> -    pc_entry->cache_filled = true;
> -
> -    /*
> -     * TODO: send pasid bind to host for passthru devices
> -     */
> -
> -    return 0;
> +    if (!ret) {
> +        pc_entry->pasid_entry = *pe;
> +        pc_entry->cache_filled = true;
> +    }
> +    return ret;
>  }
>  
>  /*
> @@ -3265,10 +3475,14 @@ static gboolean vtd_flush_pasid(gpointer key, 
> gpointer value,
>      return false;
>  
>  remove:
> -    /*
> -     * TODO: send pasid unbind to host for passthru devices
> -     */
> -    pc_entry->cache_filled = false;
> +    if (pc_entry->cache_filled) {
> +        if (vtd_bind_guest_pasid(vtd_as, NULL, VTD_PASID_UNBIND)) {
> +            pasid_cache_info_set_error(pc_info);
> +            return false;
> +        } else {
> +            pc_entry->cache_filled = false;
> +        }
> +    }
>  
>      /*
>       * Don't remove address space of PCI_NO_PASID which is created by PCI
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index c8a936eb46..1c31b9a873 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -73,6 +73,9 @@ vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16
>  vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 
> 0x%"PRIx16" index %d vec %d (should be: %d)"
>  vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 
> 0x%"PRIx16" index %d trigger %d (should be: %d)"
>  vtd_reset_exit(void) ""
> +vtd_device_attach_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t hwpt_id, 
> int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
> +vtd_device_detach_hwpt(uint32_t dev_id, uint32_t pasid, int ret) "dev_id %d 
> pasid %d ret: %d"
> +vtd_device_reattach_def_hwpt(uint32_t dev_id, uint32_t pasid, uint32_t 
> hwpt_id, int ret) "dev_id %d pasid %d hwpt_id %d, ret: %d"
>  
>  # amd_iommu.c
>  amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at 
> addr 0x%"PRIx64" +  offset 0x%"PRIx32
Eric


Reply via email to