>-----Original Message-----
>From: Eric Auger <eric.au...@redhat.com>
>Subject: Re: [PATCH v2 14/19] intel_iommu: Bind/unbind guest page table to
>host
>
>
>
>On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
>> This captures the guest PASID table entry modifications and
>> propagates the changes to host to attach a hwpt with type determined
>> per guest IOMMU mode and PGTT configuration.
>>
>> When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
>> page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
>> vIOMMU reuses the hwpt(GPA->HPA) provided by VFIO as nested parent to
>> construct nested page table.
>>
>> When guest decides to use legacy mode then vIOMMU switches the MRs of
>> the device's AS, hence the IOAS created by VFIO container would be
>> switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
>> switched to IOMMU MR. So it is able to support shadowing the guest IO
>> page table.
>>
>> Co-Authored-by: Yi Liu <yi.l....@intel.com>
>> Signed-off-by: Yi Liu <yi.l....@intel.com>
>> Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
>> ---
>>  hw/i386/intel_iommu_internal.h |  11 ++
>>  hw/i386/intel_iommu.c          | 244
>+++++++++++++++++++++++++++++++--
>>  hw/i386/trace-events           |   3 +
>>  3 files changed, 243 insertions(+), 15 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_internal.h
>b/hw/i386/intel_iommu_internal.h
>> index 5ed76864be..92a533db54 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -563,6 +563,13 @@ typedef struct VTDRootEntry VTDRootEntry;
>>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw)  (0x1e0ULL |
>~VTD_HAW_MASK(aw))
>>  #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1
>0xffffffffffe00000ULL
>>
>> +typedef enum VTDPASIDOp {
>> +    VTD_PASID_BIND,
>> +    VTD_PASID_UPDATE,
>> +    VTD_PASID_UNBIND,
>> +    VTD_OP_NUM
>> +} VTDPASIDOp;
>> +
>>  typedef enum VTDPCInvType {
>>      /* Force reset all */
>>      VTD_PASID_CACHE_FORCE_RESET = 0,
>> @@ -607,6 +614,9 @@ typedef struct VTDPASIDCacheInfo {
>>
>>  #define VTD_SM_PASID_ENTRY_FLPM          3ULL
>>  #define VTD_SM_PASID_ENTRY_FLPTPTR       (~0xfffULL)
>> +#define VTD_SM_PASID_ENTRY_SRE_BIT(val)  (!!((val) & 1ULL))
>> +#define VTD_SM_PASID_ENTRY_WPE_BIT(val)  (!!(((val) >> 4) & 1ULL))
>> +#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
>>
>>  /* First Level Paging Structure */
>>  /* Masks for First Level Paging Entry */
>> @@ -644,5 +654,6 @@ typedef struct VTDHostIOMMUDevice {
>>      PCIBus *bus;
>>      uint8_t devfn;
>>      HostIOMMUDevice *hiod;
>> +    uint32_t s1_hwpt;
>>  } VTDHostIOMMUDevice;
>>  #endif
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index be01f8885f..1c94a0033c 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -20,6 +20,7 @@
>>   */
>>
>>  #include "qemu/osdep.h"
>> +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
>>  #include "qemu/error-report.h"
>>  #include "qemu/main-loop.h"
>>  #include "qapi/error.h"
>> @@ -41,6 +42,9 @@
>>  #include "migration/vmstate.h"
>>  #include "trace.h"
>>  #include "system/iommufd.h"
>> +#ifdef CONFIG_IOMMUFD
>> +#include <linux/iommufd.h>
>> +#endif
>>
>>  /* context entry operations */
>>  #define VTD_CE_GET_RID2PASID(ce) \
>> @@ -839,6 +843,27 @@ static inline uint16_t
>vtd_pe_get_did(VTDPASIDEntry *pe)
>>      return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
>>  }
>>
>> +static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
>> +{
>> +    return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
>Isn't it called FSPTPTR in the spec? If so, I would use the same
>terminology.

Sure.

>> +}
>> +
>> +static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
>> +{
>> +    return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
>here again I am a bit lost as you seem to look at the 3rd 64b FSPM while there
>is an AW field in the first 64b, please add a comment.

Sure, will add. Per spec, this helper is for stage1 address width,
AW in first 64b is for stage2 page table:

"This field is treated as Reserved(0) for implementations not supporting 
Second-stage
Translation (SSTS=0 in the Extended Capability Register)."

>Also it is not clear where this computation comes from. Can you quote the
>spec?

In Figure 9-6. Scalable-Mode PASID Table Entry Format:

This field specifies the paging mode for first-stage translation.
* 00: 4-level paging (FSPTPTR is base of FS-PML4)
* 01: 5-level paging (FSPTPTR is base of FS-PML5)
* 10-11: Reserved

For 4-level paging, iova width is 48bit, for 5-level paging it's (48+9)bit.


>> +}
>> +
>> +static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
>> +{
>> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
>> +}
>> +
>> +/* check if pgtt is first stage translation */
>> +static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
>> +{
>> +    return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
>> +}
>> +
>>  static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>>  {
>>      return pdire->val & 1;
>> @@ -2431,6 +2456,188 @@ static void
>vtd_context_global_invalidate(IntelIOMMUState *s)
>>      vtd_iommu_replay_all(s);
>>  }
>>
>> +#ifdef CONFIG_IOMMUFD
>> +static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
>> +                                  VTDPASIDEntry *pe)
>> +{
>> +    memset(vtd, 0, sizeof(*vtd));
>> +
>> +    vtd->flags =  (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
>> +                                        IOMMU_VTD_S1_SRE : 0)
>|
>> +                  (VTD_SM_PASID_ENTRY_WPE_BIT(pe->val[2]) ?
>> +                                        IOMMU_VTD_S1_WPE :
>0) |
>> +                  (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
>> +                                        IOMMU_VTD_S1_EAFE :
>0);
>> +    vtd->addr_width = vtd_pe_get_fl_aw(pe);
>> +    vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
>> +}
>> +
>> +static int vtd_create_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod,
>> +                              VTDPASIDEntry *pe, Error **errp)
>> +{
>> +    HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +    struct iommu_hwpt_vtd_s1 vtd;
>> +    uint32_t s1_hwpt;
>> +
>> +    vtd_init_s1_hwpt_data(&vtd, pe);
>> +
>> +    if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> +                                    idev->hwpt_id, 0,
>IOMMU_HWPT_DATA_VTD_S1,
>> +                                    sizeof(vtd), &vtd, &s1_hwpt,
>errp)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    vtd_hiod->s1_hwpt = s1_hwpt;
>> +
>> +    return 0;
>> +}
>> +
>> +static void vtd_destroy_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod)
>> +{
>> +    HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +
>> +    iommufd_backend_free_id(idev->iommufd, vtd_hiod->s1_hwpt);
>> +    vtd_hiod->s1_hwpt = 0;
>> +}
>> +
>> +static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> +                                     uint32_t pasid,
>VTDPASIDEntry *pe,
>> +                                     Error **errp)
>> +{
>> +    HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +    uint32_t hwpt_id;
>> +    int ret;
>> +
>> +    if (vtd_pe_pgtt_is_flt(pe)) {
>> +        ret = vtd_create_s1_hwpt(vtd_hiod, pe, errp);
>> +        if (ret) {
>> +            return ret;
>> +        }
>> +        hwpt_id = vtd_hiod->s1_hwpt;
>> +    } else {
>> +        hwpt_id = idev->hwpt_id;
>> +    }
>> +
>> +    ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id,
>errp);
>> +    trace_vtd_device_attach_hwpt(idev->devid, pasid, hwpt_id, ret);
>> +    if (ret && vtd_pe_pgtt_is_flt(pe)) {
>> +        vtd_destroy_s1_hwpt(vtd_hiod);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> +                                     uint32_t pasid,
>VTDPASIDEntry *pe,
>> +                                     Error **errp)
>> +{
>> +    HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +    int ret;
>> +
>> +    if (vtd_hiod->iommu_state->dmar_enabled) {
>> +        ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
>> +        trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
>> +    } else {
>> +        ret = !host_iommu_device_iommufd_attach_hwpt(idev,
>idev->hwpt_id, errp);
>> +        trace_vtd_device_reattach_def_hwpt(idev->devid, pasid,
>idev->hwpt_id,
>> +                                           ret);
>> +    }
>> +
>> +    if (vtd_pe_pgtt_is_flt(pe)) {
>> +        vtd_destroy_s1_hwpt(vtd_hiod);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static int vtd_device_attach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
>> +                                   VTDAddressSpace *vtd_as,
>VTDPASIDEntry *pe)
>> +{
>> +    /*
>> +     * If pe->gptt == FLT, should be go ahead to do bind as host only
>PGTT. The rest of the sentence is difficult to parse.

Sure, will rephrase.

>> +     * accepts guest FLT under nesting. If pe->pgtt==PT, should setup
>> +     * the pasid with GPA page table. Otherwise should return failure.
>> +     */
>> +    if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* Should fail if the FLPT base is 0 */
>> +    if (vtd_pe_pgtt_is_flt(pe) && !vtd_pe_get_flpt_base(pe)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return vtd_device_attach_iommufd(vtd_hiod, vtd_as->pasid, pe,
>&error_abort);
>> +}
>> +
>> +static int vtd_device_detach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
>> +                                   VTDAddressSpace *vtd_as)
>> +{
>> +    VTDPASIDEntry *cached_pe =
>vtd_as->pasid_cache_entry.cache_filled ?
>> +                       &vtd_as->pasid_cache_entry.pasid_entry :
>NULL;
>> +
>> +    if (!cached_pe ||
>> +        (!vtd_pe_pgtt_is_flt(cached_pe)
>&& !vtd_pe_pgtt_is_pt(cached_pe))) {
>> +        return 0;
>> +    }
>> +
>> +    return vtd_device_detach_iommufd(vtd_hiod, vtd_as->pasid,
>cached_pe,
>> +                                     &error_abort);
>> +}
>> +
>> +/**
>> + * Caller should hold iommu_lock.
>> + */
>> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
>> +                                VTDPASIDEntry *pe, VTDPASIDOp
>op)
>> +{
>> +    IntelIOMMUState *s = vtd_as->iommu_state;
>> +    VTDHostIOMMUDevice *vtd_hiod;
>> +    int devfn = vtd_as->devfn;
>> +    int ret = -EINVAL;
>> +    struct vtd_as_key key = {
>> +        .bus = vtd_as->bus,
>> +        .devfn = devfn,
>> +    };
>> +
>> +    vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
>> +    if (!vtd_hiod || !vtd_hiod->hiod) {
>> +        /* means no need to go further, e.g. for emulated devices */
>don't you want to check
>
>            object_dynamic_cast(OBJECT(vtd_hiod->hiod),
>
>TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
>as well.
>In the positive you may introduce a helper that returns the vtd_hiod or NULL.
>It could also be used in previous patch and maybe at other locations as well.

After further thinking, it looks like checking only vtd_hiod is enough, as
vtd_hiod is created based on hiod, and the below check in vtd_check_hiod()
ensures hiod is TYPE_HOST_IOMMU_DEVICE_IOMMUFD.

    /* Remaining checks are all stage-1 translation specific */
    if (!object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
        error_setg(errp, "Need IOMMUFD backend when x-flts=on");
        return false;
    }

Note we only run into these functions when hiod is 
TYPE_HOST_IOMMU_DEVICE_IOMMUFD and x-flts=on.

Thanks
Zhenzhong

Reply via email to