>-----Original Message-----
>From: Eric Auger <eric.au...@redhat.com>
>Subject: Re: [PATCH v2 14/19] intel_iommu: Bind/unbind guest page table to
>host
>
>
>
>On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
>> This captures the guest PASID table entry modifications and
>> propagates the changes to host to attach a hwpt with type determined
>> per guest IOMMU mdoe and PGTT configuration.
>>
>> When PGTT is Pass-through(100b), the hwpt on host side is a stage-2
>> page table(GPA->HPA). When PGTT is First-stage Translation only(001b),
>> vIOMMU reuse hwpt(GPA->HPA) provided by VFIO as nested parent to
>> construct nested page table.
>>
>> When guest decides to use legacy mode then vIOMMU switches the MRs of
>> the device's AS, hence the IOAS created by VFIO container would be
>> switched to using the IOMMU_NOTIFIER_IOTLB_EVENTS since the MR is
>> switched to IOMMU MR. So it is able to support shadowing the guest IO
>> page table.
>>
>> Co-Authored-by: Yi Liu <yi.l....@intel.com>
>> Signed-off-by: Yi Liu <yi.l....@intel.com>
>> Signed-off-by: Yi Sun <yi.y....@linux.intel.com>
>> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
>> ---
>> hw/i386/intel_iommu_internal.h | 11 ++
>> hw/i386/intel_iommu.c | 244
>+++++++++++++++++++++++++++++++--
>> hw/i386/trace-events | 3 +
>> 3 files changed, 243 insertions(+), 15 deletions(-)
>>
>> diff --git a/hw/i386/intel_iommu_internal.h
>b/hw/i386/intel_iommu_internal.h
>> index 5ed76864be..92a533db54 100644
>> --- a/hw/i386/intel_iommu_internal.h
>> +++ b/hw/i386/intel_iommu_internal.h
>> @@ -563,6 +563,13 @@ typedef struct VTDRootEntry VTDRootEntry;
>> #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL |
>~VTD_HAW_MASK(aw))
>> #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1
>0xffffffffffe00000ULL
>>
>> +typedef enum VTDPASIDOp {
>> + VTD_PASID_BIND,
>> + VTD_PASID_UPDATE,
>> + VTD_PASID_UNBIND,
>> + VTD_OP_NUM
>> +} VTDPASIDOp;
>> +
>> typedef enum VTDPCInvType {
>> /* Force reset all */
>> VTD_PASID_CACHE_FORCE_RESET = 0,
>> @@ -607,6 +614,9 @@ typedef struct VTDPASIDCacheInfo {
>>
>> #define VTD_SM_PASID_ENTRY_FLPM 3ULL
>> #define VTD_SM_PASID_ENTRY_FLPTPTR (~0xfffULL)
>> +#define VTD_SM_PASID_ENTRY_SRE_BIT(val) (!!((val) & 1ULL))
>> +#define VTD_SM_PASID_ENTRY_WPE_BIT(val) (!!(((val) >> 4) & 1ULL))
>> +#define VTD_SM_PASID_ENTRY_EAFE_BIT(val) (!!(((val) >> 7) & 1ULL))
>>
>> /* First Level Paging Structure */
>> /* Masks for First Level Paging Entry */
>> @@ -644,5 +654,6 @@ typedef struct VTDHostIOMMUDevice {
>> PCIBus *bus;
>> uint8_t devfn;
>> HostIOMMUDevice *hiod;
>> + uint32_t s1_hwpt;
>> } VTDHostIOMMUDevice;
>> #endif
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index be01f8885f..1c94a0033c 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -20,6 +20,7 @@
>> */
>>
>> #include "qemu/osdep.h"
>> +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
>> #include "qemu/error-report.h"
>> #include "qemu/main-loop.h"
>> #include "qapi/error.h"
>> @@ -41,6 +42,9 @@
>> #include "migration/vmstate.h"
>> #include "trace.h"
>> #include "system/iommufd.h"
>> +#ifdef CONFIG_IOMMUFD
>> +#include <linux/iommufd.h>
>> +#endif
>>
>> /* context entry operations */
>> #define VTD_CE_GET_RID2PASID(ce) \
>> @@ -839,6 +843,27 @@ static inline uint16_t
>vtd_pe_get_did(VTDPASIDEntry *pe)
>> return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
>> }
>>
>> +static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
>> +{
>> + return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
>Isn'it called FSPTPTR in the spec. In the positive I would use the same
>terminology.
Sure.
>> +}
>> +
>> +static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
>> +{
>> + return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
>here again I am bit lost as you seem to look at 3d 64b FSPM while there
>is an AW field in the first 64b, please add a comment.
Sure, will add. Per spec, this helper is for stage1 address width,
AW in first 64b is for stage2 page table:
"This field is treated as Reserved(0) for implementations not supporting
Second-stage
Translation (SSTS=0 in the Extended Capability Register)."
>Also it isnot clear where this computation come from. Can you quote the
>spec?
In Figure 9-6. Scalable-Mode PASID Table Entry Format:
This field specifies the paging mode for first-stage translation.
* 00: 4-level paging (FSPTPTR is base of FS-PML4)
* 01: 5-level paging (FSPTPTR is base of FS-PML5)
* 10-11: Reserved
For 4-level paging, iova width is 48bit, for 5-level paging it's (48+9)bit.
>> +}
>> +
>> +static inline bool vtd_pe_pgtt_is_pt(VTDPASIDEntry *pe)
>> +{
>> + return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_PT);
>> +}
>> +
>> +/* check if pgtt is first stage translation */
>> +static inline bool vtd_pe_pgtt_is_flt(VTDPASIDEntry *pe)
>> +{
>> + return (VTD_PE_GET_TYPE(pe) == VTD_SM_PASID_ENTRY_FLT);
>> +}
>> +
>> static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
>> {
>> return pdire->val & 1;
>> @@ -2431,6 +2456,188 @@ static void
>vtd_context_global_invalidate(IntelIOMMUState *s)
>> vtd_iommu_replay_all(s);
>> }
>>
>> +#ifdef CONFIG_IOMMUFD
>> +static void vtd_init_s1_hwpt_data(struct iommu_hwpt_vtd_s1 *vtd,
>> + VTDPASIDEntry *pe)
>> +{
>> + memset(vtd, 0, sizeof(*vtd));
>> +
>> + vtd->flags = (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
>> + IOMMU_VTD_S1_SRE : 0)
>|
>> + (VTD_SM_PASID_ENTRY_WPE_BIT(pe->val[2]) ?
>> + IOMMU_VTD_S1_WPE :
>0) |
>> + (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
>> + IOMMU_VTD_S1_EAFE :
>0);
>> + vtd->addr_width = vtd_pe_get_fl_aw(pe);
>> + vtd->pgtbl_addr = (uint64_t)vtd_pe_get_flpt_base(pe);
>> +}
>> +
>> +static int vtd_create_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod,
>> + VTDPASIDEntry *pe, Error **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + struct iommu_hwpt_vtd_s1 vtd;
>> + uint32_t s1_hwpt;
>> +
>> + vtd_init_s1_hwpt_data(&vtd, pe);
>> +
>> + if (!iommufd_backend_alloc_hwpt(idev->iommufd, idev->devid,
>> + idev->hwpt_id, 0,
>IOMMU_HWPT_DATA_VTD_S1,
>> + sizeof(vtd), &vtd, &s1_hwpt,
>errp)) {
>> + return -EINVAL;
>> + }
>> +
>> + vtd_hiod->s1_hwpt = s1_hwpt;
>> +
>> + return 0;
>> +}
>> +
>> +static void vtd_destroy_s1_hwpt(VTDHostIOMMUDevice *vtd_hiod)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +
>> + iommufd_backend_free_id(idev->iommufd, vtd_hiod->s1_hwpt);
>> + vtd_hiod->s1_hwpt = 0;
>> +}
>> +
>> +static int vtd_device_attach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> + uint32_t pasid,
>VTDPASIDEntry *pe,
>> + Error **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + uint32_t hwpt_id;
>> + int ret;
>> +
>> + if (vtd_pe_pgtt_is_flt(pe)) {
>> + ret = vtd_create_s1_hwpt(vtd_hiod, pe, errp);
>> + if (ret) {
>> + return ret;
>> + }
>> + hwpt_id = vtd_hiod->s1_hwpt;
>> + } else {
>> + hwpt_id = idev->hwpt_id;
>> + }
>> +
>> + ret = !host_iommu_device_iommufd_attach_hwpt(idev, hwpt_id,
>errp);
>> + trace_vtd_device_attach_hwpt(idev->devid, pasid, hwpt_id, ret);
>> + if (ret && vtd_pe_pgtt_is_flt(pe)) {
>> + vtd_destroy_s1_hwpt(vtd_hiod);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static int vtd_device_detach_iommufd(VTDHostIOMMUDevice *vtd_hiod,
>> + uint32_t pasid,
>VTDPASIDEntry *pe,
>> + Error **errp)
>> +{
>> + HostIOMMUDeviceIOMMUFD *idev =
>HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + int ret;
>> +
>> + if (vtd_hiod->iommu_state->dmar_enabled) {
>> + ret = !host_iommu_device_iommufd_detach_hwpt(idev, errp);
>> + trace_vtd_device_detach_hwpt(idev->devid, pasid, ret);
>> + } else {
>> + ret = !host_iommu_device_iommufd_attach_hwpt(idev,
>idev->hwpt_id, errp);
>> + trace_vtd_device_reattach_def_hwpt(idev->devid, pasid,
>idev->hwpt_id,
>> + ret);
>> + }
>> +
>> + if (vtd_pe_pgtt_is_flt(pe)) {
>> + vtd_destroy_s1_hwpt(vtd_hiod);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static int vtd_device_attach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
>> + VTDAddressSpace *vtd_as,
>VTDPASIDEntry *pe)
>> +{
>> + /*
>> + * If pe->gptt == FLT, should be go ahead to do bind as host only
>PGTT. The rest of the sentence is difficult to parse.
Sure, will rephrase.
>> + * accepts guest FLT under nesting. If pe->pgtt==PT, should setup
>> + * the pasid with GPA page table. Otherwise should return failure.
>> + */
>> + if (!vtd_pe_pgtt_is_flt(pe) && !vtd_pe_pgtt_is_pt(pe)) {
>> + return -EINVAL;
>> + }
>> +
>> + /* Should fail if the FLPT base is 0 */
>> + if (vtd_pe_pgtt_is_flt(pe) && !vtd_pe_get_flpt_base(pe)) {
>> + return -EINVAL;
>> + }
>> +
>> + return vtd_device_attach_iommufd(vtd_hiod, vtd_as->pasid, pe,
>&error_abort);
>> +}
>> +
>> +static int vtd_device_detach_pgtbl(VTDHostIOMMUDevice *vtd_hiod,
>> + VTDAddressSpace *vtd_as)
>> +{
>> + VTDPASIDEntry *cached_pe =
>vtd_as->pasid_cache_entry.cache_filled ?
>> + &vtd_as->pasid_cache_entry.pasid_entry :
>NULL;
>> +
>> + if (!cached_pe ||
>> + (!vtd_pe_pgtt_is_flt(cached_pe)
>&& !vtd_pe_pgtt_is_pt(cached_pe))) {
>> + return 0;
>> + }
>> +
>> + return vtd_device_detach_iommufd(vtd_hiod, vtd_as->pasid,
>cached_pe,
>> + &error_abort);
>> +}
>> +
>> +/**
>> + * Caller should hold iommu_lock.
>> + */
>> +static int vtd_bind_guest_pasid(VTDAddressSpace *vtd_as,
>> + VTDPASIDEntry *pe, VTDPASIDOp
>op)
>> +{
>> + IntelIOMMUState *s = vtd_as->iommu_state;
>> + VTDHostIOMMUDevice *vtd_hiod;
>> + int devfn = vtd_as->devfn;
>> + int ret = -EINVAL;
>> + struct vtd_as_key key = {
>> + .bus = vtd_as->bus,
>> + .devfn = devfn,
>> + };
>> +
>> + vtd_hiod = g_hash_table_lookup(s->vtd_host_iommu_dev, &key);
>> + if (!vtd_hiod || !vtd_hiod->hiod) {
>> + /* means no need to go further, e.g. for emulated devices */
>don't you want to check
>
> object_dynamic_cast(OBJECT(vtd_hiod->hiod),
>
>TYPE_HOST_IOMMU_DEVICE_IOMMUFD)
>as well.
>In the positive you may introduce a helper that returns the vtd_hiod or NULL.
>It could also be used in previous patch and maybe at other locations as well.
After further thinking, it looks checking only vtd_hiod is enough, as vtd_hiod
is
created based on hiod, and below check in vtd_check_hiod() ensure hiod are
TYPE_HOST_IOMMU_DEVICE_IOMMUFD.
/* Remaining checks are all stage-1 translation specific */
if (!object_dynamic_cast(OBJECT(hiod), TYPE_HOST_IOMMU_DEVICE_IOMMUFD)) {
error_setg(errp, "Need IOMMUFD backend when x-flts=on");
return false;
}
Note we only run into these functions when hiod is
TYPE_HOST_IOMMU_DEVICE_IOMMUFD and x-flts=on.
Thanks
Zhenzhong