Hi Zhenzhong, On 6/20/25 9:18 AM, Zhenzhong Duan wrote: I would suggest: Handle PASID entry removal and update instead of verbing. > This adds an new entry VTDPASIDCacheEntry in VTDAddressSpace to cache the > pasid entry and track PASID usage and future PASID tagged DMA address > translation support in vIOMMU. > > VTDAddressSpace of PCI_NO_PASID is allocated when device is plugged and > never freed. For other pasid, VTDAddressSpace instance is created/destroyed > per the guest pasid entry set up/destroy for passthrough devices. While for > emulated devices, VTDAddressSpace instance is created in the PASID tagged DMA > translation and be destroyed per guest PASID cache invalidation. This focuses s/be detroyed/destroyed > on the PASID cache management for passthrough devices as there is no PASID > capable emulated devices yet. if you don't handle the emulated device case, may be omit talking about them here. > > When guest modifies a PASID entry, QEMU will capture the guest pasid selective > pasid cache invalidation, allocate or remove a VTDAddressSpace instance per > the > invalidation reasons: > > a) a present pasid entry moved to non-present > b) a present pasid entry to be a present entry > c) a non-present pasid entry moved to present > > This handles a) and b), following patch will handle c). This -> This patch > > vIOMMU emulator could figure out the reason by fetching latest guest pasid > entry > and compare it with the PASID cache. > > Signed-off-by: Yi Liu <yi.l....@intel.com> > Signed-off-by: Yi Sun <yi.y....@linux.intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> > --- > hw/i386/intel_iommu_internal.h | 27 ++++ > include/hw/i386/intel_iommu.h | 6 + > hw/i386/intel_iommu.c | 265 +++++++++++++++++++++++++++++++-- > hw/i386/trace-events | 3 + > 4 files changed, 291 insertions(+), 10 deletions(-) > > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index 18bc22fc72..01c881ed4d 100644 > --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -315,6 +315,7 @@ typedef enum VTDFaultReason { > * request while disabled */ > VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ > > + VTD_FR_RTADDR_INV_TTM = 0x31, /* Invalid TTM in RTADDR */ > /* PASID directory entry access failure */ > VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, > /* The Present(P) field of pasid directory entry is 0 */ > @@ -492,6 +493,15 @@ typedef union VTDInvDesc VTDInvDesc; > #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL > #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL > Adding some basic comments for the uncultured reader like me would help doing the link with the vtd spec: PASID-cache Invalidate Descriptor (pc_inv_dsc) fields > +#define VTD_INV_DESC_PASIDC_G (3ULL << 4) > +#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfffffULL) > +#define VTD_INV_DESC_PASIDC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) > +#define VTD_INV_DESC_PASIDC_RSVD_VAL0 0xfff000000000f1c0ULL > + > +#define VTD_INV_DESC_PASIDC_DSI (0ULL << 4) > +#define VTD_INV_DESC_PASIDC_PASID_SI (1ULL << 4) > +#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4) as those are values for the granularity field using
VTD_INV_DESC_PASIDC_G_* look relevant to me. I think you would gain in readability if you adopt extract32/64 syntax like in hw/arm/smmuv3-internal.h Looks more readable to me. > + > /* Information about page-selective IOTLB invalidate */ > struct VTDIOTLBPageInvInfo { > uint16_t domain_id; > @@ -552,6 +562,22 @@ typedef struct VTDRootEntry VTDRootEntry; > #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) > #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL > > +typedef enum VTDPCInvType { > + /* pasid cache invalidation rely on guest PASID entry */ > + VTD_PASID_CACHE_GLOBAL_INV, /* pasid cache global invalidation */ > + VTD_PASID_CACHE_DOMSI, /* pasid cache domain selective invalidation > */ > + VTD_PASID_CACHE_PASIDSI, /* pasid cache pasid selective invalidation > */ > +} VTDPCInvType; > + > +typedef struct VTDPASIDCacheInfo { > + VTDPCInvType type; > + uint16_t domain_id; did? > + uint32_t pasid; > + PCIBus *bus; > + uint16_t devfn; > + bool error_happened; wouldn't it make more sense to store an Error * directly so that you can fill it whenever there is an error occuring. Something like VFIOContainerBase::error > +} VTDPASIDCacheInfo; > + > /* PASID Table Related Definitions */ > #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) > #define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL) > @@ -563,6 +589,7 @@ typedef struct VTDRootEntry VTDRootEntry; > #define VTD_PASID_TABLE_BITS_MASK (0x3fULL) > #define VTD_PASID_TABLE_INDEX(pasid) ((pasid) & VTD_PASID_TABLE_BITS_MASK) > #define VTD_PASID_ENTRY_FPD (1ULL << 1) /* Fault Processing > Disable */ > +#define VTD_PASID_TBL_ENTRY_NUM (1ULL << 6) > > /* PASID Granular Translation Type Mask */ > #define VTD_PASID_ENTRY_P 1ULL > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index 50f9b27a45..fbc9da903a 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -95,6 +95,11 @@ struct VTDPASIDEntry { > uint64_t val[8]; > }; > > +typedef struct VTDPASIDCacheEntry { > + struct VTDPASIDEntry pasid_entry; > + bool cache_filled; > +} VTDPASIDCacheEntry; > + > struct VTDAddressSpace { > PCIBus *bus; > uint8_t devfn; > @@ -107,6 +112,7 @@ struct VTDAddressSpace { > MemoryRegion iommu_ir_fault; /* Interrupt region for catching fault */ > IntelIOMMUState *iommu_state; > VTDContextCacheEntry context_cache_entry; > + VTDPASIDCacheEntry pasid_cache_entry; > QLIST_ENTRY(VTDAddressSpace) next; > /* Superset of notifier flags that this address space has */ > IOMMUNotifierFlag notifier_flags; > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 8948b8370f..1db581d14a 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -826,11 +826,24 @@ static inline bool vtd_pe_type_check(IntelIOMMUState > *s, VTDPASIDEntry *pe) > } > } > > +static inline uint16_t vtd_pe_get_did(VTDPASIDEntry *pe) > +{ > + return VTD_SM_PASID_ENTRY_DID((pe)->val[1]); see my previous comment about extract* > +} > + > static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) > { > return pdire->val & 1; > } > > +static inline void pasid_cache_info_set_error(VTDPASIDCacheInfo *pc_info) not sure whether this helper is really needed > +{ > + if (pc_info->error_happened) { Besides why not simply overwriting unconditionnally? > + return; > + } > + pc_info->error_happened = true; > +} > + > /** > * Caller of this function should check present bit if wants > * to use pdir entry for further usage except for fpd bit check. > @@ -3103,6 +3116,241 @@ static bool vtd_process_piotlb_desc(IntelIOMMUState > *s, > return true; > } > > +static inline int vtd_dev_get_pe_from_pasid(VTDAddressSpace *vtd_as, > + uint32_t pasid, VTDPASIDEntry > *pe) > +{ > + IntelIOMMUState *s = vtd_as->iommu_state; > + VTDContextEntry ce; > + int ret; > + > + if (!s->root_scalable) { > + return -VTD_FR_RTADDR_INV_TTM; > + } > + > + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), > vtd_as->devfn, > + &ce); > + if (ret) { > + return ret; > + } > + > + return vtd_ce_get_pasid_entry(s, &ce, pe, pasid); > +} > + > +static bool vtd_pasid_entry_compare(VTDPASIDEntry *p1, VTDPASIDEntry *p2) > +{ > + return !memcmp(p1, p2, sizeof(*p1)); > +} > + > +/* > + * This function fills in the pasid entry in &vtd_as. Caller > + * of this function should hold iommu_lock. > + */ > +static int vtd_fill_pe_in_cache(IntelIOMMUState *s, VTDAddressSpace *vtd_as, > + VTDPASIDEntry *pe) > +{ > + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry; > + > + if (vtd_pasid_entry_compare(pe, &pc_entry->pasid_entry)) { > + /* No need to go further as cached pasid entry is latest */ > + return 0; > + } > + > + pc_entry->pasid_entry = *pe; > + pc_entry->cache_filled = true; > + > + /* > + * TODO: send pasid bind to host for passthru devices > + */ > + > + return 0; > +} > + > +/* > + * This function is used to update or clear cached pasid entry in vtd_as > + * instances. Caller of this function should hold iommu_lock. > + */ > +static gboolean vtd_flush_pasid(gpointer key, gpointer value, > + gpointer user_data) > +{ > + VTDPASIDCacheInfo *pc_info = user_data; > + VTDAddressSpace *vtd_as = value; > + IntelIOMMUState *s = vtd_as->iommu_state; > + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry; > + VTDPASIDEntry pe; > + uint16_t did; > + uint32_t pasid; > + int ret; > + > + if (!pc_entry->cache_filled) { > + return false; > + } > + did = vtd_pe_get_did(&pc_entry->pasid_entry); > + > + if (vtd_as_to_iommu_pasid_locked(vtd_as, &pasid)) { > + goto remove; > + } > + > + switch (pc_info->type) { > + case VTD_PASID_CACHE_PASIDSI: > + if (pc_info->pasid != pasid) { > + return false; > + } > + /* Fall through */ > + case VTD_PASID_CACHE_DOMSI: > + if (pc_info->domain_id != did) { > + return false; > + } > + /* Fall through */ > + case VTD_PASID_CACHE_GLOBAL_INV: > + break; > + default: > + error_report("invalid pc_info->type"); > + abort(); > + } > + > + /* > + * pasid cache invalidation may indicate a present pasid > + * entry to present pasid entry modification. To cover such > + * case, vIOMMU emulator needs to fetch latest guest pasid > + * entry and check cached pasid entry, then update pasid > + * cache and send pasid bind/unbind to host properly. if you don't do that in that patch I would put that in a subsequent patch. Focus on the PASID cache in this patch. See my subsequent comment > + */ > + ret = vtd_dev_get_pe_from_pasid(vtd_as, pasid, &pe); > + if (ret) { > + /* > + * No valid pasid entry in guest memory. e.g. pasid entry > + * was modified to be either all-zero or non-present. Either > + * case means existing pasid cache should be removed. > + */ > + goto remove; > + } > + > + if (vtd_fill_pe_in_cache(s, vtd_as, &pe)) { > + pasid_cache_info_set_error(pc_info); > + } > + return false; > + > +remove: > + /* > + * TODO: send pasid unbind to host for passthru devices > + */ > + pc_entry->cache_filled = false; > + > + /* > + * Don't remove address space of PCI_NO_PASID which is created by PCI > + * sub-system. > + */ > + if (vtd_as->pasid == PCI_NO_PASID) { > + return false; > + } > + return true; > +} > + > +/* > + * This function syncs the pasid bindings between guest and host. > + * It includes updating the pasid cache in vIOMMU and updating the > + * pasid bindings per guest's latest pasid entry presence. > + */ > +static void vtd_pasid_cache_sync(IntelIOMMUState *s, > + VTDPASIDCacheInfo *pc_info) > +{ > + if (!s->flts || !s->root_scalable || !s->dmar_enabled) { you don't update error_happened. Is that OK? > + return; > + } > + > + /* > + * Regards to a pasid cache invalidation, e.g. a PSI. > + * it could be either cases of below: > + * a) a present pasid entry moved to non-present > + * b) a present pasid entry to be a present entry > + * c) a non-present pasid entry moved to present > + * > + * Different invalidation granularity may affect different device > + * scope and pasid scope. But for each invalidation granularity, > + * it needs to do two steps to sync host and guest pasid binding. > + * > + * Here is the handling of a PSI: > + * 1) loop all the existing vtd_as instances to update them > + * according to the latest guest pasid entry in pasid table. > + * this will make sure affected existing vtd_as instances > + * cached the latest pasid entries. Also, during the loop, the > + * host should be notified if needed. e.g. pasid unbind or pasid > + * update. Should be able to cover case a) and case b). > + * > + * 2) loop all devices to cover case c) > + * - For devices which are backed by HostIOMMUDeviceIOMMUFD instances, > + * we loop them and check if guest pasid entry exists. If yes, > + * it is case c), we update the pasid cache and also notify > + * host. > + * - For devices which are not backed by HostIOMMUDeviceIOMMUFD, > + * it is not necessary to create pasid cache at this phase since > + * it could be created when vIOMMU does DMA address translation. > + * This is not yet implemented since there is no emulated > + * pasid-capable devices today. If we have such devices in > + * future, the pasid cache shall be created there. > + * Other granularity follow the same steps, just with different scope I would put all the stuff related to interactions with host in a subsequent patch. This patch could concentrate on the IOMMU PASID cache only. And then you would add the extra complexity of syncs with the host. I think it would simplify the review. > + * > + */ > + > + vtd_iommu_lock(s); > + /* > + * Step 1: loop all the existing vtd_as instances for pasid unbind and > + * update. > + */ > + g_hash_table_foreach_remove(s->vtd_address_spaces, vtd_flush_pasid, > + pc_info); > + vtd_iommu_unlock(s); > + > + /* TODO: Step 2: loop all the existing vtd_hiod instances for pasid > bind. */ > +} > + > +static bool vtd_process_pasid_desc(IntelIOMMUState *s, > + VTDInvDesc *inv_desc) > +{ > + uint16_t domain_id; > + uint32_t pasid; > + VTDPASIDCacheInfo pc_info = {}; > + uint64_t mask[4] = {VTD_INV_DESC_PASIDC_RSVD_VAL0, VTD_INV_DESC_ALL_ONE, > + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; > + > + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true, > + __func__, "pasid cache inv")) { > + return false; > + } > + > + domain_id = VTD_INV_DESC_PASIDC_DID(inv_desc->val[0]); > + pasid = VTD_INV_DESC_PASIDC_PASID(inv_desc->val[0]); > + > + switch (inv_desc->val[0] & VTD_INV_DESC_PASIDC_G) { > + case VTD_INV_DESC_PASIDC_DSI: > + trace_vtd_pasid_cache_dsi(domain_id); > + pc_info.type = VTD_PASID_CACHE_DOMSI; > + pc_info.domain_id = domain_id; > + break; > + > + case VTD_INV_DESC_PASIDC_PASID_SI: > + /* PASID selective implies a DID selective */ > + trace_vtd_pasid_cache_psi(domain_id, pasid); > + pc_info.type = VTD_PASID_CACHE_PASIDSI; > + pc_info.domain_id = domain_id; > + pc_info.pasid = pasid; > + break; > + > + case VTD_INV_DESC_PASIDC_GLOBAL: > + trace_vtd_pasid_cache_gsi(); > + pc_info.type = VTD_PASID_CACHE_GLOBAL_INV; > + break; > + > + default: > + error_report_once("invalid-inv-granu-in-pc_inv_desc hi: 0x%" PRIx64 Make it end-user understandable? invalid granulrity field in PASID-cache invalidate descriptor. > + " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]); > + return false; > + } > + > + vtd_pasid_cache_sync(s, &pc_info); > + return !pc_info.error_happened ? true : false; > +} > + > static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, > VTDInvDesc *inv_desc) > { > @@ -3264,6 +3512,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) > } > break; > > + case VTD_INV_DESC_PC: > + trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]); > + if (!vtd_process_pasid_desc(s, &inv_desc)) { > + return false; > + } > + break; > + > case VTD_INV_DESC_PIOTLB: > trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]); > if (!vtd_process_piotlb_desc(s, &inv_desc)) { > @@ -3299,16 +3554,6 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) > } > break; > > - /* > - * TODO: the entity of below two cases will be implemented in future > series. > - * To make guest (which integrates scalable mode support patch set in > - * iommu driver) work, just return true is enough so far. > - */ > - case VTD_INV_DESC_PC: > - if (s->scalable_mode) { > - break; > - } > - /* fallthrough */ > default: > error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 > " (unknown type)", __func__, inv_desc.hi, > diff --git a/hw/i386/trace-events b/hw/i386/trace-events > index ac9e1a10aa..ae5bbfcdc0 100644 > --- a/hw/i386/trace-events > +++ b/hw/i386/trace-events > @@ -24,6 +24,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d" > vtd_inv_qi_tail(uint16_t head) "write tail %d" > vtd_inv_qi_fetch(void) "" > vtd_context_cache_reset(void) "" > +vtd_pasid_cache_gsi(void) "" > +vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation > domain 0x%"PRIx16 Would use pasid cache instead of PC > +vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC > invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32 > vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" > vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" > devfn %"PRIu8" not present" > vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t > domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" > domain 0x%"PRIx16 Thanks Eric