>-----Original Message----- >From: Eric Auger <eric.au...@redhat.com> >Subject: Re: [PATCH v2 18/19] Workaround for ERRATA_772415_SPR17 > >Hi Zhenzhong, > >On 6/20/25 9:18 AM, Zhenzhong Duan wrote: >> On a system influenced by ERRATA_772415, >IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 >> is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the >readonly >> range mapped on stage-2 page table could still be written. > >I would split this patch into a vfio only patch and an iommu one that >sets bcontainer->readonly according to the fetched info.
Will do. >> >> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification >> Update, Errata Details, SPR17. >> https://www.intel.com/content/www/us/en/content-details/772415/content- >details.html >the link does not work for me. That's strange, what about: https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/ > >Please could you explain in english what the errata is about and what >actions need to be taken care in VFIO? > >Sorry I failed to understand " > >Due to this errata, even the readonly >range mapped on stage-2 page table could still be written. Copy the explanation in above link: "SPR17. Remapping Hardware May Set Access/Dirty Bits in a First-stage Page-table Entry Problem: When remapping hardware is configured by system software in scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended Access bit if enabled) in first-stage page-table entries even when second-stage mappings indicate that corresponding first-stage page-table is Read-Only. Implication: Due to this erratum, pages mapped as Read-only in second-stage page-tables may be modified by remapping hardware Access/Dirty bit updates. Workaround: None identified. System software enabling nested translations for a VM should ensure that there are no read-only pages in the corresponding second-stage mappings." > >" >> >> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> >> --- >> include/hw/vfio/vfio-container-base.h | 1 + >> hw/vfio/iommufd.c | 8 +++++++- >> hw/vfio/listener.c | 13 +++++++++---- >> 3 files changed, 17 insertions(+), 5 deletions(-) >> >> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio- >container-base.h >> index f0232654ee..e5c51a51ac 100644 >> --- a/include/hw/vfio/vfio-container-base.h >> +++ b/include/hw/vfio/vfio-container-base.h >> @@ -51,6 +51,7 @@ typedef struct VFIOContainerBase { >> QLIST_HEAD(, VFIODevice) device_list; >> GList *iova_ranges; >> NotifierWithReturn cpr_reboot_notifier; >> + bool bypass_ro; >> } VFIOContainerBase; >> >> typedef struct VFIOGuestIOMMU { >> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c >> index 83a632bdee..23839a511a 100644 >> --- a/hw/vfio/iommufd.c >> +++ b/hw/vfio/iommufd.c >> @@ -306,6 +306,7 @@ static bool >iommufd_cdev_autodomains_get(VFIODevice *vbasedev, >> { >> ERRP_GUARD(); >> IOMMUFDBackend *iommufd = vbasedev->iommufd; >> + struct iommu_hw_info_vtd vtd; >= {}; ? Will do. >> uint32_t type, flags = 0; >> uint64_t hw_caps; >> VFIOIOASHwpt *hwpt; >> @@ -345,10 +346,15 @@ static bool >iommufd_cdev_autodomains_get(VFIODevice *vbasedev, >> * instead. >> */ >> if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev- >>devid, >> - &type, NULL, 0, &hw_caps, errp)) { >> + &type, &vtd, sizeof(vtd), &hw_caps, >> + errp)) { >> return false; >> } >> >> + if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) { >> + container->bcontainer.bypass_ro = true; >> + } >> + >> if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) { >> flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; >> } >> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c >> index f498e23a93..c64aa4539e 100644 >> --- a/hw/vfio/listener.c >> +++ b/hw/vfio/listener.c >> @@ -364,7 +364,8 @@ static bool >vfio_known_safe_misalignment(MemoryRegionSection *section) >> return true; >> } >> >> -static bool vfio_listener_valid_section(MemoryRegionSection *section, >> +static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer, >> + MemoryRegionSection *section, >> const char *name) >> { >> if (vfio_listener_skipped_section(section)) { >> @@ -375,6 +376,10 @@ static bool >vfio_listener_valid_section(MemoryRegionSection *section, >> return false; >> } >> >> + if (bcontainer && bcontainer->bypass_ro && section->readonly) { >> + return false; >> + } >> + >> if (unlikely((section->offset_within_address_space & >> ~qemu_real_host_page_mask()) != >> (section->offset_within_region & >> ~qemu_real_host_page_mask()))) { >> @@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase >*bcontainer, >> int ret; >> Error *err = NULL; >> >> - if (!vfio_listener_valid_section(section, "region_add")) { >> + if (!vfio_listener_valid_section(bcontainer, section, "region_add")) { >> return; >> } >> >> @@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener >*listener, >> int ret; >> bool try_unmap = true; >> >> - if (!vfio_listener_valid_section(section, "region_del")) { >> + if (!vfio_listener_valid_section(bcontainer, section, "region_del")) { >> return; >> } >> >> @@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener >*listener, >> container_of(listener, VFIODirtyRangesListener, listener); >> hwaddr iova, end; >> >> - if (!vfio_listener_valid_section(section, "tracking_update") || >> + if (!vfio_listener_valid_section(NULL, section, "tracking_update") || >> !vfio_get_section_iova_range(dirty->bcontainer, section, >> &iova, &end, NULL)) { >> return; >Thanks > >Eric