On 6/23/25 5:29 AM, Duan, Zhenzhong wrote:
>
>> -----Original Message-----
>> From: Eric Auger <eric.au...@redhat.com>
>> Subject: Re: [PATCH v2 18/19] Workaround for ERRATA_772415_SPR17
>>
>> Hi Zhenzhong,
>>
>> On 6/20/25 9:18 AM, Zhenzhong Duan wrote:
>>> On a system influenced by ERRATA_772415,
>> IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17
>>> is repored by IOMMU_DEVICE_GET_HW_INFO. Due to this errata, even the
>> readonly
>>> range mapped on stage-2 page table could still be written.
>> I would split this patch into a vfio only patch and an iommu one that
>> sets bcontainer->readonly according to the fetched info.
> Will do.
>
>>> Reference from 4th Gen Intel Xeon Processor Scalable Family Specification
>>> Update, Errata Details, SPR17.
>>> https://www.intel.com/content/www/us/en/content-details/772415/content-
>> details.html
>> the link does not work for me.
> That's strange, what about:
>
> https://edc.intel.com/content/www/us/en/design/products-and-solutions/processors-and-chipsets/eagle-stream/sapphire-rapids-specification-update/
yes this link works for me.
>
>> Please could you explain in english what the errata is about and what
>> actions need to be taken care in VFIO?
>>
>> Sorry I failed to understand "
>>
>> Due to this errata, even the readonly
>> range mapped on stage-2 page table could still be written.
> Copy the explanation in above link:
>
> "SPR17. Remapping Hardware May Set Access/Dirty Bits in a First-stage
> Page-table Entry
>
> Problem: When remapping hardware is configured by system software in
> scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the
> PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended
> Access bit if enabled) in first-stage page-table entries even when
> second-stage mappings indicate that corresponding first-stage page-table
> is Read-Only.
>
> Implication: Due to this erratum, pages mapped as Read-only in second-stage
> page-tables may be modified by remapping hardware Access/Dirty bit updates.
>
> Workaround: None identified. System software enabling nested translations
> for a VM should ensure that there are no read-only pages in the
> corresponding second-stage mappings."
OK this definitively helps. I will further look at the patch with that
background

Eric
>
>> "
>>> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com>
>>> ---
>>>  include/hw/vfio/vfio-container-base.h |  1 +
>>>  hw/vfio/iommufd.c                     |  8 +++++++-
>>>  hw/vfio/listener.c                    | 13 +++++++++----
>>>  3 files changed, 17 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-
>> container-base.h
>>> index f0232654ee..e5c51a51ac 100644
>>> --- a/include/hw/vfio/vfio-container-base.h
>>> +++ b/include/hw/vfio/vfio-container-base.h
>>> @@ -51,6 +51,7 @@ typedef struct VFIOContainerBase {
>>>      QLIST_HEAD(, VFIODevice) device_list;
>>>      GList *iova_ranges;
>>>      NotifierWithReturn cpr_reboot_notifier;
>>> +    bool bypass_ro;
>>>  } VFIOContainerBase;
>>>
>>>  typedef struct VFIOGuestIOMMU {
>>> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
>>> index 83a632bdee..23839a511a 100644
>>> --- a/hw/vfio/iommufd.c
>>> +++ b/hw/vfio/iommufd.c
>>> @@ -306,6 +306,7 @@ static bool
>> iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>>>  {
>>>      ERRP_GUARD();
>>>      IOMMUFDBackend *iommufd = vbasedev->iommufd;
>>> +    struct iommu_hw_info_vtd vtd;
>> = {}; ?
> Will do.
>
>>>      uint32_t type, flags = 0;
>>>      uint64_t hw_caps;
>>>      VFIOIOASHwpt *hwpt;
>>> @@ -345,10 +346,15 @@ static bool
>> iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
>>>       * instead.
>>>       */
>>>      if (!iommufd_backend_get_device_info(vbasedev->iommufd, vbasedev-
>>> devid,
>>> -                                         &type, NULL, 0, &hw_caps, errp)) {
>>> +                                         &type, &vtd, sizeof(vtd), 
>>> &hw_caps,
>>> +                                         errp)) {
>>>          return false;
>>>      }
>>>
>>> +    if (vtd.flags & IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17) {
>>> +        container->bcontainer.bypass_ro = true;
>>> +    }
>>> +
>>>      if (hw_caps & IOMMU_HW_CAP_DIRTY_TRACKING) {
>>>          flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
>>>      }
>>> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
>>> index f498e23a93..c64aa4539e 100644
>>> --- a/hw/vfio/listener.c
>>> +++ b/hw/vfio/listener.c
>>> @@ -364,7 +364,8 @@ static bool
>> vfio_known_safe_misalignment(MemoryRegionSection *section)
>>>      return true;
>>>  }
>>>
>>> -static bool vfio_listener_valid_section(MemoryRegionSection *section,
>>> +static bool vfio_listener_valid_section(VFIOContainerBase *bcontainer,
>>> +                                        MemoryRegionSection *section,
>>>                                          const char *name)
>>>  {
>>>      if (vfio_listener_skipped_section(section)) {
>>> @@ -375,6 +376,10 @@ static bool
>> vfio_listener_valid_section(MemoryRegionSection *section,
>>>          return false;
>>>      }
>>>
>>> +    if (bcontainer && bcontainer->bypass_ro && section->readonly) {
>>> +        return false;
>>> +    }
>>> +
>>>      if (unlikely((section->offset_within_address_space &
>>>                    ~qemu_real_host_page_mask()) !=
>>>                   (section->offset_within_region & 
>>> ~qemu_real_host_page_mask()))) {
>>> @@ -494,7 +499,7 @@ void vfio_container_region_add(VFIOContainerBase
>> *bcontainer,
>>>      int ret;
>>>      Error *err = NULL;
>>>
>>> -    if (!vfio_listener_valid_section(section, "region_add")) {
>>> +    if (!vfio_listener_valid_section(bcontainer, section, "region_add")) {
>>>          return;
>>>      }
>>>
>>> @@ -655,7 +660,7 @@ static void vfio_listener_region_del(MemoryListener
>> *listener,
>>>      int ret;
>>>      bool try_unmap = true;
>>>
>>> -    if (!vfio_listener_valid_section(section, "region_del")) {
>>> +    if (!vfio_listener_valid_section(bcontainer, section, "region_del")) {
>>>          return;
>>>      }
>>>
>>> @@ -812,7 +817,7 @@ static void vfio_dirty_tracking_update(MemoryListener
>> *listener,
>>>          container_of(listener, VFIODirtyRangesListener, listener);
>>>      hwaddr iova, end;
>>>
>>> -    if (!vfio_listener_valid_section(section, "tracking_update") ||
>>> +    if (!vfio_listener_valid_section(NULL, section, "tracking_update") ||
>>>          !vfio_get_section_iova_range(dirty->bcontainer, section,
>>>                                       &iova, &end, NULL)) {
>>>          return;
>> Thanks
>>
>> Eric


Reply via email to