On Fri, Jun 20, 2025 at 05:14:16PM +0200, Igor Mammedov wrote:
> This patch brings back Jan's idea [1] of BQL-free IO access,
> with a twist that whitelist read access only.
> 
> (as BQL-free write access in [1] used to cause issues [2]
> and still does (Windows crash) if write path is not lock protected)

Can we add some explanation on why it would fail on lockless writes?

I saw that acpi_pm_tmr_write() is no-op, so I don't yet understand what
raced, and also why guest writes to it at all..

Thanks,

> 
> However with limiting it read path only, guest boots without issues.
> This will let us make read access ACPI PM/HPET timers cheaper,
> and prevent guest VCPUs BQL contention in case of workload
> that heavily uses the timers.
> 
> 2) 196ea13104f (memory: Add global-locking property to memory regions)
>    ... de7ea885c539 (kvm: Switch to unlocked MMIO)
> 3) https://bugzilla.redhat.com/show_bug.cgi?id=1322713
>    1beb99f787 (Revert "acpi: mark PMTIMER as unlocked")
> 
> Signed-off-by: Igor Mammedov <imamm...@redhat.com>
> ---
>  include/system/memory.h   | 10 +++++++++-
>  hw/remote/vfio-user-obj.c |  2 +-
>  system/memory.c           |  5 +++++
>  system/memory_ldst.c.inc  | 18 +++++++++---------
>  system/physmem.c          |  9 +++++----
>  5 files changed, 29 insertions(+), 15 deletions(-)
> 
> diff --git a/include/system/memory.h b/include/system/memory.h
> index fc35a0dcad..1afabf2d94 100644
> --- a/include/system/memory.h
> +++ b/include/system/memory.h
> @@ -775,6 +775,7 @@ struct MemoryRegion {
>      bool nonvolatile;
>      bool rom_device;
>      bool flush_coalesced_mmio;
> +    bool lockless_ro_io;
>      bool unmergeable;
>      uint8_t dirty_log_mask;
>      bool is_iommu;
> @@ -2253,6 +2254,13 @@ void memory_region_set_flush_coalesced(MemoryRegion 
> *mr);
>   */
>  void memory_region_clear_flush_coalesced(MemoryRegion *mr);
>  
> +/**
> + * memory_region_enable_lockless_ro_io: Enable lockless (BQL) read-only 
> acceess.
> + *
> + * Enable BQL-free readonly access for devices with fine-grained locking.
> + */
> +void memory_region_enable_lockless_ro_io(MemoryRegion *mr);
> +
>  /**
>   * memory_region_add_eventfd: Request an eventfd to be triggered when a word
>   *                            is written to a location.
> @@ -3002,7 +3010,7 @@ MemTxResult 
> address_space_write_cached_slow(MemoryRegionCache *cache,
>                                              hwaddr len);
>  
>  int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr);
> -bool prepare_mmio_access(MemoryRegion *mr);
> +bool prepare_mmio_access(MemoryRegion *mr, bool read);
>  
>  static inline bool memory_region_supports_direct_access(MemoryRegion *mr)
>  {
> diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> index ea6165ebdc..936a9befd4 100644
> --- a/hw/remote/vfio-user-obj.c
> +++ b/hw/remote/vfio-user-obj.c
> @@ -381,7 +381,7 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t 
> *buf, hwaddr offset,
>           * The read/write logic used below is similar to the ones in
>           * flatview_read/write_continue()
>           */
> -        release_lock = prepare_mmio_access(mr);
> +        release_lock = prepare_mmio_access(mr, !is_write);
>  
>          access_size = memory_access_size(mr, size, offset);
>  
> diff --git a/system/memory.c b/system/memory.c
> index 63b983efcd..5192195473 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -2558,6 +2558,11 @@ void memory_region_clear_flush_coalesced(MemoryRegion 
> *mr)
>      }
>  }
>  
> +void memory_region_enable_lockless_ro_io(MemoryRegion *mr)
> +{
> +    mr->lockless_ro_io = true;
> +}
> +
>  void memory_region_add_eventfd(MemoryRegion *mr,
>                                 hwaddr addr,
>                                 unsigned size,
> diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc
> index 7f32d3d9ff..919357578c 100644
> --- a/system/memory_ldst.c.inc
> +++ b/system/memory_ldst.c.inc
> @@ -35,7 +35,7 @@ static inline uint32_t glue(address_space_ldl_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
>      if (l < 4 || !memory_access_is_direct(mr, false, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, true);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val,
> @@ -104,7 +104,7 @@ static inline uint64_t glue(address_space_ldq_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
>      if (l < 8 || !memory_access_is_direct(mr, false, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, true);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val,
> @@ -171,7 +171,7 @@ uint8_t glue(address_space_ldub, SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
>      if (!memory_access_is_direct(mr, false, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, true);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs);
> @@ -208,7 +208,7 @@ static inline uint16_t glue(address_space_lduw_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
>      if (l < 2 || !memory_access_is_direct(mr, false, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, true);
>  
>          /* I/O case */
>          r = memory_region_dispatch_read(mr, addr1, &val,
> @@ -278,7 +278,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>      if (l < 4 || !memory_access_is_direct(mr, true, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, false);
>  
>          r = memory_region_dispatch_write(mr, addr1, val, MO_32, attrs);
>      } else {
> @@ -315,7 +315,7 @@ static inline void glue(address_space_stl_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>      if (l < 4 || !memory_access_is_direct(mr, true, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, false);
>          r = memory_region_dispatch_write(mr, addr1, val,
>                                           MO_32 | devend_memop(endian), 
> attrs);
>      } else {
> @@ -378,7 +378,7 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>      if (!memory_access_is_direct(mr, true, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, false);
>          r = memory_region_dispatch_write(mr, addr1, val, MO_8, attrs);
>      } else {
>          /* RAM case */
> @@ -411,7 +411,7 @@ static inline void glue(address_space_stw_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>      if (l < 2 || !memory_access_is_direct(mr, true, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, false);
>          r = memory_region_dispatch_write(mr, addr1, val,
>                                           MO_16 | devend_memop(endian), 
> attrs);
>      } else {
> @@ -475,7 +475,7 @@ static void glue(address_space_stq_internal, 
> SUFFIX)(ARG1_DECL,
>      RCU_READ_LOCK();
>      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
>      if (l < 8 || !memory_access_is_direct(mr, true, attrs)) {
> -        release_lock |= prepare_mmio_access(mr);
> +        release_lock |= prepare_mmio_access(mr, false);
>          r = memory_region_dispatch_write(mr, addr1, val,
>                                           MO_64 | devend_memop(endian), 
> attrs);
>      } else {
> diff --git a/system/physmem.c b/system/physmem.c
> index a8a9ca309e..60e330de99 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -2881,11 +2881,12 @@ int memory_access_size(MemoryRegion *mr, unsigned l, 
> hwaddr addr)
>      return l;
>  }
>  
> -bool prepare_mmio_access(MemoryRegion *mr)
> +bool prepare_mmio_access(MemoryRegion *mr, bool read)
>  {
>      bool release_lock = false;
>  
> -    if (!bql_locked()) {
> +    if (!bql_locked() &&
> +        !(read && mr->lockless_ro_io == true)) {
>          bql_lock();
>          release_lock = true;
>      }
> @@ -2935,7 +2936,7 @@ static MemTxResult 
> flatview_write_continue_step(MemTxAttrs attrs,
>      if (!memory_access_is_direct(mr, true, attrs)) {
>          uint64_t val;
>          MemTxResult result;
> -        bool release_lock = prepare_mmio_access(mr);
> +        bool release_lock = prepare_mmio_access(mr, false);
>  
>          *l = memory_access_size(mr, *l, mr_addr);
>          /*
> @@ -3032,7 +3033,7 @@ static MemTxResult 
> flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
>          /* I/O case */
>          uint64_t val;
>          MemTxResult result;
> -        bool release_lock = prepare_mmio_access(mr);
> +        bool release_lock = prepare_mmio_access(mr, true);
>  
>          *l = memory_access_size(mr, *l, mr_addr);
>          result = memory_region_dispatch_read(mr, mr_addr, &val, 
> size_memop(*l),
> -- 
> 2.43.5
> 

-- 
Peter Xu


Reply via email to