On Fri, 20 Jun 2025 12:53:06 -0400
Peter Xu <pet...@redhat.com> wrote:

> On Fri, Jun 20, 2025 at 05:14:16PM +0200, Igor Mammedov wrote:
> > This patch brings back Jan's idea [1] of BQL-free IO access,
> > with a twist that whitelists read access only.
> > 
> > (as BQL-free write access in [1] used to cause issues [2]
> > and still does (Windows crash) if write path is not lock protected)  
> 
> Can we add some explanation on why it would fail on lockless writes?
> 
> I saw that acpi_pm_tmr_write() is no-op, so I don't yet understand what
> raced, and also why guest writes to it at all..

The root cause wasn't diagnosed back then, and I haven't been able
to reproduce it since. So I erred on the side of caution and
implemented RO-only access.

Theoretically writes should be fine too, but I don't have
a good way to test that.

> 
> Thanks,
> 
> > 
> > However with limiting it read path only, guest boots without issues.
> > This will let us make read access to ACPI PM/HPET timers cheaper,
> > and prevent guest vCPU BQL contention in case of workloads
> > that heavily use the timers.
> > 
> > 2) 196ea13104f (memory: Add global-locking property to memory regions)
> >    ... de7ea885c539 (kvm: Switch to unlocked MMIO)
> > 3) https://bugzilla.redhat.com/show_bug.cgi?id=1322713
> >    1beb99f787 (Revert "acpi: mark PMTIMER as unlocked")
> > 
> > Signed-off-by: Igor Mammedov <imamm...@redhat.com>
> > ---
> >  include/system/memory.h   | 10 +++++++++-
> >  hw/remote/vfio-user-obj.c |  2 +-
> >  system/memory.c           |  5 +++++
> >  system/memory_ldst.c.inc  | 18 +++++++++---------
> >  system/physmem.c          |  9 +++++----
> >  5 files changed, 29 insertions(+), 15 deletions(-)
> > 
> > diff --git a/include/system/memory.h b/include/system/memory.h
> > index fc35a0dcad..1afabf2d94 100644
> > --- a/include/system/memory.h
> > +++ b/include/system/memory.h
> > @@ -775,6 +775,7 @@ struct MemoryRegion {
> >      bool nonvolatile;
> >      bool rom_device;
> >      bool flush_coalesced_mmio;
> > +    bool lockless_ro_io;
> >      bool unmergeable;
> >      uint8_t dirty_log_mask;
> >      bool is_iommu;
> > @@ -2253,6 +2254,13 @@ void memory_region_set_flush_coalesced(MemoryRegion 
> > *mr);
> >   */
> >  void memory_region_clear_flush_coalesced(MemoryRegion *mr);
> >  
> > +/**
> > + * memory_region_enable_lockless_ro_io: Enable lockless (BQL) read-only 
> > access.
> > + *
> > + * Enable BQL-free readonly access for devices with fine-grained locking.
> > + */
> > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr);
> > +
> >  /**
> >   * memory_region_add_eventfd: Request an eventfd to be triggered when a 
> > word
> >   *                            is written to a location.
> > @@ -3002,7 +3010,7 @@ MemTxResult 
> > address_space_write_cached_slow(MemoryRegionCache *cache,
> >                                              hwaddr len);
> >  
> >  int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr);
> > -bool prepare_mmio_access(MemoryRegion *mr);
> > +bool prepare_mmio_access(MemoryRegion *mr, bool read);
> >  
> >  static inline bool memory_region_supports_direct_access(MemoryRegion *mr)
> >  {
> > diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
> > index ea6165ebdc..936a9befd4 100644
> > --- a/hw/remote/vfio-user-obj.c
> > +++ b/hw/remote/vfio-user-obj.c
> > @@ -381,7 +381,7 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t 
> > *buf, hwaddr offset,
> >           * The read/write logic used below is similar to the ones in
> >           * flatview_read/write_continue()
> >           */
> > -        release_lock = prepare_mmio_access(mr);
> > +        release_lock = prepare_mmio_access(mr, !is_write);
> >  
> >          access_size = memory_access_size(mr, size, offset);
> >  
> > diff --git a/system/memory.c b/system/memory.c
> > index 63b983efcd..5192195473 100644
> > --- a/system/memory.c
> > +++ b/system/memory.c
> > @@ -2558,6 +2558,11 @@ void 
> > memory_region_clear_flush_coalesced(MemoryRegion *mr)
> >      }
> >  }
> >  
> > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr)
> > +{
> > +    mr->lockless_ro_io = true;
> > +}
> > +
> >  void memory_region_add_eventfd(MemoryRegion *mr,
> >                                 hwaddr addr,
> >                                 unsigned size,
> > diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc
> > index 7f32d3d9ff..919357578c 100644
> > --- a/system/memory_ldst.c.inc
> > +++ b/system/memory_ldst.c.inc
> > @@ -35,7 +35,7 @@ static inline uint32_t glue(address_space_ldl_internal, 
> > SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
> >      if (l < 4 || !memory_access_is_direct(mr, false, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, true);
> >  
> >          /* I/O case */
> >          r = memory_region_dispatch_read(mr, addr1, &val,
> > @@ -104,7 +104,7 @@ static inline uint64_t glue(address_space_ldq_internal, 
> > SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
> >      if (l < 8 || !memory_access_is_direct(mr, false, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, true);
> >  
> >          /* I/O case */
> >          r = memory_region_dispatch_read(mr, addr1, &val,
> > @@ -171,7 +171,7 @@ uint8_t glue(address_space_ldub, SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
> >      if (!memory_access_is_direct(mr, false, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, true);
> >  
> >          /* I/O case */
> >          r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs);
> > @@ -208,7 +208,7 @@ static inline uint16_t 
> > glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, false, attrs);
> >      if (l < 2 || !memory_access_is_direct(mr, false, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, true);
> >  
> >          /* I/O case */
> >          r = memory_region_dispatch_read(mr, addr1, &val,
> > @@ -278,7 +278,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> >      if (l < 4 || !memory_access_is_direct(mr, true, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, false);
> >  
> >          r = memory_region_dispatch_write(mr, addr1, val, MO_32, attrs);
> >      } else {
> > @@ -315,7 +315,7 @@ static inline void glue(address_space_stl_internal, 
> > SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> >      if (l < 4 || !memory_access_is_direct(mr, true, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, false);
> >          r = memory_region_dispatch_write(mr, addr1, val,
> >                                           MO_32 | devend_memop(endian), 
> > attrs);
> >      } else {
> > @@ -378,7 +378,7 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> >      if (!memory_access_is_direct(mr, true, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, false);
> >          r = memory_region_dispatch_write(mr, addr1, val, MO_8, attrs);
> >      } else {
> >          /* RAM case */
> > @@ -411,7 +411,7 @@ static inline void glue(address_space_stw_internal, 
> > SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> >      if (l < 2 || !memory_access_is_direct(mr, true, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, false);
> >          r = memory_region_dispatch_write(mr, addr1, val,
> >                                           MO_16 | devend_memop(endian), 
> > attrs);
> >      } else {
> > @@ -475,7 +475,7 @@ static void glue(address_space_stq_internal, 
> > SUFFIX)(ARG1_DECL,
> >      RCU_READ_LOCK();
> >      mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> >      if (l < 8 || !memory_access_is_direct(mr, true, attrs)) {
> > -        release_lock |= prepare_mmio_access(mr);
> > +        release_lock |= prepare_mmio_access(mr, false);
> >          r = memory_region_dispatch_write(mr, addr1, val,
> >                                           MO_64 | devend_memop(endian), 
> > attrs);
> >      } else {
> > diff --git a/system/physmem.c b/system/physmem.c
> > index a8a9ca309e..60e330de99 100644
> > --- a/system/physmem.c
> > +++ b/system/physmem.c
> > @@ -2881,11 +2881,12 @@ int memory_access_size(MemoryRegion *mr, unsigned 
> > l, hwaddr addr)
> >      return l;
> >  }
> >  
> > -bool prepare_mmio_access(MemoryRegion *mr)
> > +bool prepare_mmio_access(MemoryRegion *mr, bool read)
> >  {
> >      bool release_lock = false;
> >  
> > -    if (!bql_locked()) {
> > +    if (!bql_locked() &&
> > +        !(read && mr->lockless_ro_io == true)) {
> >          bql_lock();
> >          release_lock = true;
> >      }
> > @@ -2935,7 +2936,7 @@ static MemTxResult 
> > flatview_write_continue_step(MemTxAttrs attrs,
> >      if (!memory_access_is_direct(mr, true, attrs)) {
> >          uint64_t val;
> >          MemTxResult result;
> > -        bool release_lock = prepare_mmio_access(mr);
> > +        bool release_lock = prepare_mmio_access(mr, false);
> >  
> >          *l = memory_access_size(mr, *l, mr_addr);
> >          /*
> > @@ -3032,7 +3033,7 @@ static MemTxResult 
> > flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
> >          /* I/O case */
> >          uint64_t val;
> >          MemTxResult result;
> > -        bool release_lock = prepare_mmio_access(mr);
> > +        bool release_lock = prepare_mmio_access(mr, true);
> >  
> >          *l = memory_access_size(mr, *l, mr_addr);
> >          result = memory_region_dispatch_read(mr, mr_addr, &val, 
> > size_memop(*l),
> > -- 
> > 2.43.5
> >   
> 


Reply via email to