On Fri, Jun 20, 2025 at 05:14:16PM +0200, Igor Mammedov wrote: > This patch brings back Jan's idea [1] of BQL-free IO access, > with a twist that whitelist read access only. > > (as BQL-free write access in [1] used to cause issues [2] > and still does (Windows crash) if write path is not lock protected)
Can we add some explanation on why it would fail on lockless writes? I saw that acpi_pm_tmr_write() is no-op, so I don't yet understand what raced, and also why guest writes to it at all.. Thanks, > > However with limiting it read path only, guest boots without issues. > This will let us make read access ACPI PM/HPET timers cheaper, > and prevent guest VCPUs BQL contention in case of workload > that heavily uses the timers. > > 2) 196ea13104f (memory: Add global-locking property to memory regions) > ... de7ea885c539 (kvm: Switch to unlocked MMIO) > 3) https://bugzilla.redhat.com/show_bug.cgi?id=1322713 > 1beb99f787 (Revert "acpi: mark PMTIMER as unlocked") > > Signed-off-by: Igor Mammedov <imamm...@redhat.com> > --- > include/system/memory.h | 10 +++++++++- > hw/remote/vfio-user-obj.c | 2 +- > system/memory.c | 5 +++++ > system/memory_ldst.c.inc | 18 +++++++++--------- > system/physmem.c | 9 +++++---- > 5 files changed, 29 insertions(+), 15 deletions(-) > > diff --git a/include/system/memory.h b/include/system/memory.h > index fc35a0dcad..1afabf2d94 100644 > --- a/include/system/memory.h > +++ b/include/system/memory.h > @@ -775,6 +775,7 @@ struct MemoryRegion { > bool nonvolatile; > bool rom_device; > bool flush_coalesced_mmio; > + bool lockless_ro_io; > bool unmergeable; > uint8_t dirty_log_mask; > bool is_iommu; > @@ -2253,6 +2254,13 @@ void memory_region_set_flush_coalesced(MemoryRegion > *mr); > */ > void memory_region_clear_flush_coalesced(MemoryRegion *mr); > > +/** > + * memory_region_enable_lockless_ro_io: Enable lockless (BQL) read-only > acceess. > + * > + * Enable BQL-free readonly access for devices with fine-grained locking. > + */ > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr); > + > /** > * memory_region_add_eventfd: Request an eventfd to be triggered when a word > * is written to a location. > @@ -3002,7 +3010,7 @@ MemTxResult > address_space_write_cached_slow(MemoryRegionCache *cache, > hwaddr len); > > int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr); > -bool prepare_mmio_access(MemoryRegion *mr); > +bool prepare_mmio_access(MemoryRegion *mr, bool read); > > static inline bool memory_region_supports_direct_access(MemoryRegion *mr) > { > diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c > index ea6165ebdc..936a9befd4 100644 > --- a/hw/remote/vfio-user-obj.c > +++ b/hw/remote/vfio-user-obj.c > @@ -381,7 +381,7 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t > *buf, hwaddr offset, > * The read/write logic used below is similar to the ones in > * flatview_read/write_continue() > */ > - release_lock = prepare_mmio_access(mr); > + release_lock = prepare_mmio_access(mr, !is_write); > > access_size = memory_access_size(mr, size, offset); > > diff --git a/system/memory.c b/system/memory.c > index 63b983efcd..5192195473 100644 > --- a/system/memory.c > +++ b/system/memory.c > @@ -2558,6 +2558,11 @@ void memory_region_clear_flush_coalesced(MemoryRegion > *mr) > } > } > > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr) > +{ > + mr->lockless_ro_io = true; > +} > + > void memory_region_add_eventfd(MemoryRegion *mr, > hwaddr addr, > unsigned size, > diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc > index 7f32d3d9ff..919357578c 100644 > --- a/system/memory_ldst.c.inc > +++ b/system/memory_ldst.c.inc > @@ -35,7 +35,7 @@ static inline uint32_t glue(address_space_ldl_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > if (l < 4 || !memory_access_is_direct(mr, false, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, true); > > /* I/O case */ > r = memory_region_dispatch_read(mr, addr1, &val, > @@ -104,7 +104,7 @@ static inline uint64_t glue(address_space_ldq_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > if (l < 8 || !memory_access_is_direct(mr, false, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, true); > > /* I/O case */ > r = memory_region_dispatch_read(mr, addr1, &val, > @@ -171,7 +171,7 @@ uint8_t glue(address_space_ldub, SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > if (!memory_access_is_direct(mr, false, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, true); > > /* I/O case */ > r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs); > @@ -208,7 +208,7 @@ static inline uint16_t glue(address_space_lduw_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > if (l < 2 || !memory_access_is_direct(mr, false, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, true); > > /* I/O case */ > r = memory_region_dispatch_read(mr, addr1, &val, > @@ -278,7 +278,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, false); > > r = memory_region_dispatch_write(mr, addr1, val, MO_32, attrs); > } else { > @@ -315,7 +315,7 @@ static inline void glue(address_space_stl_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, false); > r = memory_region_dispatch_write(mr, addr1, val, > MO_32 | devend_memop(endian), > attrs); > } else { > @@ -378,7 +378,7 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > if (!memory_access_is_direct(mr, true, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, false); > r = memory_region_dispatch_write(mr, addr1, val, MO_8, attrs); > } else { > /* RAM case */ > @@ -411,7 +411,7 @@ static inline void glue(address_space_stw_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > if (l < 2 || !memory_access_is_direct(mr, true, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, false); > r = memory_region_dispatch_write(mr, addr1, val, > MO_16 | devend_memop(endian), > attrs); > } else { > @@ -475,7 +475,7 @@ static void glue(address_space_stq_internal, > SUFFIX)(ARG1_DECL, > RCU_READ_LOCK(); > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > if (l < 8 || !memory_access_is_direct(mr, true, attrs)) { > - release_lock |= prepare_mmio_access(mr); > + release_lock |= prepare_mmio_access(mr, false); > r = memory_region_dispatch_write(mr, addr1, val, > MO_64 | devend_memop(endian), > attrs); > } else { > diff --git a/system/physmem.c b/system/physmem.c > index a8a9ca309e..60e330de99 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -2881,11 +2881,12 @@ int memory_access_size(MemoryRegion *mr, unsigned l, > hwaddr addr) > return l; > } > > -bool prepare_mmio_access(MemoryRegion *mr) > +bool prepare_mmio_access(MemoryRegion *mr, bool read) > { > bool release_lock = false; > > - if (!bql_locked()) { > + if (!bql_locked() && > + !(read && mr->lockless_ro_io == true)) { > bql_lock(); > release_lock = true; > } > @@ -2935,7 +2936,7 @@ static MemTxResult > flatview_write_continue_step(MemTxAttrs attrs, > if (!memory_access_is_direct(mr, true, attrs)) { > uint64_t val; > MemTxResult result; > - bool release_lock = prepare_mmio_access(mr); > + bool release_lock = prepare_mmio_access(mr, false); > > *l = memory_access_size(mr, *l, mr_addr); > /* > @@ -3032,7 +3033,7 @@ static MemTxResult > flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf, > /* I/O case */ > uint64_t val; > MemTxResult result; > - bool release_lock = prepare_mmio_access(mr); > + bool release_lock = prepare_mmio_access(mr, true); > > *l = memory_access_size(mr, *l, mr_addr); > result = memory_region_dispatch_read(mr, mr_addr, &val, > size_memop(*l), > -- > 2.43.5 > -- Peter Xu