On Fri, 20 Jun 2025 12:53:06 -0400 Peter Xu <pet...@redhat.com> wrote:
> On Fri, Jun 20, 2025 at 05:14:16PM +0200, Igor Mammedov wrote: > > This patch brings back Jan's idea [1] of BQL-free IO access, > > with a twist that whitelist read access only. > > > > (as BQL-free write access in [1] used to cause issues [2] > > and still does (Windows crash) if write path is not lock protected) > > Can we add some explanation on why it would fail on lockless writes? > > I saw that acpi_pm_tmr_write() is no-op, so I don't yet understand what > raced, and also why guest writes to it at all.. root cause wasn't diagnosed back then, and I haven't able to reproduce that as well. So I erred on side of caution and implemented RO only. Theoretically write should be fine too, but I don't have an idea how to test that. > > Thanks, > > > > > However with limiting it read path only, guest boots without issues. > > This will let us make read access ACPI PM/HPET timers cheaper, > > and prevent guest VCPUs BQL contention in case of workload > > that heavily uses the timers. > > > > 2) 196ea13104f (memory: Add global-locking property to memory regions) > > ... de7ea885c539 (kvm: Switch to unlocked MMIO) > > 3) https://bugzilla.redhat.com/show_bug.cgi?id=1322713 > > 1beb99f787 (Revert "acpi: mark PMTIMER as unlocked") > > > > Signed-off-by: Igor Mammedov <imamm...@redhat.com> > > --- > > include/system/memory.h | 10 +++++++++- > > hw/remote/vfio-user-obj.c | 2 +- > > system/memory.c | 5 +++++ > > system/memory_ldst.c.inc | 18 +++++++++--------- > > system/physmem.c | 9 +++++---- > > 5 files changed, 29 insertions(+), 15 deletions(-) > > > > diff --git a/include/system/memory.h b/include/system/memory.h > > index fc35a0dcad..1afabf2d94 100644 > > --- a/include/system/memory.h > > +++ b/include/system/memory.h > > @@ -775,6 +775,7 @@ struct MemoryRegion { > > bool nonvolatile; > > bool rom_device; > > bool flush_coalesced_mmio; > > + bool lockless_ro_io; > > bool unmergeable; > > uint8_t dirty_log_mask; > > bool is_iommu; > > @@ -2253,6 +2254,13 @@ void memory_region_set_flush_coalesced(MemoryRegion > > *mr); > > */ > > void memory_region_clear_flush_coalesced(MemoryRegion *mr); > > > > +/** > > + * memory_region_enable_lockless_ro_io: Enable lockless (BQL) read-only > > acceess. > > + * > > + * Enable BQL-free readonly access for devices with fine-grained locking. > > + */ > > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr); > > + > > /** > > * memory_region_add_eventfd: Request an eventfd to be triggered when a > > word > > * is written to a location. > > @@ -3002,7 +3010,7 @@ MemTxResult > > address_space_write_cached_slow(MemoryRegionCache *cache, > > hwaddr len); > > > > int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr); > > -bool prepare_mmio_access(MemoryRegion *mr); > > +bool prepare_mmio_access(MemoryRegion *mr, bool read); > > > > static inline bool memory_region_supports_direct_access(MemoryRegion *mr) > > { > > diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c > > index ea6165ebdc..936a9befd4 100644 > > --- a/hw/remote/vfio-user-obj.c > > +++ b/hw/remote/vfio-user-obj.c > > @@ -381,7 +381,7 @@ static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t > > *buf, hwaddr offset, > > * The read/write logic used below is similar to the ones in > > * flatview_read/write_continue() > > */ > > - release_lock = prepare_mmio_access(mr); > > + release_lock = prepare_mmio_access(mr, !is_write); > > > > access_size = memory_access_size(mr, size, offset); > > > > diff --git a/system/memory.c b/system/memory.c > > index 63b983efcd..5192195473 100644 > > --- a/system/memory.c > > +++ b/system/memory.c > > @@ -2558,6 +2558,11 @@ void > > memory_region_clear_flush_coalesced(MemoryRegion *mr) > > } > > } > > > > +void memory_region_enable_lockless_ro_io(MemoryRegion *mr) > > +{ > > + mr->lockless_ro_io = true; > > +} > > + > > void memory_region_add_eventfd(MemoryRegion *mr, > > hwaddr addr, > > unsigned size, > > diff --git a/system/memory_ldst.c.inc b/system/memory_ldst.c.inc > > index 7f32d3d9ff..919357578c 100644 > > --- a/system/memory_ldst.c.inc > > +++ b/system/memory_ldst.c.inc > > @@ -35,7 +35,7 @@ static inline uint32_t glue(address_space_ldl_internal, > > SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > > if (l < 4 || !memory_access_is_direct(mr, false, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, true); > > > > /* I/O case */ > > r = memory_region_dispatch_read(mr, addr1, &val, > > @@ -104,7 +104,7 @@ static inline uint64_t glue(address_space_ldq_internal, > > SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > > if (l < 8 || !memory_access_is_direct(mr, false, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, true); > > > > /* I/O case */ > > r = memory_region_dispatch_read(mr, addr1, &val, > > @@ -171,7 +171,7 @@ uint8_t glue(address_space_ldub, SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > > if (!memory_access_is_direct(mr, false, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, true); > > > > /* I/O case */ > > r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs); > > @@ -208,7 +208,7 @@ static inline uint16_t > > glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, false, attrs); > > if (l < 2 || !memory_access_is_direct(mr, false, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, true); > > > > /* I/O case */ > > r = memory_region_dispatch_read(mr, addr1, &val, > > @@ -278,7 +278,7 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, false); > > > > r = memory_region_dispatch_write(mr, addr1, val, MO_32, attrs); > > } else { > > @@ -315,7 +315,7 @@ static inline void glue(address_space_stl_internal, > > SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > if (l < 4 || !memory_access_is_direct(mr, true, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, false); > > r = memory_region_dispatch_write(mr, addr1, val, > > MO_32 | devend_memop(endian), > > attrs); > > } else { > > @@ -378,7 +378,7 @@ void glue(address_space_stb, SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > if (!memory_access_is_direct(mr, true, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, false); > > r = memory_region_dispatch_write(mr, addr1, val, MO_8, attrs); > > } else { > > /* RAM case */ > > @@ -411,7 +411,7 @@ static inline void glue(address_space_stw_internal, > > SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > if (l < 2 || !memory_access_is_direct(mr, true, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, false); > > r = memory_region_dispatch_write(mr, addr1, val, > > MO_16 | devend_memop(endian), > > attrs); > > } else { > > @@ -475,7 +475,7 @@ static void glue(address_space_stq_internal, > > SUFFIX)(ARG1_DECL, > > RCU_READ_LOCK(); > > mr = TRANSLATE(addr, &addr1, &l, true, attrs); > > if (l < 8 || !memory_access_is_direct(mr, true, attrs)) { > > - release_lock |= prepare_mmio_access(mr); > > + release_lock |= prepare_mmio_access(mr, false); > > r = memory_region_dispatch_write(mr, addr1, val, > > MO_64 | devend_memop(endian), > > attrs); > > } else { > > diff --git a/system/physmem.c b/system/physmem.c > > index a8a9ca309e..60e330de99 100644 > > --- a/system/physmem.c > > +++ b/system/physmem.c > > @@ -2881,11 +2881,12 @@ int memory_access_size(MemoryRegion *mr, unsigned > > l, hwaddr addr) > > return l; > > } > > > > -bool prepare_mmio_access(MemoryRegion *mr) > > +bool prepare_mmio_access(MemoryRegion *mr, bool read) > > { > > bool release_lock = false; > > > > - if (!bql_locked()) { > > + if (!bql_locked() && > > + !(read && mr->lockless_ro_io == true)) { > > bql_lock(); > > release_lock = true; > > } > > @@ -2935,7 +2936,7 @@ static MemTxResult > > flatview_write_continue_step(MemTxAttrs attrs, > > if (!memory_access_is_direct(mr, true, attrs)) { > > uint64_t val; > > MemTxResult result; > > - bool release_lock = prepare_mmio_access(mr); > > + bool release_lock = prepare_mmio_access(mr, false); > > > > *l = memory_access_size(mr, *l, mr_addr); > > /* > > @@ -3032,7 +3033,7 @@ static MemTxResult > > flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf, > > /* I/O case */ > > uint64_t val; > > MemTxResult result; > > - bool release_lock = prepare_mmio_access(mr); > > + bool release_lock = prepare_mmio_access(mr, true); > > > > *l = memory_access_size(mr, *l, mr_addr); > > result = memory_region_dispatch_read(mr, mr_addr, &val, > > size_memop(*l), > > -- > > 2.43.5 > > >