Hello,

Almost there...

Milos Nikic, on Fri 29 Aug 2025 08:16:26 -0700, wrote:
> From af5cbb943d03c79e1713edcc045ecfa680007fc9 Mon Sep 17 00:00:00 2001
> From: Milos Nikic <[email protected]>
> Date: Mon, 25 Aug 2025 21:38:08 +0100
> Subject: [PATCH] libpager, ext2fs: add bulk page write support
>
> Introduce a new pager_write_pages() entry point, allowing filesystems
> to implement multi-page writes in one call. libpager will attempt to
> coalesce contiguous dirty pages and call this bulk interface; if it is
> unsupported or only partially completes, the remaining pages are
> handled by the existing per-page path.
>
> ext2fs now provides file_pager_write_pages(), implemented using a
> small chunked write loop (currently 2 blocks per chunk) under
> alloc_lock. This reduces lock/unlock cycles, improves batching of
> store_write() calls, and avoids starvation by yielding between chunks.
>
> In testing on a 128 MiB file, average store_write size improved from
> ~4 KiB to ~8 KiB, cutting device write calls roughly in half.

You'll want to update these figures :)

> System
> stress tests all pass without stalls or data corruption.
>
> Other filesystems may continue using pager_write_page() unchanged.
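(The libpager hunks are not quoted in this excerpt; for readers following
along, the fallback described above would presumably have roughly this
shape. Hypothetical sketch only: `upi` and the exact pager_write_pages
signature are assumptions, mirroring file_pager_write_pages further down.)

  error_t err;
  vm_size_t written = 0;
  vm_offset_t o;

  /* Try the bulk entry point first.  */
  err = pager_write_pages (upi, offset, data, length, &written);
  if (err == EOPNOTSUPP)
    /* The filesystem does not implement the bulk entry point.  */
    err = 0;

  /* Whatever the bulk path did not cover goes through the existing
     per-page path, one page at a time.  */
  for (o = written; !err && o < length; o += vm_page_size)
    err = pager_write_page (upi, offset + o, data + o);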
> ---
>  ext2fs/pager.c         | 128 ++++++++++++++++++++++++++++++++++++++++-
>  libpager/Makefile      |   2 +-
>  libpager/data-return.c |  57 +++++++++++++++---
>  libpager/pager-bulk.c  |  37 ++++++++++++
>  libpager/pager.h       |  10 ++++
>  5 files changed, 224 insertions(+), 10 deletions(-)
>  create mode 100644 libpager/pager-bulk.c
>
> diff --git a/ext2fs/pager.c b/ext2fs/pager.c
> index c55107a9..a47fb381 100644
> --- a/ext2fs/pager.c
> +++ b/ext2fs/pager.c
> @@ -83,11 +83,16 @@ do { pthread_spin_lock (&ext2s_pager_stats.lock); \
>  #define STAT_INC(field) /* nop */0
>  #endif /* STATS */
>
> +#ifndef EXT2_BULK_MAX_BLOCKS
> +#define EXT2_BULK_MAX_BLOCKS 128
> +#endif /* EXT2_BULK_MAX_BLOCKS */
> +
>  static void
>  disk_cache_info_free_push (struct disk_cache_info *p);
>
>  #define FREE_PAGE_BUFS 24
>
> +
>  /* Returns a single page page-aligned buffer.  */
>  static void *
>  get_page_buf (void)
> @@ -336,7 +341,6 @@ pending_blocks_write (struct pending_blocks *pb)
>         return err;
>        else if (amount != length)
>         return EIO;
> -
>        pb->offs += length;
>        pb->num = 0;
>      }

Please avoid unrelated whitespace changes.

> @@ -378,6 +382,128 @@ pending_blocks_add (struct pending_blocks *pb, block_t block)
>    pb->num++;
>    return 0;
>  }
> +
> +/* Bulk write across [offset, offset + length).  We coalesce strictly
> +   consecutive blocks into a single device write.  The run ends on the
> +   first non-consecutive block or when we hit the configured cap.
> +   We report partial progress via *written (page-aligned).  */
> +static error_t
> +file_pager_write_pages (struct node *node,
> +                        vm_offset_t offset,
> +                        vm_address_t buf,
> +                        vm_size_t length,
> +                        vm_size_t *written)
> +{
> +  error_t err = 0;
> +  vm_size_t done = 0;  /* bytes successfully issued to the store */
> +  pthread_rwlock_t *lock = &diskfs_node_disknode (node)->alloc_lock;
> +  const unsigned max_blocks = EXT2_BULK_MAX_BLOCKS;
> +
> +  if (written)
> +    *written = 0;
> +
> +  while (done < length)
> +    {
> +      /* Acquire the same lock the per-page path uses.  We keep it
> +         for the short time we enumerate a *single* run and flush it.  */
> +      pthread_rwlock_rdlock (lock);
> +
> +      /* Recompute clipping against allocsize each iteration.  */
> +      vm_size_t left = length - done;
> +      if (offset + done >= node->allocsize)
> +        {
> +          pthread_rwlock_unlock (lock);
> +          break;
> +        }
> +      if (offset + done + left > node->allocsize)
> +        left = node->allocsize - (offset + done);
> +      if (left == 0)
> +        {
> +          pthread_rwlock_unlock (lock);
> +          break;
> +        }
> +
> +      /* Build one run of strictly consecutive on-disk blocks.  */
> +      struct pending_blocks pb;
> +      vm_size_t built = 0;
> +      vm_size_t blocks_built = 0;
> +      block_t prev = 0, blk = 0;
> +      int have_prev = 0;

have_prev actually duplicates prev: since block numbers can never be 0,
you can test prev != 0 directly.
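Untested sketch of what I mean, keeping your variable names (prev == 0
then doubles as "no previous block in this run yet"):

      block_t prev = 0, blk = 0;  /* block numbers are never 0, so
                                     prev == 0 means "no previous block" */
      ...
          if (prev != 0 && blk != prev + 1)
            break;  /* non-consecutive block: end of this coalesced run */
      ...
          prev = blk;  /* have_prev and its update disappear entirely */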
> +
> +      pending_blocks_init (&pb, (void *) (buf + done));
> +
> +      while (blocks_built < max_blocks && built < left)
> +        {
> +          error_t ferr = find_block (node, offset + done + built, &blk, &lock);
> +          if (ferr)
> +            {
> +              err = ferr;
> +              break;
> +            }
> +
> +          assert_backtrace (blk);
> +
> +          if (have_prev && blk != prev + 1)
> +            break;  /* non-consecutive block end of this coalesced run */

But in this case blk is already allocated, so we are leaking it. I guess
the simplest fix is to keep the blk variable across the main while loop:
put an if (!blk) check before the find_block call, and reset blk = 0 at
the end of the inner while loop.

You can also put an assert (blk == 0); at the end of the function to make
sure we didn't miss a leak.
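Untested sketch of that shape, still with your names; the invariant is
that a looked-up block is either added to a run or kept pending in blk
for the next run:

  block_t blk = 0;  /* now declared once, outside the main while loop */

  while (done < length)
    {
      ...
      while (blocks_built < max_blocks && built < left)
        {
          error_t ferr;

          if (!blk)
            {
              /* No block pending from the previous run: look one up.  */
              ferr = find_block (node, offset + done + built, &blk, &lock);
              if (ferr)
                {
                  err = ferr;
                  break;
                }
              assert_backtrace (blk);
            }

          if (prev != 0 && blk != prev + 1)
            /* Non-consecutive: end this run, but keep blk pending for
               the next run instead of dropping it.  */
            break;

          ferr = pending_blocks_add (&pb, blk);
          if (ferr)
            {
              err = ferr;
              break;
            }

          prev = blk;
          blk = 0;  /* consumed by this run */
          built += block_size;
          blocks_built++;
        }
      ...
    }

  /* Any path that still drops a looked-up block will trip this.  */
  assert_backtrace (blk == 0);
  return err;

Note that an error return from pending_blocks_add bails out with blk
still set, so the final assert will also catch that path until it is
handled.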
> +
> +          /* Extend the run by one block.  */
> +          ferr = pending_blocks_add (&pb, blk);
> +          if (ferr)
> +            {
> +              err = ferr;
> +              break;
> +            }
> +
> +          prev = blk;
> +          have_prev = 1;
> +          built += block_size;
> +          blocks_built++;
> +        }
> +
> +      /* Flush exactly one coalesced run; even if the loop broke early,
> +         we may have a valid prefix to push.  */
> +      error_t werr = pending_blocks_write (&pb);
> +      if (!err)
> +        err = werr;
> +
> +      pthread_rwlock_unlock (lock);
> +
> +      /* Advance only by what we actually enumerated and flushed.  */
> +      done += built;
> +
> +      /* Stop on error or if we could not make any progress.  */
> +      if (err || built == 0)

When might we make no progress without errors?

Samuel