Hello,

Almost there...

Milos Nikic, le ven. 29 août 2025 08:16:26 -0700, a écrit:
> From af5cbb943d03c79e1713edcc045ecfa680007fc9 Mon Sep 17 00:00:00 2001
> From: Milos Nikic <[email protected]>
> Date: Mon, 25 Aug 2025 21:38:08 +0100
> Subject: [PATCH] libpager, ext2fs: add bulk page write support
> 
> Introduce a new pager_write_pages() entry point, allowing filesystems
> to implement multi-page writes in one call.  libpager will attempt to
> coalesce contiguous dirty pages and call this bulk interface; if it is
> unsupported or only partially completes, the remaining pages are
> handled by the existing per-page path.
> 
> ext2fs now provides file_pager_write_pages(), implemented using a
> small chunked write loop (currently 2 blocks per chunk) under
> alloc_lock.  This reduces lock/unlock cycles, improves batching of
> store_write() calls, and avoids starvation by yielding between chunks.
> 
> In testing on a 128 MiB file, average store_write size improved from
> ~4 KiB to ~8 KiB, cutting device write calls roughly in half.

You will want to update these figures :)

> System
> stress tests all pass without stalls or data corruption.
> 
> Other filesystems may continue using pager_write_page() unchanged.
> ---
>  ext2fs/pager.c         | 128 ++++++++++++++++++++++++++++++++++++++++-
>  libpager/Makefile      |   2 +-
>  libpager/data-return.c |  57 +++++++++++++++---
>  libpager/pager-bulk.c  |  37 ++++++++++++
>  libpager/pager.h       |  10 ++++
>  5 files changed, 224 insertions(+), 10 deletions(-)
>  create mode 100644 libpager/pager-bulk.c
> 
> diff --git a/ext2fs/pager.c b/ext2fs/pager.c
> index c55107a9..a47fb381 100644
> --- a/ext2fs/pager.c
> +++ b/ext2fs/pager.c
> @@ -83,11 +83,16 @@ do { pthread_spin_lock (&ext2s_pager_stats.lock);         
>               \
>  #define STAT_INC(field) /* nop */0
>  #endif /* STATS */
>  
> +#ifndef EXT2_BULK_MAX_BLOCKS
> +#define EXT2_BULK_MAX_BLOCKS 128
> +#endif  /* EXT2_BULK_MAX_BLOCKS */
> +
>  static void
>  disk_cache_info_free_push (struct disk_cache_info *p);
>  
>  #define FREE_PAGE_BUFS 24
>  
> +
>  /* Returns a single page page-aligned buffer.  */
>  static void *
>  get_page_buf (void)
> @@ -336,7 +341,6 @@ pending_blocks_write (struct pending_blocks *pb)
>       return err;
>        else if (amount != length)
>       return EIO;
> -
>        pb->offs += length;
>        pb->num = 0;
>      }

Please avoid unrelated whitespace changes.

> @@ -378,6 +382,128 @@ pending_blocks_add (struct pending_blocks *pb, block_t 
> block)
>    pb->num++;
>    return 0;
>  }
> +
> +/* Bulk write across [offset, offset + length).  We coalesce strictly
> +   consecutive blocks into a single device write.  The run ends on the
> +   first non-consecutive block or when we hit the configured cap.
> +   We report partial progress via *written (page-aligned).  */
> +static error_t
> +file_pager_write_pages (struct node *node,
> +                        vm_offset_t offset,
> +                        vm_address_t buf,
> +                        vm_size_t length,
> +                        vm_size_t *written)
> +{
> +  error_t err = 0;
> +  vm_size_t done = 0;   /* bytes successfully issued to the store */
> +  pthread_rwlock_t *lock = &diskfs_node_disknode (node)->alloc_lock;
> +  const unsigned max_blocks = EXT2_BULK_MAX_BLOCKS; 
> +
> +  if (written)
> +    *written = 0;
> +
> +  while (done < length)
> +    {
> +      /* Acquire the same lock the per-page path uses.  We keep it
> +         for the short time we enumerate a *single* run and flush it.  */
> +      pthread_rwlock_rdlock (lock);
> +
> +      /* Recompute clipping against allocsize each iteration.  */
> +      vm_size_t left = length - done;
> +      if (offset + done >= node->allocsize)
> +        {
> +          pthread_rwlock_unlock (lock);
> +          break;
> +        }
> +      if (offset + done + left > node->allocsize)
> +        left = node->allocsize - (offset + done);
> +      if (left == 0)
> +        {
> +          pthread_rwlock_unlock (lock);
> +          break;
> +        }
> +
> +      /* Build one run of strictly consecutive on-disk blocks.  */
> +      struct pending_blocks pb;
> +      vm_size_t built = 0;
> +      vm_size_t blocks_built = 0;
> +      block_t prev = 0, blk = 0;
> +      int have_prev = 0;

have_prev is actually redundant with prev: since block numbers can never
be 0, you can directly test prev != 0.

> +
> +      pending_blocks_init (&pb, (void *) (buf + done));
> +
> +      while (blocks_built < max_blocks && built < left)
> +        {
> +          error_t ferr = find_block (node, offset + done + built, &blk, 
> &lock);
> +          if (ferr)
> +            {
> +              err = ferr;
> +              break;
> +            }
> +
> +          assert_backtrace (blk);
> +
> +          if (have_prev && blk != prev + 1)
> +            break; /* non-consecutive block end of this coalesced run */

But in this case blk is already allocated, so we are leaking it.

I guess the simplest would be to keep the blk variable across the main
while loop: put an if (!blk) before the find_block call, and set
blk = 0 at the end of the inner while loop. You can also put an
assert (blk == 0); at the end of the function to make sure that we
didn't miss a leak.

> +
> +          /* Extend the run by one block.  */
> +          ferr = pending_blocks_add (&pb, blk);
> +          if (ferr)
> +            {
> +              err = ferr;
> +              break;
> +            }
> +
> +          prev = blk;
> +          have_prev = 1;
> +          built += block_size;
> +          blocks_built++;
> +        }
> +
> +      /* Flush exactly one coalesced run; even if the loop broke early,
> +         we may have a valid prefix to push.  */
> +      error_t werr = pending_blocks_write (&pb);
> +      if (!err)
> +        err = werr;
> +
> +      pthread_rwlock_unlock (lock);
> +
> +      /* Advance only by what we actually enumerated and flushed.  */
> +      done += built;
> +
> +      /* Stop on error or if we could not make any progress.  */
> +      if (err || built == 0)

When might we make no progress without errors?

Samuel

Reply via email to