When a guest reports free pages to the hypervisor via the page reporting
framework (used by virtio-balloon and hv_balloon), the host typically
zeros those pages when reclaiming their backing memory.  However, when
those pages are later allocated in the guest, post_alloc_hook()
unconditionally zeros them again if __GFP_ZERO is set.  This
double-zeroing is wasteful, especially for large pages.

Avoid redundant zeroing:

- Add a host_zeroes_pages flag to page_reporting_dev_info, allowing
  drivers to declare that their host zeros reported pages on reclaim.
  A static key (page_reporting_host_zeroes) gates the fast path.

- Add PG_zeroed page flag (sharing PG_private bit) to mark pages
  that have been zeroed by the host.  Set it alongside PG_reported in
  page_reporting_drain(), and test-and-clear it when the page is
  removed from the free lists in rmqueue_buddy() and
  __rmqueue_pcplist().

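- Thread the zeroed bool through rmqueue -> prep_new_page ->
  post_alloc_hook, where it skips redundant zeroing for __GFP_ZERO
  allocations unless memory tags also need to be zeroed.  Paths that
  do not allocate through rmqueue (compaction, captured pages,
  contiguous ranges) pass false and always initialize.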

No driver sets host_zeroes_pages yet; a follow-up patch to
virtio_balloon is needed to opt in.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Assisted-by: Claude:claude-opus-4-6
Assisted-by: cursor-agent:GPT-5.4-xhigh
---
 include/linux/mm.h             | 28 +++++++++++++++++
 include/linux/page-flags.h     | 12 ++++++-
 include/linux/page_reporting.h |  3 ++
 mm/compaction.c                |  5 +--
 mm/internal.h                  |  2 +-
 mm/page_alloc.c                | 57 ++++++++++++++++++++++------------
 mm/page_reporting.c            | 14 ++++++++-
 mm/page_reporting.h            | 12 +++++++
 8 files changed, 108 insertions(+), 25 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 541d36e5e420..821034dd33d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4817,6 +4817,34 @@ static inline bool user_alloc_needs_zeroing(void)
                                   &init_on_alloc);
 }
 
+/**
+ * __page_test_clear_zeroed - test and clear the zeroed marker.
+ * @page: the page to test.
+ *
+ * Clears PG_zeroed if set.  Caller must have exclusive access to @page.
+ * Return: true if the marker was set (page known zeroed), false otherwise.
+ */
+static inline bool __page_test_clear_zeroed(struct page *page)
+{
+       if (PageZeroed(page)) {
+               __ClearPageZeroed(page);
+               return true;
+       }
+       return false;
+}
+
+/**
+ * folio_test_clear_zeroed - test and clear the zeroed marker.
+ * @folio: the folio to test.
+ *
+ * Tests and clears the marker on the folio's first page (&folio->page).
+ * Return: true if the marker was set, so the caller can skip its own zeroing.
+ */
+static inline bool folio_test_clear_zeroed(struct folio *folio)
+{
+       return __page_test_clear_zeroed(&folio->page);
+}
+
 int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user 
*status);
 int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
 int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f7a0e4af0c73..aa0de99247d4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -135,6 +135,8 @@ enum pageflags {
        PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
        /* Some filesystems */
        PG_checked = PG_owner_priv_1,
+       /* Page contents are known to be zero */
+       PG_zeroed = PG_private,
 
        /*
         * Depending on the way an anonymous folio can be mapped into a page
@@ -679,6 +681,13 @@ FOLIO_TEST_CLEAR_FLAG_FALSE(young)
 FOLIO_FLAG_FALSE(idle)
 #endif
 
+/*
+ * PageZeroed() tracks pages known to be zero.  The allocator
+ * uses this to skip redundant zeroing in post_alloc_hook().
+ */
+__PAGEFLAG(Zeroed, zeroed, PF_NO_COMPOUND)
+#define __PG_ZEROED (1UL << PG_zeroed)
+
 /*
  * PageReported() is used to track reported free pages within the Buddy
  * allocator. We can use the non-atomic version of the test and set
@@ -1207,9 +1216,10 @@ static __always_inline void 
__ClearPageAnonExclusive(struct page *page)
  *
  * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
  * alloc-free cycle to prevent from reusing the page.
+ * __PG_ZEROED survives alloc-free cycles to track known-zero pages.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP       \
-       ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
+       ((PAGEFLAGS_MASK & ~(__PG_HWPOISON | __PG_ZEROED)) | LRU_GEN_MASK | 
LRU_REFS_MASK)
 
 /*
  * Flags stored in the second page of a compound page.  They may overlap
diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index fe648dfa3a7c..10faadfeb4fb 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -13,6 +13,9 @@ struct page_reporting_dev_info {
        int (*report)(struct page_reporting_dev_info *prdev,
                      struct scatterlist *sg, unsigned int nents);
 
+       /* If true, host zeros reported pages on reclaim */
+       bool host_zeroes_pages;
+
        /* work struct for processing reports */
        struct delayed_work work;
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 82f2914962f5..3d9ae727a98a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -82,7 +82,8 @@ static inline bool is_via_compact_memory(int order) { return 
false; }
 
 static struct page *mark_allocated_noprof(struct page *page, unsigned int 
order, gfp_t gfp_flags)
 {
-       post_alloc_hook(page, order, __GFP_MOVABLE, USER_ADDR_NONE)
+       post_alloc_hook(page, order, __GFP_MOVABLE, false, USER_ADDR_NONE);
+       set_page_refcounted(page);
        return page;
 }
 #define mark_allocated(...)    alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
@@ -1831,7 +1832,7 @@ static struct folio *compaction_alloc_noprof(struct folio 
*src, unsigned long da
                set_page_private(&freepage[size], start_order);
        }
        dst = (struct folio *)freepage;
-       post_alloc_hook(&dst->page, order, __GFP_MOVABLE, USER_ADDR_NONE);
+       post_alloc_hook(&dst->page, order, __GFP_MOVABLE, false, 
USER_ADDR_NONE);
        set_page_refcounted(&dst->page);
        if (order)
                prep_compound_page(&dst->page, order);
diff --git a/mm/internal.h b/mm/internal.h
index 0b9c0bd133d3..4c33249e03f0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -889,7 +889,7 @@ static inline void prep_compound_tail(struct page *head, 
int tail_idx)
 }
 
 void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags,
-                    unsigned long user_addr);
+                    bool zeroed, unsigned long user_addr);
 extern bool free_pages_prepare(struct page *page, unsigned int order);
 
 extern int user_min_free_kbytes;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 211e9e32b91d..2098d569d80c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1774,6 +1774,7 @@ static __always_inline void page_del_and_expand(struct 
zone *zone,
        bool was_reported = page_reported(page);
 
        __del_page_from_free_list(page, zone, high, migratetype);
+
        nr_pages -= expand(zone, page, low, high, migratetype, was_reported);
        account_freepages(zone, -nr_pages, migratetype);
 }
@@ -1846,8 +1847,10 @@ static inline bool should_skip_init(gfp_t flags)
        return (flags & __GFP_SKIP_ZERO);
 }
 
+
 inline void post_alloc_hook(struct page *page, unsigned int order,
-                               gfp_t gfp_flags, unsigned long user_addr)
+                               gfp_t gfp_flags, bool zeroed,
+                               unsigned long user_addr)
 {
        bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
                        !should_skip_init(gfp_flags);
@@ -1856,6 +1859,14 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
 
        set_page_private(page, 0);
 
+       /*
+        * If the page is zeroed, skip memory initialization.
+        * We still need to handle tag zeroing separately since the host
+        * does not know about memory tags.
+        */
+       if (zeroed && init && !zero_tags)
+               init = false;
+
        arch_alloc_page(page, order);
        debug_pagealloc_map_pages(page, 1 << order);
 
@@ -1913,13 +1924,13 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t 
gfp_flags,
-                                                       unsigned int 
alloc_flags,
-                                                       unsigned long user_addr)
+                         unsigned int alloc_flags, bool zeroed,
+                         unsigned long user_addr)
 {
        if (order && (gfp_flags & __GFP_COMP))
                prep_compound_page(page, order);
 
-       post_alloc_hook(page, order, gfp_flags, user_addr);
+       post_alloc_hook(page, order, gfp_flags, zeroed, user_addr);
 
        /*
         * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
@@ -3261,7 +3272,7 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z,
 static __always_inline
 struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
                           unsigned int order, unsigned int alloc_flags,
-                          int migratetype)
+                          int migratetype, bool *zeroed)
 {
        struct page *page;
        unsigned long flags;
@@ -3296,6 +3307,7 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, 
struct zone *zone,
                        }
                }
                spin_unlock_irqrestore(&zone->lock, flags);
+               *zeroed = __page_test_clear_zeroed(page);
        } while (check_new_pages(page, order));
 
        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
@@ -3357,10 +3369,9 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, 
struct zone *zone, int order)
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
-                       int migratetype,
-                       unsigned int alloc_flags,
+                       int migratetype, unsigned int alloc_flags,
                        struct per_cpu_pages *pcp,
-                       struct list_head *list)
+                       struct list_head *list, bool *zeroed)
 {
        struct page *page;
 
@@ -3381,6 +3392,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, 
unsigned int order,
                page = list_first_entry(list, struct page, pcp_list);
                list_del(&page->pcp_list);
                pcp->count -= 1 << order;
+               *zeroed = __page_test_clear_zeroed(page);
        } while (check_new_pages(page, order));
 
        return page;
@@ -3389,7 +3401,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, 
unsigned int order,
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       int migratetype, unsigned int alloc_flags)
+                       int migratetype, unsigned int alloc_flags,
+                       bool *zeroed)
 {
        struct per_cpu_pages *pcp;
        struct list_head *list;
@@ -3408,7 +3421,8 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
         */
        pcp->free_count >>= 1;
        list = &pcp->lists[order_to_pindex(migratetype, order)];
-       page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, 
list);
+       page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags,
+                                pcp, list, zeroed);
        pcp_spin_unlock(pcp, UP_flags);
        if (page) {
                __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
@@ -3433,19 +3447,19 @@ static inline
 struct page *rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
                        gfp_t gfp_flags, unsigned int alloc_flags,
-                       int migratetype)
+                       int migratetype, bool *zeroed)
 {
        struct page *page;
 
        if (likely(pcp_allowed_order(order))) {
                page = rmqueue_pcplist(preferred_zone, zone, order,
-                                      migratetype, alloc_flags);
+                                      migratetype, alloc_flags, zeroed);
                if (likely(page))
                        goto out;
        }
 
        page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
-                                                       migratetype);
+                            migratetype, zeroed);
 
 out:
        /* Separate test+clear to avoid unnecessary atomics */
@@ -3836,6 +3850,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int 
order, int alloc_flags,
        struct pglist_data *last_pgdat = NULL;
        bool last_pgdat_dirty_ok = false;
        bool no_fallback;
+       bool zeroed = false;    /* overwritten by rmqueue() when it returns a page */
        bool skip_kswapd_nodes = nr_online_nodes > 1;
        bool skipped_kswapd_nodes = false;
 
@@ -3980,10 +3995,11 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int 
order, int alloc_flags,
 
 try_this_zone:
                page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, 
order,
-                               gfp_mask, alloc_flags, ac->migratetype);
+                                       gfp_mask, alloc_flags, ac->migratetype,
+                                       &zeroed);
                if (page) {
                        prep_new_page(page, order, gfp_mask, alloc_flags,
-                                     ac->user_addr);
+                                     zeroed, ac->user_addr);
 
                        /*
                         * If this is a high-order atomic allocation then check
@@ -4218,7 +4234,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int 
order,
 
        /* Prep a captured page if available */
        if (page)
-               prep_new_page(page, order, gfp_mask, alloc_flags,
+               prep_new_page(page, order, gfp_mask, alloc_flags, false,
                              ac->user_addr);
 
        /* Try get a page from the freelist if available */
@@ -5193,6 +5209,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
preferred_nid,
        /* Attempt the batch allocation */
        pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
        while (nr_populated < nr_pages) {
+               bool zeroed = false;
 
                /* Skip existing pages */
                if (page_array[nr_populated]) {
@@ -5201,7 +5218,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
preferred_nid,
                }
 
                page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-                                                               pcp, pcp_list);
+                                        pcp, pcp_list, &zeroed);
                if (unlikely(!page)) {
                        /* Try and allocate at least one page */
                        if (!nr_account) {
@@ -5212,7 +5229,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int 
preferred_nid,
                }
                nr_account++;
 
-               prep_new_page(page, 0, gfp, 0, USER_ADDR_NONE);
+               prep_new_page(page, 0, gfp, 0, zeroed, USER_ADDR_NONE);
                set_page_refcounted(page);
                page_array[nr_populated++] = page;
        }
@@ -6938,7 +6955,7 @@ static void split_free_frozen_pages(struct list_head 
*list, gfp_t gfp_mask)
                list_for_each_entry_safe(page, next, &list[order], lru) {
                        int i;
 
-                       post_alloc_hook(page, order, gfp_mask, USER_ADDR_NONE);
+                       post_alloc_hook(page, order, gfp_mask, false, 
USER_ADDR_NONE);
                        if (!order)
                                continue;
 
@@ -7144,7 +7161,7 @@ int alloc_contig_frozen_range_noprof(unsigned long start, 
unsigned long end,
                struct page *head = pfn_to_page(start);
 
                check_new_pages(head, order);
-               prep_new_page(head, order, gfp_mask, 0, USER_ADDR_NONE);
+               prep_new_page(head, order, gfp_mask, 0, false, USER_ADDR_NONE);
        } else {
                ret = -EINVAL;
                WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, 
%lu)\n",
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index f0042d5743af..6177d2413743 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL_GPL(page_reporting_order);
 #define PAGE_REPORTING_DELAY   (2 * HZ)
 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
 
+DEFINE_STATIC_KEY_FALSE(page_reporting_host_zeroes);
+
 enum {
        PAGE_REPORTING_IDLE = 0,
        PAGE_REPORTING_REQUESTED,
@@ -129,8 +131,11 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
                 * report on the new larger page when we make our way
                 * up to that higher order.
                 */
-               if (PageBuddy(page) && buddy_order(page) == order)
+               if (PageBuddy(page) && buddy_order(page) == order) {
                        __SetPageReported(page);
+                       if (page_reporting_host_zeroes_pages())
+                               __SetPageZeroed(page);
+               }
        } while ((sg = sg_next(sg)));
 
        /* reinitialize scatterlist now that it is empty */
@@ -386,6 +391,10 @@ int page_reporting_register(struct page_reporting_dev_info 
*prdev)
        /* Assign device to allow notifications */
        rcu_assign_pointer(pr_dev_info, prdev);
 
+       /* enable zeroed page optimization if host zeroes reported pages */
+       if (prdev->host_zeroes_pages)
+               static_branch_enable(&page_reporting_host_zeroes);
+
        /* enable page reporting notification */
        if (!static_key_enabled(&page_reporting_enabled)) {
                static_branch_enable(&page_reporting_enabled);
@@ -410,6 +419,9 @@ void page_reporting_unregister(struct 
page_reporting_dev_info *prdev)
 
                /* Flush any existing work, and lock it out */
                cancel_delayed_work_sync(&prdev->work);
+
+               if (prdev->host_zeroes_pages)
+                       static_branch_disable(&page_reporting_host_zeroes);
        }
 
        mutex_unlock(&page_reporting_mutex);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index c51dbc228b94..736ea7b37e9e 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -15,6 +15,13 @@ DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
 extern unsigned int page_reporting_order;
 void __page_reporting_notify(void);
 
+DECLARE_STATIC_KEY_FALSE(page_reporting_host_zeroes);
+
+static inline bool page_reporting_host_zeroes_pages(void)
+{
+       return static_branch_unlikely(&page_reporting_host_zeroes);
+}
+
 static inline bool page_reported(struct page *page)
 {
        return static_branch_unlikely(&page_reporting_enabled) &&
@@ -46,6 +53,11 @@ static inline void page_reporting_notify_free(unsigned int 
order)
 #else /* CONFIG_PAGE_REPORTING */
 #define page_reported(_page)   false
 
+static inline bool page_reporting_host_zeroes_pages(void)
+{
+       return false;
+}
+
 static inline void page_reporting_notify_free(unsigned int order)
 {
 }
-- 
MST


Reply via email to