The unmapped parameter was added to __folio_split() and related call
sites to support splitting folios that are already in the midst of a
migration. This special case arose for device private folio migration,
since during migration the source and destination folio sizes can
differ.
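
For illustration, with this change the device private migration path
splits an already-unmapped THP roughly as follows (a condensed sketch
of the mm/migrate_device.c hunk below; the prototype of
migrate_vma_split_unmapped_folio() is abbreviated and the surrounding
error handling is elided):

	static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate, ...)
	{
		...
		/* The folio is locked, already unmapped and off the LRU. */
		folio_get(folio);
		split_huge_pmd_address(migrate->vma, addr, true);

		/* New helper: split to order 0, no remap or LRU handling. */
		ret = folio_split_unmapped(folio, 0);
		if (ret)
			return ret;
		migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
		...
	}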

Introduce folio_split_unmapped() to handle this special case. Also
refactor the code and add a __folio_freeze_and_split_unmapped() helper
that is common to both __folio_split() and folio_split_unmapped(). This
in turn removes the special casing introduced by the unmapped parameter
in __folio_split().

Cc: Andrew Morton <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Zi Yan <[email protected]>
Cc: Joshua Hahn <[email protected]>
Cc: Rakie Kim <[email protected]>
Cc: Byungchul Park <[email protected]>
Cc: Gregory Price <[email protected]>
Cc: Ying Huang <[email protected]>
Cc: Alistair Popple <[email protected]>
Cc: Oscar Salvador <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Baolin Wang <[email protected]>
Cc: "Liam R. Howlett" <[email protected]>
Cc: Nico Pache <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Dev Jain <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Lyude Paul <[email protected]>
Cc: Danilo Krummrich <[email protected]>
Cc: David Airlie <[email protected]>
Cc: Simona Vetter <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: Mika Penttilä <[email protected]>
Cc: Matthew Brost <[email protected]>
Cc: Francois Dugast <[email protected]>
Suggested-by: Zi Yan <[email protected]>
Signed-off-by: Balbir Singh <[email protected]>
---
 include/linux/huge_mm.h |   5 +-
 mm/huge_memory.c        | 332 ++++++++++++++++++++++------------
 mm/migrate_device.c     |   3 +-
 3 files changed, 191 insertions(+), 149 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e2e91aa1a042..1d439de1ca2c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -371,7 +371,8 @@ enum split_type {
 
 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-		unsigned int new_order, bool unmapped);
+		unsigned int new_order);
+int folio_split_unmapped(struct folio *folio, unsigned int new_order);
 int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
 bool folio_split_supported(struct folio *folio, unsigned int new_order,
@@ -382,7 +383,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
 static inline int split_huge_page_to_list_to_order(struct page *page,
 		struct list_head *list, unsigned int new_order)
 {
-	return __split_huge_page_to_list_to_order(page, list, new_order, false);
+	return __split_huge_page_to_list_to_order(page, list, new_order);
 }
 static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e4dadcb9e90b..afc10079724c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3739,6 +3739,151 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
 	return true;
 }
 
+static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
+		struct page *split_at, struct xa_state *xas,
+		struct address_space *mapping, bool do_lru,
+		struct list_head *list, enum split_type split_type,
+		pgoff_t end, int extra_pins)
+{
+	struct folio *end_folio = folio_next(folio);
+	struct folio *new_folio, *next;
+	int old_order = folio_order(folio);
+	int nr_shmem_dropped = 0;
+	int ret = 0;
+	struct deferred_split *ds_queue;
+
+	/* Prevent deferred_split_scan() touching ->_refcount */
+	ds_queue = folio_split_queue_lock(folio);
+	if (folio_ref_freeze(folio, 1 + extra_pins)) {
+		struct swap_cluster_info *ci = NULL;
+		struct lruvec *lruvec;
+		int expected_refs;
+
+		if (old_order > 1) {
+			if (!list_empty(&folio->_deferred_list)) {
+				ds_queue->split_queue_len--;
+				/*
+				 * Reinitialize page_deferred_list after removing the
+				 * page from the split_queue, otherwise a subsequent
+				 * split will see list corruption when checking the
+				 * page_deferred_list.
+				 */
+				list_del_init(&folio->_deferred_list);
+			}
+			if (folio_test_partially_mapped(folio)) {
+				folio_clear_partially_mapped(folio);
+				mod_mthp_stat(old_order,
+					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+			}
+		}
+		split_queue_unlock(ds_queue);
+		if (mapping) {
+			int nr = folio_nr_pages(folio);
+
+			if (folio_test_pmd_mappable(folio) &&
+			    new_order < HPAGE_PMD_ORDER) {
+				if (folio_test_swapbacked(folio)) {
+					__lruvec_stat_mod_folio(folio,
+							NR_SHMEM_THPS, -nr);
+				} else {
+					__lruvec_stat_mod_folio(folio,
+							NR_FILE_THPS, -nr);
+					filemap_nr_thps_dec(mapping);
+				}
+			}
+		}
+
+		if (folio_test_swapcache(folio)) {
+			if (mapping) {
+				VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+				return -EINVAL;
+			}
+
+			ci = swap_cluster_get_and_lock(folio);
+		}
+
+		/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+		if (do_lru)
+			lruvec = folio_lruvec_lock(folio);
+
+		ret = __split_unmapped_folio(folio, new_order, split_at, xas,
+					     mapping, split_type);
+
+		/*
+		 * Unfreeze after-split folios and put them back to the right
+		 * list. @folio should be kept frozon until page cache
+		 * entries are updated with all the other after-split folios
+		 * to prevent others seeing stale page cache entries.
+		 * As a result, new_folio starts from the next folio of
+		 * @folio.
+		 */
+		for (new_folio = folio_next(folio); new_folio != end_folio;
+		     new_folio = next) {
+			unsigned long nr_pages = folio_nr_pages(new_folio);
+
+			next = folio_next(new_folio);
+
+			zone_device_private_split_cb(folio, new_folio);
+
+			expected_refs = folio_expected_ref_count(new_folio) + 1;
+			folio_ref_unfreeze(new_folio, expected_refs);
+
+			if (do_lru)
+				lru_add_split_folio(folio, new_folio, lruvec, list);
+
+			/*
+			 * Anonymous folio with swap cache.
+			 * NOTE: shmem in swap cache is not supported yet.
+			 */
+			if (ci) {
+				__swap_cache_replace_folio(ci, folio, new_folio);
+				continue;
+			}
+
+			/* Anonymous folio without swap cache */
+			if (!mapping)
+				continue;
+
+			/* Add the new folio to the page cache. */
+			if (new_folio->index < end) {
+				__xa_store(&mapping->i_pages, new_folio->index,
+					   new_folio, 0);
+				continue;
+			}
+
+			/* Drop folio beyond EOF: ->index >= end */
+			if (shmem_mapping(mapping))
+				nr_shmem_dropped += nr_pages;
+			else if (folio_test_clear_dirty(new_folio))
+				folio_account_cleaned(
+					new_folio, inode_to_wb(mapping->host));
+			__filemap_remove_folio(new_folio, NULL);
+			folio_put_refs(new_folio, nr_pages);
+		}
+
+		zone_device_private_split_cb(folio, NULL);
+		/*
+		 * Unfreeze @folio only after all page cache entries, which
+		 * used to point to it, have been updated with new folios.
+		 * Otherwise, a parallel folio_try_get() can grab @folio
+		 * and its caller can see stale page cache entries.
+		 */
+		expected_refs = folio_expected_ref_count(folio) + 1;
+		folio_ref_unfreeze(folio, expected_refs);
+
+		if (do_lru)
+			unlock_page_lruvec(lruvec);
+
+		if (ci)
+			swap_cluster_unlock(ci);
+	} else {
+		split_queue_unlock(ds_queue);
+		return -EAGAIN;
+	}
+
+	return ret;
+}
+
 /**
  * __folio_split() - split a folio at @split_at to a @new_order folio
  * @folio: folio to split
@@ -3747,7 +3892,6 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
  * @lock_at: a page within @folio to be left locked to caller
  * @list: after-split folios will be put on it if non NULL
  * @split_type: perform uniform split or not (non-uniform split)
- * @unmapped: The pages are already unmapped, they are migration entries.
  *
  * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
  * It is in charge of checking whether the split is supported or not and
@@ -3763,9 +3907,8 @@ bool folio_split_supported(struct folio *folio, unsigned int new_order,
  */
 static int __folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct page *lock_at,
-		struct list_head *list, enum split_type split_type, bool unmapped)
+		struct list_head *list, enum split_type split_type)
 {
-	struct deferred_split *ds_queue;
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
 	struct folio *end_folio = folio_next(folio);
 	bool is_anon = folio_test_anon(folio);
@@ -3809,14 +3952,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		 * is taken to serialise against parallel split or collapse
 		 * operations.
 		 */
-		if (!unmapped) {
-			anon_vma = folio_get_anon_vma(folio);
-			if (!anon_vma) {
-				ret = -EBUSY;
-				goto out;
-			}
-			anon_vma_lock_write(anon_vma);
+		anon_vma = folio_get_anon_vma(folio);
+		if (!anon_vma) {
+			ret = -EBUSY;
+			goto out;
 		}
+		anon_vma_lock_write(anon_vma);
 		mapping = NULL;
 	} else {
 		unsigned int min_order;
@@ -3882,8 +4023,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		goto out_unlock;
 	}
 
-	if (!unmapped)
-		unmap_folio(folio);
+	unmap_folio(folio);
 
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
@@ -3900,142 +4040,14 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 		}
 	}
 
-	/* Prevent deferred_split_scan() touching ->_refcount */
-	ds_queue = folio_split_queue_lock(folio);
-	if (folio_ref_freeze(folio, 1 + extra_pins)) {
-		struct swap_cluster_info *ci = NULL;
-		struct lruvec *lruvec;
-		int expected_refs;
-
-		if (old_order > 1) {
-			if (!list_empty(&folio->_deferred_list)) {
-				ds_queue->split_queue_len--;
-				/*
-				 * Reinitialize page_deferred_list after removing the
-				 * page from the split_queue, otherwise a subsequent
-				 * split will see list corruption when checking the
-				 * page_deferred_list.
-				 */
-				list_del_init(&folio->_deferred_list);
-			}
-			if (folio_test_partially_mapped(folio)) {
-				folio_clear_partially_mapped(folio);
-				mod_mthp_stat(old_order,
-					MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
-			}
-		}
-		split_queue_unlock(ds_queue);
-		if (mapping) {
-			int nr = folio_nr_pages(folio);
-
-			if (folio_test_pmd_mappable(folio) &&
-			    new_order < HPAGE_PMD_ORDER) {
-				if (folio_test_swapbacked(folio)) {
-					__lruvec_stat_mod_folio(folio,
-							NR_SHMEM_THPS, -nr);
-				} else {
-					__lruvec_stat_mod_folio(folio,
-							NR_FILE_THPS, -nr);
-					filemap_nr_thps_dec(mapping);
-				}
-			}
-		}
-
-		if (folio_test_swapcache(folio)) {
-			if (mapping) {
-				VM_WARN_ON_ONCE_FOLIO(mapping, folio);
-				ret = -EINVAL;
-				goto fail;
-			}
-
-			ci = swap_cluster_get_and_lock(folio);
-		}
-
-		/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
-		lruvec = folio_lruvec_lock(folio);
-
-		ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
-					     mapping, split_type);
-
-		/*
-		 * Unfreeze after-split folios and put them back to the right
-		 * list. @folio should be kept frozon until page cache
-		 * entries are updated with all the other after-split folios
-		 * to prevent others seeing stale page cache entries.
-		 * As a result, new_folio starts from the next folio of
-		 * @folio.
-		 */
-		for (new_folio = folio_next(folio); new_folio != end_folio;
-		     new_folio = next) {
-			unsigned long nr_pages = folio_nr_pages(new_folio);
-
-			next = folio_next(new_folio);
-
-			zone_device_private_split_cb(folio, new_folio);
-
-			expected_refs = folio_expected_ref_count(new_folio) + 1;
-			folio_ref_unfreeze(new_folio, expected_refs);
-
-			if (!unmapped)
-				lru_add_split_folio(folio, new_folio, lruvec, list);
-
-			/*
-			 * Anonymous folio with swap cache.
-			 * NOTE: shmem in swap cache is not supported yet.
-			 */
-			if (ci) {
-				__swap_cache_replace_folio(ci, folio, new_folio);
-				continue;
-			}
-
-			/* Anonymous folio without swap cache */
-			if (!mapping)
-				continue;
-
-			/* Add the new folio to the page cache. */
-			if (new_folio->index < end) {
-				__xa_store(&mapping->i_pages, new_folio->index,
-					   new_folio, 0);
-				continue;
-			}
-
-			/* Drop folio beyond EOF: ->index >= end */
-			if (shmem_mapping(mapping))
-				nr_shmem_dropped += nr_pages;
-			else if (folio_test_clear_dirty(new_folio))
-				folio_account_cleaned(
-					new_folio, inode_to_wb(mapping->host));
-			__filemap_remove_folio(new_folio, NULL);
-			folio_put_refs(new_folio, nr_pages);
-		}
-
-		zone_device_private_split_cb(folio, NULL);
-		/*
-		 * Unfreeze @folio only after all page cache entries, which
-		 * used to point to it, have been updated with new folios.
-		 * Otherwise, a parallel folio_try_get() can grab @folio
-		 * and its caller can see stale page cache entries.
-		 */
-		expected_refs = folio_expected_ref_count(folio) + 1;
-		folio_ref_unfreeze(folio, expected_refs);
-
-		unlock_page_lruvec(lruvec);
-
-		if (ci)
-			swap_cluster_unlock(ci);
-	} else {
-		split_queue_unlock(ds_queue);
-		ret = -EAGAIN;
-	}
+	ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
+						true, list, split_type, end, extra_pins);
 
 fail:
 	if (mapping)
 		xas_unlock(&xas);
 
 	local_irq_enable();
 
-	if (unmapped)
-		return ret;
-
 	if (nr_shmem_dropped)
 		shmem_uncharge(mapping->host, nr_shmem_dropped);
@@ -4079,6 +4091,36 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	return ret;
 }
 
+/*
+ * This function is a helper for splitting folios that have already been unmapped.
+ * The use case is that the device or the CPU can refuse to migrate THP pages in
+ * the middle of migration, due to allocation issues on either side.
+ *
+ * The high level code is copied from __folio_split(). Since the pages are
+ * anonymous and already isolated from the LRU, the code has been simplified
+ * to avoid burdening __folio_split() with an unmapped special case.
+ *
+ * None of the split folios are unlocked.
+ */
+int folio_split_unmapped(struct folio *folio, unsigned int new_order)
+{
+	int extra_pins, ret = 0;
+
+	VM_WARN_ON_FOLIO(folio_mapped(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
+
+	if (!can_split_folio(folio, 1, &extra_pins))
+		return -EAGAIN;
+
+	local_irq_disable();
+	ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
+					NULL, false, NULL, SPLIT_TYPE_UNIFORM,
+					0, extra_pins);
+	local_irq_enable();
+	return ret;
+}
+
 /*
  * This function splits a large folio into smaller folios of order @new_order.
  * @page can point to any page of the large folio to split. The split operation
@@ -4127,12 +4169,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
  * with the folio. Splitting to order 0 is compatible with all folios.
  */
 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
-		unsigned int new_order, bool unmapped)
+		unsigned int new_order)
 {
 	struct folio *folio = page_folio(page);
 
 	return __folio_split(folio, new_order, &folio->page, page, list,
-			SPLIT_TYPE_UNIFORM, unmapped);
+			SPLIT_TYPE_UNIFORM);
 }
 
 /**
@@ -4163,7 +4205,7 @@ int folio_split(struct folio *folio, unsigned int new_order,
 		struct page *split_at, struct list_head *list)
 {
 	return __folio_split(folio, new_order, split_at, &folio->page, list,
-			SPLIT_TYPE_NON_UNIFORM, false);
+			SPLIT_TYPE_NON_UNIFORM);
 }
 
 int min_order_for_split(struct folio *folio)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index c50abbd32f21..723ef42550dc 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -918,8 +918,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
 	folio_get(folio);
 	split_huge_pmd_address(migrate->vma, addr, true);
 
-	ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL,
-						 0, true);
+	ret = folio_split_unmapped(folio, 0);
 	if (ret)
 		return ret;
 	migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
-- 
2.51.1
