On 21 Oct 2025, at 17:34, Balbir Singh wrote:

> On 10/20/25 09:59, Zi Yan wrote:
>> On 19 Oct 2025, at 18:49, Balbir Singh wrote:
>>
>>> On 10/19/25 19:19, Wei Yang wrote:
>>>> On Wed, Oct 01, 2025 at 04:57:02PM +1000, Balbir Singh wrote:
>>>> [...]
>>>>> static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>           struct page *split_at, struct page *lock_at,
>>>>> -         struct list_head *list, bool uniform_split)
>>>>> +         struct list_head *list, bool uniform_split, bool unmapped)
>>>>> {
>>>>>   struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>>>>>   XA_STATE(xas, &folio->mapping->i_pages, folio->index);
>>>>> @@ -3765,13 +3757,15 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>            * is taken to serialise against parallel split or collapse
>>>>>            * operations.
>>>>>            */
>>>>> -         anon_vma = folio_get_anon_vma(folio);
>>>>> -         if (!anon_vma) {
>>>>> -                 ret = -EBUSY;
>>>>> -                 goto out;
>>>>> +         if (!unmapped) {
>>>>> +                 anon_vma = folio_get_anon_vma(folio);
>>>>> +                 if (!anon_vma) {
>>>>> +                         ret = -EBUSY;
>>>>> +                         goto out;
>>>>> +                 }
>>>>> +                 anon_vma_lock_write(anon_vma);
>>>>>           }
>>>>>           mapping = NULL;
>>>>> -         anon_vma_lock_write(anon_vma);
>>>>>   } else {
>>>>>           unsigned int min_order;
>>>>>           gfp_t gfp;
>>>>> @@ -3838,7 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>           goto out_unlock;
>>>>>   }
>>>>>
>>>>> - unmap_folio(folio);
>>>>> + if (!unmapped)
>>>>> +         unmap_folio(folio);
>>>>>
>>>>>   /* block interrupt reentry in xa_lock and spinlock */
>>>>>   local_irq_disable();
>>>>> @@ -3925,10 +3920,13 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>
>>>>>                   next = folio_next(new_folio);
>>>>>
>>>>> +                 zone_device_private_split_cb(folio, new_folio);
>>>>> +
>>>>>                   expected_refs = folio_expected_ref_count(new_folio) + 1;
>>>>>                   folio_ref_unfreeze(new_folio, expected_refs);
>>>>>
>>>>> -                 lru_add_split_folio(folio, new_folio, lruvec, list);
>>>>> +                 if (!unmapped)
>>>>> +                         lru_add_split_folio(folio, new_folio, lruvec, 
>>>>> list);
>>>>>
>>>>>                   /*
>>>>>                    * Anonymous folio with swap cache.
>>>>> @@ -3959,6 +3957,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>                   __filemap_remove_folio(new_folio, NULL);
>>>>>                   folio_put_refs(new_folio, nr_pages);
>>>>>           }
>>>>> +
>>>>> +         zone_device_private_split_cb(folio, NULL);
>>>>>           /*
>>>>>            * Unfreeze @folio only after all page cache entries, which
>>>>>            * used to point to it, have been updated with new folios.
>>>>> @@ -3982,6 +3982,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>>
>>>>>   local_irq_enable();
>>>>>
>>>>> + if (unmapped)
>>>>> +         return ret;
>>>>
>>>> As the comments of __folio_split() and __split_huge_page_to_list_to_order()
>>>> mention:
>>>>
>>>>   * The large folio must be locked
>>>>   * After splitting, the after-split folio containing @lock_at remains locked
>>>>
>>>> But here we seem to change the prerequisites.
>>>>
>>>> Hmm.. I am not sure this is correct.
>>>>
>>>
>>> The code is correct, but you are right in that the documentation needs to 
>>> be updated.
>>> When "unmapped", we do want to leave the folios locked after the split.
>>
>> Sigh, this "unmapped" code needs so many special branches and a different
>> locking requirement. It should be a separate function to avoid confusion.
>>
>
> Yep, I have a patch for it; I am also waiting on Matthew's feedback. FYI, here
> is a WIP patch that can be applied on top of the series.

Nice cleanup! Thanks.
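
For the series as posted, the comment Wei pointed at could simply spell out the
different locking for the unmapped case, e.g. (wording sketch only, while the
unmapped parameter is still there):

 * The large folio must be locked.  After splitting, for the normal case the
 * after-split folio containing @lock_at remains locked.  When @unmapped is
 * true, the folios are already-unmapped migration entries and all of the
 * after-split folios are left locked for the migration caller.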

>
> ---
>  include/linux/huge_mm.h |   5 +-
>  mm/huge_memory.c        | 137 ++++++++++++++++++++++++++++++++++------
>  mm/migrate_device.c     |   3 +-
>  3 files changed, 120 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index c4a811958cda..86e1cefaf391 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -366,7 +366,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
>
>  bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>  int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> -             unsigned int new_order, bool unmapped);
> +             unsigned int new_order);
> +int split_unmapped_folio_to_order(struct folio *folio, unsigned int new_order);
>  int min_order_for_split(struct folio *folio);
>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>  bool uniform_split_supported(struct folio *folio, unsigned int new_order,
> @@ -379,7 +380,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>  static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>               unsigned int new_order)
>  {
> -     return __split_huge_page_to_list_to_order(page, list, new_order, false);
> +     return __split_huge_page_to_list_to_order(page, list, new_order);
>  }
>
>  /*
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8c82a0ac6e69..e20cbf68d037 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3711,7 +3711,6 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
>   * @lock_at: a page within @folio to be left locked to caller
>   * @list: after-split folios will be put on it if non NULL
>   * @uniform_split: perform uniform split or not (non-uniform split)
> - * @unmapped: The pages are already unmapped, they are migration entries.
>   *
>   * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
>   * It is in charge of checking whether the split is supported or not and
> @@ -3727,7 +3726,7 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
>   */
>  static int __folio_split(struct folio *folio, unsigned int new_order,
>               struct page *split_at, struct page *lock_at,
> -             struct list_head *list, bool uniform_split, bool unmapped)
> +             struct list_head *list, bool uniform_split)
>  {
>       struct deferred_split *ds_queue;
>       XA_STATE(xas, &folio->mapping->i_pages, folio->index);
> @@ -3777,14 +3776,12 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>                * is taken to serialise against parallel split or collapse
>                * operations.
>                */
> -             if (!unmapped) {
> -                     anon_vma = folio_get_anon_vma(folio);
> -                     if (!anon_vma) {
> -                             ret = -EBUSY;
> -                             goto out;
> -                     }
> -                     anon_vma_lock_write(anon_vma);
> +             anon_vma = folio_get_anon_vma(folio);
> +             if (!anon_vma) {
> +                     ret = -EBUSY;
> +                     goto out;
>               }
> +             anon_vma_lock_write(anon_vma);
>               mapping = NULL;
>       } else {
>               unsigned int min_order;
> @@ -3852,8 +3849,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>               goto out_unlock;
>       }
>
> -     if (!unmapped)
> -             unmap_folio(folio);
> +     unmap_folio(folio);
>
>       /* block interrupt reentry in xa_lock and spinlock */
>       local_irq_disable();
> @@ -3954,8 +3950,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>                       expected_refs = folio_expected_ref_count(new_folio) + 1;
>                       folio_ref_unfreeze(new_folio, expected_refs);
>
> -                     if (!unmapped)
> -                     lru_add_split_folio(folio, new_folio, lruvec, list);
> +                     lru_add_split_folio(folio, new_folio, lruvec, list);
>
>                       /*
>                        * Anonymous folio with swap cache.
> @@ -4011,9 +4006,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>
>       local_irq_enable();
>
> -     if (unmapped)
> -             return ret;
> -
>       if (nr_shmem_dropped)
>               shmem_uncharge(mapping->host, nr_shmem_dropped);
>
> @@ -4057,6 +4049,111 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>       return ret;
>  }
>
> +/*
> + * This function is a helper for splitting folios that have already been unmapped.
> + * The use case is that the device or the CPU can refuse to migrate THP pages in
> + * the middle of migration, due to allocation issues on either side.
> + *
> + * The high level code is copied from __folio_split. Since the pages are anonymous
> + * and already isolated from the LRU, the code has been simplified so that
> + * __folio_split is not burdened with "unmapped" handling sprinkled into the code.

I wonder if it makes sense to remove the CPU-side folio from both the
deferred_split queue and the swap cache before migration, to further simplify
split_unmapped_folio_to_order(). Basically, require that device private folios
can never be on the deferred_split queue or in the swap cache.
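
Roughly something like the sketch below, run on the (still locked) CPU folio
before migrate_vma starts, presumably from mm/huge_memory.c where the split
queue helpers are visible. It is untested, the helper name is made up, and I am
not sure folio_free_swap() alone is the right way to drop the swap cache entry,
so take it only as an illustration of the idea:

static void migrate_prepare_unmapped_folio(struct folio *folio)
{
        struct deferred_split *ds_queue;

        if (folio_order(folio) > 1) {
                /* Pull the folio off the deferred split queue up front. */
                local_irq_disable();
                ds_queue = folio_split_queue_lock(folio);
                if (!list_empty(&folio->_deferred_list)) {
                        ds_queue->split_queue_len--;
                        list_del_init(&folio->_deferred_list);
                }
                if (folio_test_partially_mapped(folio)) {
                        folio_clear_partially_mapped(folio);
                        mod_mthp_stat(folio_order(folio),
                                      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                }
                split_queue_unlock(ds_queue);
                local_irq_enable();
        }

        /*
         * Try to drop the swap cache entry so a device private folio never
         * sits in the swap cache. folio_free_swap() can fail if the entry is
         * still in use, in which case the caller would have to fall back to
         * the current swap cache handling (or skip the THP migration).
         */
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
}

With something like that in place, split_unmapped_folio_to_order() could drop
both the deferred_split handling and the swap cluster replacement below.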

> + *
> + * None of the split folios are unlocked
> + */
> +int split_unmapped_folio_to_order(struct folio *folio, unsigned int new_order)
> +{
> +     int extra_pins;
> +     int ret = 0;
> +     struct folio *new_folio, *next;
> +     struct folio *end_folio = folio_next(folio);
> +     struct deferred_split *ds_queue;
> +     int old_order = folio_order(folio);
> +
> +     VM_WARN_ON_FOLIO(folio_mapped(folio), folio);
> +     VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> +     VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
> +
> +     if (!can_split_folio(folio, 1, &extra_pins)) {
> +             ret = -EAGAIN;
> +             goto err;
> +     }
> +
> +     local_irq_disable();
> +     /* Prevent deferred_split_scan() touching ->_refcount */
> +     ds_queue = folio_split_queue_lock(folio);
> +     if (folio_ref_freeze(folio, 1 + extra_pins)) {
> +             int expected_refs;
> +             struct swap_cluster_info *ci = NULL;
> +
> +             if (old_order > 1) {
> +                     if (!list_empty(&folio->_deferred_list)) {
> +                             ds_queue->split_queue_len--;
> +                             /*
> +                              * Reinitialize page_deferred_list after
> +                              * removing the page from the split_queue,
> +                              * otherwise a subsequent split will see list
> +                              * corruption when checking the
> +                              * page_deferred_list.
> +                              */
> +                             list_del_init(&folio->_deferred_list);
> +                     }
> +                     if (folio_test_partially_mapped(folio)) {
> +                             folio_clear_partially_mapped(folio);
> +                             mod_mthp_stat(old_order,
> +                                     MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
> +                     }
> +                     /*
> +                      * Reinitialize page_deferred_list after removing the
> +                      * page from the split_queue, otherwise a subsequent
> +                      * split will see list corruption when checking the
> +                      * page_deferred_list.
> +                      */
> +                     list_del_init(&folio->_deferred_list);
> +             }
> +             split_queue_unlock(ds_queue);
> +
> +             if (folio_test_swapcache(folio))
> +                     ci = swap_cluster_get_and_lock(folio);
> +
> +             ret = __split_unmapped_folio(folio, new_order, &folio->page,
> +                                          NULL, NULL, true);
> +
> +             /*
> +              * Unfreeze after-split folios
> +              */
> +             for (new_folio = folio_next(folio); new_folio != end_folio;
> +                  new_folio = next) {
> +                     next = folio_next(new_folio);
> +
> +                     zone_device_private_split_cb(folio, new_folio);
> +
> +                     expected_refs = folio_expected_ref_count(new_folio) + 1;
> +                     folio_ref_unfreeze(new_folio, expected_refs);
> +                     if (ci)
> +                             __swap_cache_replace_folio(ci, folio, new_folio);
> +             }
> +
> +             zone_device_private_split_cb(folio, NULL);
> +             /*
> +              * Unfreeze @folio only after all page cache entries, which
> +              * used to point to it, have been updated with new folios.
> +              * Otherwise, a parallel folio_try_get() can grab @folio
> +              * and its caller can see stale page cache entries.
> +              */
> +             expected_refs = folio_expected_ref_count(folio) + 1;
> +             folio_ref_unfreeze(folio, expected_refs);
> +
> +             if (ci)
> +                     swap_cluster_unlock(ci);
> +     } else {
> +             split_queue_unlock(ds_queue);
> +             ret = -EAGAIN;
> +     }
> +     local_irq_enable();
> +err:
> +     return ret;
> +}
> +
>  /*
>   * This function splits a large folio into smaller folios of order @new_order.
>   * @page can point to any page of the large folio to split. The split operation
> @@ -4105,12 +4202,11 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>   * with the folio. Splitting to order 0 is compatible with all folios.
>   */
>  int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> -                                  unsigned int new_order, bool unmapped)
> +                                  unsigned int new_order)
>  {
>       struct folio *folio = page_folio(page);
>
> -     return __folio_split(folio, new_order, &folio->page, page, list, true,
> -                             unmapped);
> +     return __folio_split(folio, new_order, &folio->page, page, list, true);
>  }
>
>  /*
> @@ -4138,8 +4234,7 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>  int folio_split(struct folio *folio, unsigned int new_order,
>               struct page *split_at, struct list_head *list)
>  {
> -     return __folio_split(folio, new_order, split_at, &folio->page, list,
> -                     false, false);
> +     return __folio_split(folio, new_order, split_at, &folio->page, list, false);
>  }
>
>  int min_order_for_split(struct folio *folio)
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index c869b272e85a..23515f3ffc35 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -918,8 +918,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>
>       folio_get(folio);
>       split_huge_pmd_address(migrate->vma, addr, true);
> -     ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL,
> -                                                     0, true);
> +     ret = split_unmapped_folio_to_order(folio, 0);
>       if (ret)
>               return ret;
>       migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
> -- 
> 2.51.0


--
Best Regards,
Yan, Zi
