On Thu, Jan 28, 2021 at 02:48:17PM -0800, Axel Rasmussen wrote:
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index f94a35296618..79e1f0155afa 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -135,11 +135,14 @@ void hugetlb_show_meminfo(void);
>  unsigned long hugetlb_total_pages(void);
>  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>                       unsigned long address, unsigned int flags);
> +#ifdef CONFIG_USERFAULTFD

I'm confused why this is needed.. hugetlb_mcopy_atomic_pte() should only be
called in userfaultfd.c, but if without uffd config set it won't compile
either:

        obj-$(CONFIG_USERFAULTFD) += userfaultfd.o

>  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
>                               struct vm_area_struct *dst_vma,
>                               unsigned long dst_addr,
>                               unsigned long src_addr,
> +                             enum mcopy_atomic_mode mode,
>                               struct page **pagep);
> +#endif
>  int hugetlb_reserve_pages(struct inode *inode, long from, long to,
>                                               struct vm_area_struct *vma,
>                                               vm_flags_t vm_flags);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index fb9abaeb4194..2fcb686211e8 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
>  
>  extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long 
> reason);
>  
> +/*
> + * The mode of operation for __mcopy_atomic and its helpers.
> + *
> + * This is almost an implementation detail (mcopy_atomic below doesn't take 
> this
> + * as a parameter), but it's exposed here because memory-kind-specific
> + * implementations (e.g. hugetlbfs) need to know the mode of operation.
> + */
> +enum mcopy_atomic_mode {
> +     /* A normal copy_from_user into the destination range. */
> +     MCOPY_ATOMIC_NORMAL,
> +     /* Don't copy; map the destination range to the zero page. */
> +     MCOPY_ATOMIC_ZEROPAGE,
> +     /* Just setup the dst_vma, without modifying the underlying page(s). */
> +     MCOPY_ATOMIC_CONTINUE,
> +};
> +

Maybe better to keep this to where it's used, e.g. hugetlb.h where we've
defined hugetlb_mcopy_atomic_pte()?

[...]

> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6f9d8349f818..3d318ef3d180 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4647,6 +4647,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct 
> vm_area_struct *vma,
>       return ret;
>  }
>  
> +#ifdef CONFIG_USERFAULTFD

So I feel like you added the header ifdef for this.

IMHO we can drop both since that's what we have had.  I agree maybe it's better
to not compile that without CONFIG_USERFAULTFD but that may worth a standalone
patch anyways.

>  /*
>   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
>   * modifications for huge pages.
> @@ -4656,6 +4657,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                           struct vm_area_struct *dst_vma,
>                           unsigned long dst_addr,
>                           unsigned long src_addr,
> +                         enum mcopy_atomic_mode mode,
>                           struct page **pagep)
>  {
>       struct address_space *mapping;
> @@ -4668,7 +4670,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>       int ret;
>       struct page *page;
>  
> -     if (!*pagep) {
> +     mapping = dst_vma->vm_file->f_mapping;
> +     idx = vma_hugecache_offset(h, dst_vma, dst_addr);
> +
> +     if (!*pagep && mode != MCOPY_ATOMIC_CONTINUE) {
>               ret = -ENOMEM;
>               page = alloc_huge_page(dst_vma, dst_addr, 0);
>               if (IS_ERR(page))
> @@ -4685,6 +4690,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                       /* don't free the page */
>                       goto out;
>               }
> +     } else if (mode == MCOPY_ATOMIC_CONTINUE) {
> +             ret = -EFAULT;
> +             page = find_lock_page(mapping, idx);
> +             *pagep = NULL;
> +             if (!page)
> +                     goto out;
>       } else {
>               page = *pagep;
>               *pagep = NULL;

I would write this as:

    if (mode == MCOPY_ATOMIC_CONTINUE)
        ...
    else if (!*pagep)
        ...
    else 
        ...

No strong opinion, but that'll look slightly cleaner to me.

[...]

> @@ -408,7 +407,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct 
> *dst_mm,
>                                     unsigned long dst_start,
>                                     unsigned long src_start,
>                                     unsigned long len,
> -                                   bool zeropage);
> +                                   enum mcopy_atomic_mode mode);
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
>  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> @@ -417,7 +416,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct 
> mm_struct *dst_mm,
>                                               unsigned long dst_addr,
>                                               unsigned long src_addr,
>                                               struct page **page,
> -                                             bool zeropage,
> +                                             enum mcopy_atomic_mode mode,
>                                               bool wp_copy)
>  {
>       ssize_t err;
> @@ -433,22 +432,38 @@ static __always_inline ssize_t mfill_atomic_pte(struct 
> mm_struct *dst_mm,
>        * and not in the radix tree.
>        */
>       if (!(dst_vma->vm_flags & VM_SHARED)) {
> -             if (!zeropage)
> +             switch (mode) {
> +             case MCOPY_ATOMIC_NORMAL:
>                       err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
>                                              dst_addr, src_addr, page,
>                                              wp_copy);
> -             else
> +                     break;
> +             case MCOPY_ATOMIC_ZEROPAGE:
>                       err = mfill_zeropage_pte(dst_mm, dst_pmd,
>                                                dst_vma, dst_addr);
> +                     break;
> +             /* It only makes sense to CONTINUE for shared memory. */
> +             case MCOPY_ATOMIC_CONTINUE:
> +                     err = -EINVAL;
> +                     break;
> +             }
>       } else {
>               VM_WARN_ON_ONCE(wp_copy);
> -             if (!zeropage)
> +             switch (mode) {
> +             case MCOPY_ATOMIC_NORMAL:
>                       err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
>                                                    dst_vma, dst_addr,
>                                                    src_addr, page);
> -             else
> +                     break;
> +             case MCOPY_ATOMIC_ZEROPAGE:
>                       err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
>                                                      dst_vma, dst_addr);
> +                     break;
> +             case MCOPY_ATOMIC_CONTINUE:
> +                     /* FIXME: Add minor fault interception for shmem. */
> +                     err = -EINVAL;
> +                     break;
> +             }
>       }
>  
>       return err;

The whole chunk above is not needed for hugetlbfs it seems - I'd avoid touching
the anon/shmem code path until it's being supported.

What you need is probably set zeropage as below in __mcopy_atomic():

    zeropage = (mode == MCOPY_ATOMIC_ZEROPAGE);

Before passing it over to mfill_atomic_pte().  As long as we reject
UFFDIO_CONTINUE with !hugetlbfs correctly that'll be enough iiuc.

Thanks,

-- 
Peter Xu

Reply via email to