Some secondary MMUs cannot safely tolerate a user VMA becoming backed
by transparent huge pages after the range has been registered with an
interval notifier. Drivers can observe the page-table layout change
through invalidations, but devices without replayable faults, or ranges
that must stay mapped, cannot always re-establish coherent device
mappings before the device next accesses the range.

Add MMU_INTERVAL_NOTIFIER_BLOCK_THP so a driver can declare this
property when registering an interval notifier. The MM core then marks
the covered VMA range VM_NOHUGEPAGE and clears VM_HUGEPAGE while
holding mmap_lock for write. A later MADV_HUGEPAGE on a VMA that is
still VM_NOHUGEPAGE and overlaps such an active notifier is ignored:
hugepage_madvise() returns 0 without changing the flags, leaving the
MM-owned nohuge policy intact. MADV_COLLAPSE already rejects
VM_NOHUGEPAGE VMAs.

This keeps the policy in MM code instead of requiring device drivers
to edit VMA THP flags directly, and it only affects opt-in notifier
ranges at registration or flag-transition time.

Assisted-by: OpenAI-Codex:GPT-5.5
Signed-off-by: Yitao Jiang <[email protected]>
---
 include/linux/huge_mm.h      |   5 +-
 include/linux/mmu_notifier.h |  28 +++++
 mm/khugepaged.c              |   9 +-
 mm/madvise.c                 |   3 +-
 mm/mmu_notifier.c            | 204 +++++++++++++++++++++++++++++++++--
 5 files changed, 237 insertions(+), 12 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ad20f7f8c..3dae515ff 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -489,8 +489,8 @@ change_huge_pud(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
                        __split_huge_pud(__vma, __pud, __address);      \
        }  while (0)
 
-int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
-                    int advice);
+int hugepage_madvise(struct vm_area_struct *vma, unsigned long start,
+                    unsigned long end, vm_flags_t *vm_flags, int advice);
 int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
                     unsigned long end, bool *lock_dropped);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
@@ -694,6 +694,7 @@ static inline bool unmap_huge_pmd_locked(struct 
vm_area_struct *vma,
        do { } while (0)
 
 static inline int hugepage_madvise(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end,
                                   vm_flags_t *vm_flags, int advice)
 {
        return -EINVAL;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index a11a44eef..4accfb65f 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -293,8 +293,16 @@ struct mmu_interval_notifier {
        struct mm_struct *mm;
        struct hlist_node deferred_item;
        unsigned long invalidate_seq;
+       unsigned int flags;
 };
 
+/*
+ * The interval range cannot safely be backed by transparent huge pages while
+ * the notifier is active. The MM core owns the VMA policy change so drivers
+ * do not have to manipulate VM_HUGEPAGE/VM_NOHUGEPAGE directly.
+ */
+#define MMU_INTERVAL_NOTIFIER_BLOCK_THP BIT(0)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 #ifdef CONFIG_LOCKDEP
@@ -347,7 +355,20 @@ int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
+int
+mmu_interval_notifier_insert_locked_flags(struct mmu_interval_notifier 
*interval_sub,
+                                         struct mm_struct *mm,
+                                         unsigned long start,
+                                         unsigned long length,
+                                         const struct 
mmu_interval_notifier_ops *ops,
+                                         unsigned int flags);
+int
+mmu_interval_notifier_set_flags_locked(struct mmu_interval_notifier 
*interval_sub,
+                                      unsigned int flags);
 void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);
+bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end);
 
 /**
  * mmu_interval_set_seq - Save the invalidation sequence
@@ -637,6 +658,13 @@ static inline void 
mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 {
 }
 
+static inline bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+                                                        unsigned long start,
+                                                        unsigned long end)
+{
+       return false;
+}
+
 #define mmu_notifier_range_update_to_read_only(r) false
 
 static inline void mmu_notifier_synchronize(void)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 617bca76d..a9b05e716 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -445,11 +445,16 @@ static unsigned int collapse_max_ptes_swap(struct 
collapse_control *cc,
        return khugepaged_max_ptes_swap;
 }
 
-int hugepage_madvise(struct vm_area_struct *vma,
-                    vm_flags_t *vm_flags, int advice)
+int hugepage_madvise(struct vm_area_struct *vma, unsigned long start,
+                    unsigned long end, vm_flags_t *vm_flags, int advice)
 {
        switch (advice) {
        case MADV_HUGEPAGE:
+               if ((*vm_flags & VM_NOHUGEPAGE) &&
+                   mmu_interval_notifier_range_block_thp(vma->vm_mm,
+                                                         start, end))
+                       return 0;
+
                *vm_flags &= ~VM_NOHUGEPAGE;
                *vm_flags |= VM_HUGEPAGE;
                /*
diff --git a/mm/madvise.c b/mm/madvise.c
index cd9bb0770..c7cee4fcf 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1416,7 +1416,8 @@ static int madvise_vma_behavior(struct madvise_behavior 
*madv_behavior)
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
-               error = hugepage_madvise(vma, &new_flags, behavior);
+               error = hugepage_madvise(vma, range->start, range->end,
+                                        &new_flags, behavior);
                if (error)
                        goto out;
                break;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 245b74f39..852a5682b 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -581,6 +581,49 @@ int __mmu_notifier_invalidate_range_start(struct 
mmu_notifier_range *range)
        return 0;
 }
 
+/**
+ * mmu_interval_notifier_range_block_thp - check if a range must not use THP
+ * @mm: mm_struct to check
+ * @start: first address of the range (inclusive)
+ * @end: end address of the range (exclusive)
+ *
+ * Return: true if any interval notifier overlapping [@start, @end) has
+ * MMU_INTERVAL_NOTIFIER_BLOCK_THP set, false otherwise.
+ */
+bool mmu_interval_notifier_range_block_thp(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       struct mmu_notifier_subscriptions *subscriptions;
+       struct mmu_interval_notifier *interval_sub;
+       struct interval_tree_node *node;
+       bool block_thp = false;
+
+       if (start >= end)
+               return false;
+
+       /* Pairs with the store in mmu_notifier_register(). */
+       subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
+       if (!subscriptions || !subscriptions->has_itree)
+               return false;
+
+       spin_lock(&subscriptions->lock);
+       for (node = interval_tree_iter_first(&subscriptions->itree, start,
+                                            end - 1);
+            node;
+            node = interval_tree_iter_next(node, start, end - 1)) {
+               interval_sub = container_of(node, struct mmu_interval_notifier,
+                                           interval_tree);
+               if (interval_sub->flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) {
+                       block_thp = true;
+                       break;
+               }
+       }
+       spin_unlock(&subscriptions->lock);
+
+       return block_thp;
+}
+
 static void
 mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
                        struct mmu_notifier_range *range)
@@ -933,13 +976,69 @@ void mmu_notifier_put(struct mmu_notifier *subscription)
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_put);
 
+#define MMU_INTERVAL_NOTIFIER_KNOWN_FLAGS \
+       (MMU_INTERVAL_NOTIFIER_BLOCK_THP)
+
+static int mmu_interval_notifier_check_flags(unsigned int flags)
+{
+       if (flags & ~MMU_INTERVAL_NOTIFIER_KNOWN_FLAGS)
+               return -EINVAL;
+       return 0;
+}
+
+static int
+mmu_interval_notifier_block_thp_locked(struct mm_struct *mm,
+                                      unsigned long start,
+                                      unsigned long end)
+{
+       struct vm_area_struct *vma, *prev;
+       struct vma_iterator vmi;
+
+       mmap_assert_write_locked(mm);
+
+       vma_iter_init(&vmi, mm, start);
+       vma = vma_iter_load(&vmi);
+       prev = vma_prev(&vmi);
+       if (vma && start > vma->vm_start)
+               prev = vma;
+
+       for_each_vma_range(vmi, vma, end) {
+               const unsigned long curr_start = max(vma->vm_start, start);
+               const unsigned long curr_end = min(vma->vm_end, end);
+               vma_flags_t new_flags;
+
+               if (vma->vm_flags & VM_NO_KHUGEPAGED)
+                       goto next;
+
+               new_flags = vma->flags;
+               vma_flags_set(&new_flags, VMA_NOHUGEPAGE_BIT);
+               vma_flags_clear(&new_flags, VMA_HUGEPAGE_BIT);
+               if (vma_flags_same_pair(&new_flags, &vma->flags))
+                       goto next;
+
+               vma = vma_modify_flags(&vmi, prev, vma, curr_start,
+                                      curr_end, &new_flags);
+               if (IS_ERR(vma))
+                       return PTR_ERR(vma);
+
+               vma_start_write(vma);
+               vma->flags = new_flags;
+next:
+               prev = vma;
+       }
+
+       return 0;
+}
+
 static int __mmu_interval_notifier_insert(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
-       unsigned long length, const struct mmu_interval_notifier_ops *ops)
+       unsigned long length, const struct mmu_interval_notifier_ops *ops,
+       unsigned int flags)
 {
        interval_sub->mm = mm;
        interval_sub->ops = ops;
+       interval_sub->flags = flags;
        RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
        interval_sub->interval_tree.start = start;
        /*
@@ -1034,32 +1133,123 @@ int mmu_interval_notifier_insert(struct 
mmu_interval_notifier *interval_sub,
                subscriptions = mm->notifier_subscriptions;
        }
        return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
-                                             start, length, ops);
+                                             start, length, ops, 0);
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
 
-int mmu_interval_notifier_insert_locked(
-       struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
-       unsigned long start, unsigned long length,
-       const struct mmu_interval_notifier_ops *ops)
+/**
+ * mmu_interval_notifier_insert_locked_flags - Insert an interval notifier
+ * @interval_sub: Interval subscription to register
+ * @mm: mm_struct to attach to
+ * @start: Starting virtual address to monitor
+ * @length: Length of the range to monitor
+ * @ops: Interval notifier operations to be called on matching events
+ * @flags: MMU_INTERVAL_NOTIFIER_* flags
+ *
+ * Like mmu_interval_notifier_insert_locked(), but applies the MM-owned policy
+ * selected by @flags before inserting. Caller holds mmap_lock for write.
+ * Return: 0 on success, -EINVAL for unknown @flags, or another negative errno.
+ */
+int
+mmu_interval_notifier_insert_locked_flags(struct mmu_interval_notifier 
*interval_sub,
+                                         struct mm_struct *mm,
+                                         unsigned long start,
+                                         unsigned long length,
+                                         const struct 
mmu_interval_notifier_ops *ops,
+                                         unsigned int flags)
 {
        struct mmu_notifier_subscriptions *subscriptions =
                mm->notifier_subscriptions;
+       unsigned long end;
        int ret;
 
        mmap_assert_write_locked(mm);
 
+       ret = mmu_interval_notifier_check_flags(flags);
+       if (ret)
+               return ret;
+
+       if (flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) {
+               if (length == 0 || check_add_overflow(start, length, &end))
+                       return -EOVERFLOW;
+       }
+
        if (!subscriptions || !subscriptions->has_itree) {
                ret = __mmu_notifier_register(NULL, mm);
                if (ret)
                        return ret;
                subscriptions = mm->notifier_subscriptions;
        }
+
+       if (flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) {
+               ret = mmu_interval_notifier_block_thp_locked(mm, start, end);
+               if (ret)
+                       return ret;
+       }
+
        return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
-                                             start, length, ops);
+                                             start, length, ops, flags);
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked_flags);
+
+int mmu_interval_notifier_insert_locked(struct mmu_interval_notifier 
*interval_sub,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long length,
+                                       const struct mmu_interval_notifier_ops 
*ops)
+{
+       return mmu_interval_notifier_insert_locked_flags(interval_sub, mm,
+                                                        start, length,
+                                                        ops, 0);
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
 
+/**
+ * mmu_interval_notifier_set_flags_locked - update an interval notifier's flags
+ * @interval_sub: Interval subscription to update
+ * @flags: MMU_INTERVAL_NOTIFIER_* flags
+ *
+ * Caller holds mmap_lock for write. Newly setting BLOCK_THP first updates the
+ * VMA THP policy for the range; clearing it does not restore the VMA flags.
+ * Return: 0 on success, -EINVAL for unknown @flags, or another negative errno.
+ */
+int
+mmu_interval_notifier_set_flags_locked(struct mmu_interval_notifier 
*interval_sub,
+                                      unsigned int flags)
+{
+       struct mm_struct *mm = interval_sub->mm;
+       unsigned long start = interval_sub->interval_tree.start;
+       unsigned long end;
+       int ret;
+
+       ret = mmu_interval_notifier_check_flags(flags);
+       if (ret)
+               return ret;
+
+       if (WARN_ON_ONCE(!mm))
+               return -EINVAL;
+
+       mmap_assert_write_locked(mm);
+
+       if ((flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP) &&
+           !(interval_sub->flags & MMU_INTERVAL_NOTIFIER_BLOCK_THP)) {
+               if (interval_sub->interval_tree.last == ULONG_MAX)
+                       return -EOVERFLOW;
+               end = interval_sub->interval_tree.last + 1;
+
+               ret = mmu_interval_notifier_block_thp_locked(mm, start, end);
+               if (ret)
+                       return ret;
+       }
+
+       spin_lock(&mm->notifier_subscriptions->lock);
+       interval_sub->flags = flags;
+       spin_unlock(&mm->notifier_subscriptions->lock);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_interval_notifier_set_flags_locked);
+
 static bool
 mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
                          unsigned long seq)
-- 
2.53.0

Reply via email to