offline_and_remove_memory() handles a single contiguous range.

Callers that manage a device composed of several ranges (dax/kmem)
currently have to call it in a loop, which gives up atomicity.

In addition to pushing rollback logic into the driver, the lack
of atomicity creates a race condition between system daemons trying
to manage the same resource:

   - Manager 1:  Offlines memory blocks.    Removes device.
                                        ^^^^
   - Manager 2:  Detects offline memory blocks, re-onlines them.

Add offline_and_remove_memory_ranges(), which takes an array of ranges
and processes them as one operation under a single lock_device_hotplug():

  - Phase 1 offlines every block of every range.
  - Phase 2 removes the ranges only if all ranges are offline.
  - If any offline fails, the whole operation is reverted.

This gives callers all-or-nothing semantics for the offline step, so a
failed or interrupted unplug leaves the device in a consistent state.

This also resolves the battling managers race - the second manager's
operation simply fails when the block is destroyed / cannot be onlined.

offline_and_remove_memory() becomes a thin wrapper that passes its single
range to the new helper, so the offline/rollback logic lives in one place.

Suggested-by: David Hildenbrand (Arm) <[email protected]>
Signed-off-by: Gregory Price <[email protected]>
---
 include/linux/memory_hotplug.h |  7 +++
 mm/memory_hotplug.c            | 94 ++++++++++++++++++++++++----------
 2 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d3edeb80aadb..7f1da7c428dc 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -267,6 +267,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned 
long nr_pages,
 extern int remove_memory(u64 start, u64 size);
 extern void __remove_memory(u64 start, u64 size);
 extern int offline_and_remove_memory(u64 start, u64 size);
+int offline_and_remove_memory_ranges(const struct range *ranges, int 
nr_ranges);
 
 #else
 static inline void try_offline_node(int nid) {}
@@ -283,6 +284,12 @@ static inline int remove_memory(u64 start, u64 size)
 }
 
 static inline void __remove_memory(u64 start, u64 size) {}
+
+static inline int offline_and_remove_memory_ranges(const struct range *ranges,
+                                                  int nr_ranges)
+{
+       return -EBUSY;
+}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a66346def504..7d56e0c6ede0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2429,58 +2429,98 @@ static int try_reonline_memory_block(struct 
memory_block *mem, void *arg)
  */
 int offline_and_remove_memory(u64 start, u64 size)
 {
-       const unsigned long mb_count = size / memory_block_size_bytes();
+       struct range range = { .start = start, .end = start + size - 1 };
+
+       return offline_and_remove_memory_ranges(&range, 1);
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+
+/**
+ * offline_and_remove_memory_ranges - offline and remove multiple memory ranges
+ * @ranges: array of physical address ranges to offline and remove
+ * @nr_ranges: number of entries in @ranges
+ *
+ * Offline and remove several memory ranges as one operation, serialized
+ * against other hotplug operations by a single lock_device_hotplug().
+ *
+ * This offlines all ranges before removing any of them.  If offlining any
+ * range fails, the entire process is reverted and nothing is removed.
+ * This provides a fully atomic semantic for unplugging an entire device.
+ *
+ * Each range must be memory-block aligned in start and size.
+ *
+ * Return: 0 on success, negative errno otherwise.  On failure no range has
+ * been removed.
+ */
+int offline_and_remove_memory_ranges(const struct range *ranges, int nr_ranges)
+{
+       unsigned long mb_total = 0;
        uint8_t *online_types, *tmp;
-       int rc;
+       int i, rc = 0;
 
-       if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
-           !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+       if (!ranges || nr_ranges <= 0)
                return -EINVAL;
 
+       for (i = 0; i < nr_ranges; i++) {
+               u64 start = ranges[i].start;
+               u64 size = range_len(&ranges[i]);
+
+               if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+                   !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+                       return -EINVAL;
+               mb_total += size / memory_block_size_bytes();
+       }
+
        /*
-        * We'll remember the old online type of each memory block, so we can
-        * try to revert whatever we did when offlining one memory block fails
-        * after offlining some others succeeded.
+        * Remember the old online type of every memory block across all ranges,
+        * so we can revert if offlining a later block fails.  All entries start
+        * as MMOP_OFFLINE so blocks we never touched are skipped on rollback.
         */
-       online_types = kmalloc_array(mb_count, sizeof(*online_types),
+       online_types = kmalloc_array(mb_total, sizeof(*online_types),
                                     GFP_KERNEL);
        if (!online_types)
                return -ENOMEM;
-       /*
-        * Initialize all states to MMOP_OFFLINE, so when we abort processing in
-        * try_offline_memory_block(), we'll skip all unprocessed blocks in
-        * try_reonline_memory_block().
-        */
-       memset(online_types, MMOP_OFFLINE, mb_count);
+       memset(online_types, MMOP_OFFLINE, mb_total);
 
        lock_device_hotplug();
 
+       /* Phase 1: offline every block in every range. */
        tmp = online_types;
-       rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
+       for (i = 0; i < nr_ranges; i++) {
+               rc = walk_memory_blocks(ranges[i].start, range_len(&ranges[i]),
+                                       &tmp, try_offline_memory_block);
+               if (rc)
+                       break;
+       }
 
        /*
-        * In case we succeeded to offline all memory, remove it.
-        * This cannot fail as it cannot get onlined in the meantime.
+        * Phase 2: Remove each range. This essentially cannot fail as we hold
+        * the hotplug lock . WARN if that assumption is ever broken.
         */
        if (!rc) {
-               rc = try_remove_memory(start, size);
-               if (rc)
-                       pr_err("%s: Failed to remove memory: %d", __func__, rc);
+               for (i = 0; i < nr_ranges; i++) {
+                       rc = try_remove_memory(ranges[i].start,
+                                              range_len(&ranges[i]));
+                       if (WARN_ON_ONCE(rc)) {
+                               pr_err("%s: Failed to remove memory: %d",
+                                      __func__, rc);
+                               break;
+                       }
+               }
        }
 
-       /*
-        * Rollback what we did. While memory onlining might theoretically fail
-        * (nacked by a notifier), it barely ever happens.
-        */
+       /* On fail: roll back. Blocks that were already offline are skipped */
        if (rc) {
                tmp = online_types;
-               walk_memory_blocks(start, size, &tmp,
-                                  try_reonline_memory_block);
+               for (i = 0; i < nr_ranges; i++)
+                       walk_memory_blocks(ranges[i].start,
+                                          range_len(&ranges[i]), &tmp,
+                                          try_reonline_memory_block);
        }
        unlock_device_hotplug();
 
        kfree(online_types);
        return rc;
 }
-EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+EXPORT_SYMBOL_GPL(offline_and_remove_memory_ranges);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
-- 
2.54.0


Reply via email to