mems_default is the intersection of effective_mems and default_sysram_nodes. This allows hotplugged memory nodes to be marked "protected". A protected node's memory is not default-allocable via standard methods (basic page faults, mempolicies, etc.).
When checking node_allowed, check for GFP_PROTECTED to determine if the check should be made against mems_default or mems_allowed, since mems_default only contains sysram nodes. Signed-off-by: Gregory Price <[email protected]> --- include/linux/cpuset.h | 8 ++-- kernel/cgroup/cpuset-internal.h | 8 ++++ kernel/cgroup/cpuset-v1.c | 7 +++ kernel/cgroup/cpuset.c | 83 ++++++++++++++++++++++++++------- mm/memcontrol.c | 2 +- mm/mempolicy.c | 8 ++-- mm/migrate.c | 4 +- 7 files changed, 93 insertions(+), 27 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 4db08c580cc3..7f683e4cf6c3 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,7 +77,7 @@ extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); -extern nodemask_t cpuset_mems_allowed(struct task_struct *p); +extern nodemask_t cpuset_mems_default(struct task_struct *p); #define cpuset_current_mems_default (current->mems_default) void cpuset_init_current_mems_default(void); int cpuset_nodemask_valid_mems_default(const nodemask_t *nodemask); @@ -173,7 +173,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } -extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); +extern bool cpuset_node_default(struct cgroup *cgroup, int nid); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -211,7 +211,7 @@ static inline bool cpuset_cpu_is_isolated(int cpu) return false; } -static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) +static inline nodemask_t cpuset_mems_default(struct task_struct *p) { return node_possible_map; } @@ -294,7 +294,7 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } -static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +static inline bool 
cpuset_node_default(struct cgroup *cgroup, int nid) { return true; } diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 337608f408ce..6978e04477b2 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -55,6 +55,7 @@ typedef enum { FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, + FILE_MEMS_DEFAULT, FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, @@ -104,6 +105,13 @@ struct cpuset { cpumask_var_t effective_cpus; nodemask_t effective_mems; + /* + * Default Memory Nodes for tasks. + * This is the intersection of effective_mems and default_sysram_nodes. + * Tasks will have their mems_default set to this value. + */ + nodemask_t mems_default; + /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..a06f2b032e0d 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -293,6 +293,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; + cpuset_update_mems_default(cs); cpuset_callback_unlock_irq(); /* @@ -532,6 +533,12 @@ struct cftype cpuset1_files[] = { .private = FILE_EFFECTIVE_MEMLIST, }, + { + .name = "mems_default", + .seq_show = cpuset_common_seq_show, + .private = FILE_MEMS_DEFAULT, + }, + { .name = "cpu_exclusive", .read_u64 = cpuset_read_u64, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b05c07489a4d..ea5ca1a05cf5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -29,6 +29,7 @@ #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> +#include <linux/memory-tiers.h> #include <linux/export.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -430,9 +431,9 @@ static void guarantee_active_cpus(struct task_struct *tsk, */ static void guarantee_online_mems(struct cpuset 
*cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + while (!nodes_intersects(cs->mems_default, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); + nodes_and(*pmask, cs->mems_default, node_states[N_MEMORY]); } /** @@ -2748,7 +2749,7 @@ void cpuset_update_tasks_nodemask(struct cpuset *cs) migrate = is_memory_migrate(cs); - mpol_rebind_mm(mm, &cs->mems_allowed); + mpol_rebind_mm(mm, &cs->mems_default); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); else @@ -2808,6 +2809,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(cp->mems_default, cp->effective_mems, + default_sysram_nodelist); spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -3234,7 +3238,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * by skipping the task iteration and update. */ if (cpuset_v2() && !cpus_updated && !mems_updated) { - cpuset_attach_nodemask_to = cs->effective_mems; + cpuset_attach_nodemask_to = cs->mems_default; goto out; } @@ -3249,7 +3253,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * if there is no change in effective_mems and CS_MEMORY_MIGRATE is * not set. 
*/ - cpuset_attach_nodemask_to = cs->effective_mems; + cpuset_attach_nodemask_to = cs->mems_default; if (!is_memory_migrate(cs) && !mems_updated) goto out; @@ -3371,6 +3375,9 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_MEMS_DEFAULT: + seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_default)); + break; case FILE_EXCLUSIVE_CPULIST: seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); break; @@ -3482,6 +3489,12 @@ static struct cftype dfl_files[] = { .private = FILE_EFFECTIVE_MEMLIST, }, + { + .name = "mems.default", + .seq_show = cpuset_common_seq_show, + .private = FILE_MEMS_DEFAULT, + }, + { .name = "cpus.partition", .seq_show = cpuset_partition_show, @@ -3585,6 +3598,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(cs->mems_default, cs->effective_mems, + default_sysram_nodelist); } spin_unlock_irq(&callback_lock); @@ -3616,6 +3632,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(cs->mems_default, cs->effective_mems, + default_sysram_nodelist); cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); @@ -3818,6 +3837,9 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.effective_xcpus); cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(top_cpuset.mems_default, top_cpuset.effective_mems, + default_sysram_nodelist); fmeter_init(&top_cpuset.fmeter); 
INIT_LIST_HEAD(&remote_children); @@ -3848,6 +3870,9 @@ hotplug_update_tasks(struct cpuset *cs, spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(cs->mems_default, cs->effective_mems, + default_sysram_nodelist); spin_unlock_irq(&callback_lock); if (cpus_updated) @@ -4039,6 +4064,10 @@ static void cpuset_handle_hotplug(void) if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(top_cpuset.mems_default, + top_cpuset.effective_mems, + default_sysram_nodelist); spin_unlock_irq(&callback_lock); cpuset_update_tasks_nodemask(&top_cpuset); } @@ -4109,6 +4138,9 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; + if (!nodes_empty(default_sysram_nodelist)) + nodes_and(top_cpuset.mems_default, top_cpuset.effective_mems, + default_sysram_nodelist); hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); @@ -4205,22 +4237,27 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) return changed; } +/* + * At this point in time, no hotplug nodes can have been added, so just set + * the mems_default of the init task to the set of N_MEMORY nodes. + */ void __init cpuset_init_current_mems_default(void) { - nodes_setall(current->mems_default); + nodes_clear(current->mems_default); + nodes_or(current->mems_default, current->mems_default, node_states[N_MEMORY]); } /** - * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. - * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. + * cpuset_mems_default - return mems_default mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_default. 
* - * Description: Returns the nodemask_t mems_allowed of the cpuset + * Description: Returns the nodemask_t mems_default of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of node_states[N_MEMORY], even if this means going outside the * tasks cpuset. **/ -nodemask_t cpuset_mems_allowed(struct task_struct *tsk) +nodemask_t cpuset_mems_default(struct task_struct *tsk) { nodemask_t mask; unsigned long flags; @@ -4295,17 +4332,29 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. + * GFP_PROTECTED - allow non-sysram nodes in mems_allowed */ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; + bool protected_node = gfp_mask & __GFP_PROTECTED; if (in_interrupt()) return true; - if (node_isset(node, current->mems_default)) - return true; + + if (protected_node) { + rcu_read_lock(); + cs = task_cs(current); + allowed = node_isset(node, cs->mems_allowed); + rcu_read_unlock(); + } else if (node_isset(node, current->mems_default)) + allowed = true; + + if (allowed) + return allowed; + /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. 
@@ -4322,13 +4371,15 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) spin_lock_irqsave(&callback_lock, flags); cs = nearest_hardwall_ancestor(task_cs(current)); - allowed = node_isset(node, cs->mems_allowed); + allowed = node_isset(node, cs->mems_allowed); /* include protected */ + if (!protected_node && !nodes_empty(default_sysram_nodelist)) + allowed &= node_isset(node, default_sysram_nodelist); spin_unlock_irqrestore(&callback_lock, flags); return allowed; } -bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +bool cpuset_node_default(struct cgroup *cgroup, int nid) { struct cgroup_subsys_state *css; struct cpuset *cs; @@ -4347,7 +4398,7 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid) return true; /* - * Normally, accessing effective_mems would require the cpuset_mutex + * Normally, accessing mems_default would require the cpuset_mutex * or callback_lock - but node_isset is atomic and the reference * taken via cgroup_get_e_css is sufficient to protect css. * @@ -4359,7 +4410,7 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid) * cannot make strong isolation guarantees, so this is acceptable. */ cs = container_of(css, struct cpuset, css); - allowed = node_isset(nid, cs->effective_mems); + allowed = node_isset(nid, cs->mems_default); css_put(css); return allowed; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f4..a25584cb281e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5599,5 +5599,5 @@ subsys_initcall(mem_cgroup_swap_init); bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { - return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; + return memcg ? 
cpuset_node_default(memcg->css.cgroup, nid) : true; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6225d4d23010..5360333dc06d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1831,14 +1831,14 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, } rcu_read_unlock(); - task_nodes = cpuset_mems_allowed(task); + task_nodes = cpuset_mems_default(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } - task_nodes = cpuset_mems_allowed(current); + task_nodes = cpuset_mems_default(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; @@ -2738,7 +2738,7 @@ int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() - * with the mems_default returned by cpuset_mems_allowed(). This + * with the mems_default returned by cpuset_mems_default(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). 
* @@ -2763,7 +2763,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) *new = *old; if (current_cpuset_is_being_rebound()) { - nodemask_t mems = cpuset_mems_allowed(current); + nodemask_t mems = cpuset_mems_default(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); diff --git a/mm/migrate.c b/mm/migrate.c index c0e9f15be2a2..f9a910b43a9f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2526,7 +2526,7 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) */ if (!pid) { mmget(current->mm); - *mem_nodes = cpuset_mems_allowed(current); + *mem_nodes = cpuset_mems_default(current); return current->mm; } @@ -2547,7 +2547,7 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) goto out; - *mem_nodes = cpuset_mems_allowed(task); + *mem_nodes = cpuset_mems_default(task); mm = get_task_mm(task); out: put_task_struct(task); -- 2.51.1

