The commit is pushed to "branch-rh10-6.12.0-55.52.1.4.x.vz10-ovz" and will
appear at [email protected]:openvz/vzkernel.git
after rh10-6.12.0-55.52.1.4.10.vz10
------>
commit 8c0b4b2d5a2b665640b114fd9428077af23ec7ba
Author: Dmitry Sepp <[email protected]>
Date: Thu Mar 19 09:47:18 2026 +0000
sched: Clean up vCPU handling logic
The idea behind the change is to transition from the existing spatial
vCPU handling approach that introduces costly modifications to the
scheduling logic to ensure the requested CPU count is obeyed (10%+
performance drop in some tests) to temporal isolation that can be
provided by the cgroup2 cpu.max.
Drop the unneeded legacy vCPU handling code. Remove the 'cpu.rate'
control in favor of the internal calculation based on 'quota' and
'period' from 'cpu.max'. As 'cpu.max' is not implicitly used to set the
rate, do not override nr_cpus when handling writes to 'cpu.max'.
https://virtuozzo.atlassian.net/browse/VSTOR-124385
Signed-off-by: Dmitry Sepp <[email protected]>
======
Patchset description:
sched: Clean up vCPU handling code
The idea behind the change is to transition from the existing spatial
vCPU handling approach that introduces costly modifications to the
scheduling logic to ensure the requested CPU count is obeyed
(10%+ performance drop in some tests, see below) to
temporal isolation that can be provided by the cgroup2 cpu.max.
Reference test results:
1. Clean setup, no vCPU related modifications:
~/at_process_ctxswitch_pipe -w -p 2 -t 15
rate_total: 856509.625000, avg: 428254.812500
2. vCPU related modifications (present state):
~/at_process_ctxswitch_pipe -w -p 2 -t 15
rate_total: 735626.812500, avg: 367813.406250
3. Cleaned-up vCPU handling:
~/at_process_ctxswitch_pipe -w -p 2 -t 15
rate_total: 840074.750000, avg: 420037.375000
Feature: sched: ability to limit number of CPUs available to a CT
---
include/linux/sched.h | 6 -
include/linux/sched/topology.h | 5 -
kernel/sched/core.c | 98 ++--------
kernel/sched/fair.c | 408 -----------------------------------------
kernel/sched/sched.h | 10 -
5 files changed, 12 insertions(+), 515 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f7892c449d21..493073a97f023 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -527,9 +527,6 @@ struct sched_statistics {
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
-#ifdef CONFIG_CFS_CPULIMIT
- u64 nr_failed_migrations_cpulimit;
-#endif
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
@@ -558,9 +555,6 @@ struct sched_entity {
u64 min_slice;
struct list_head group_node;
-#ifdef CONFIG_CFS_CPULIMIT
- struct list_head cfs_rq_node;
-#endif
unsigned char on_rq;
unsigned char sched_delayed;
unsigned char rel_deadline;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 1f13b26efef5c..4237daa5ac7a2 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -125,11 +125,6 @@ struct sched_domain {
unsigned int alb_failed;
unsigned int alb_pushed;
- /* cpulimit balancing */
- unsigned int clb_count;
- unsigned int clb_failed;
- unsigned int clb_pushed;
-
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28fb5d0ecd898..cf2afc1307c5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8710,9 +8710,6 @@ void __init sched_init(void)
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-#ifdef CONFIG_CFS_CPULIMIT
- root_task_group.topmost_limited_ancestor = &root_task_group;
-#endif
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -9149,8 +9146,6 @@ struct task_group *sched_create_group(struct task_group
*parent)
return ERR_PTR(-ENOMEM);
}
-static void tg_update_topmost_limited_ancestor(struct task_group *tg);
-
void sched_online_group(struct task_group *tg, struct task_group *parent)
{
unsigned long flags;
@@ -9164,9 +9159,6 @@ void sched_online_group(struct task_group *tg, struct
task_group *parent)
tg->parent = parent;
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
-#ifdef CONFIG_CFS_BANDWIDTH
- tg_update_topmost_limited_ancestor(tg);
-#endif
spin_unlock_irqrestore(&task_group_lock, flags);
online_fair_sched_group(tg);
@@ -9650,7 +9642,6 @@ static const u64 min_cfs_quota_period = 1 *
NSEC_PER_MSEC; /* 1ms */
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static void tg_limit_toggled(struct task_group *tg);
static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
u64 burst)
@@ -9730,10 +9721,6 @@ static int __tg_set_cfs_bandwidth(struct task_group *tg,
u64 period, u64 quota,
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
}
-
- if (runtime_enabled != runtime_was_enabled)
- tg_limit_toggled(tg);
-
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
@@ -9746,9 +9733,17 @@ static int tg_set_cfs_bandwidth(struct task_group *tg,
u64 period, u64 quota,
u64 burst)
{
int ret;
+ unsigned int nr_cpus;
guard(cpus_read_lock)();
guard(mutex)(&cfs_constraints_mutex);
+
+ if (tg->nr_cpus != 0) {
+ nr_cpus = DIV_ROUND_UP_ULL(quota, period);
+ if (nr_cpus > tg->nr_cpus)
+ return -EINVAL;
+ }
+
ret = __tg_set_cfs_bandwidth(tg, period, quota, burst);
tg_update_cpu_limit(tg);
@@ -10002,49 +9997,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf,
void *v)
}
#ifdef CONFIG_CFS_CPULIMIT
-static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void
*unused)
-{
- struct task_group *parent = tg->parent;
-
- /*
- * Parent and none of its uncestors is limited? The task group should
- * become a topmost limited uncestor then, provided it has a limit set.
- * Otherwise inherit topmost limited ancestor from the parent.
- */
- if (parent->topmost_limited_ancestor == parent &&
- parent->cfs_bandwidth.quota == RUNTIME_INF)
- tg->topmost_limited_ancestor = tg;
- else
- tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
- return 0;
-}
-
-static void tg_update_topmost_limited_ancestor(struct task_group *tg)
-{
- __tg_update_topmost_limited_ancestor(tg, NULL);
-}
-
-static void tg_limit_toggled(struct task_group *tg)
-{
- if (tg->topmost_limited_ancestor != tg) {
- /*
- * This task group is not a topmost limited ancestor, so both
- * it and all its children must already point to their topmost
- * limited ancestor, and we have nothing to do.
- */
- return;
- }
-
- /*
- * This task group is a topmost limited ancestor. Walk over all its
- * children and update their pointers to the topmost limited ancestor.
- */
-
- spin_lock_irq(&task_group_lock);
- walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop,
NULL);
- spin_unlock_irq(&task_group_lock);
-}
-
static void tg_update_cpu_limit(struct task_group *tg)
{
long quota, period;
@@ -10059,14 +10011,13 @@ static void tg_update_cpu_limit(struct task_group *tg)
}
tg->cpu_rate = rate;
- tg->nr_cpus = 0;
}
-static int tg_set_cpu_limit(struct task_group *tg,
- unsigned long cpu_rate, unsigned int nr_cpus)
+static int tg_set_cpu_limit(struct task_group *tg, unsigned int nr_cpus)
{
int ret;
unsigned long rate;
+ unsigned long cpu_rate = tg->cpu_rate;
u64 quota = RUNTIME_INF;
u64 burst = tg_get_cfs_burst(tg);
u64 period = default_cfs_period();
@@ -10090,21 +10041,6 @@ static int tg_set_cpu_limit(struct task_group *tg,
return ret;
}
-static u64 cpu_rate_read_u64(struct cgroup_subsys_state *css, struct cftype
*cft)
-{
- return css_tg(css)->cpu_rate;
-}
-
-static int cpu_rate_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 rate)
-{
- struct task_group *tg = css_tg(css);
-
- if (rate > num_online_cpus() * MAX_CPU_RATE)
- rate = num_online_cpus() * MAX_CPU_RATE;
- return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
-}
-
static u64 nr_cpus_read_u64(struct cgroup_subsys_state *css, struct cftype
*cft)
{
return css_tg(css)->nr_cpus;
@@ -10117,15 +10053,9 @@ static int nr_cpus_write_u64(struct
cgroup_subsys_state *css,
if (nr_cpus > num_online_cpus())
nr_cpus = num_online_cpus();
- return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
+ return tg_set_cpu_limit(tg, nr_cpus);
}
#else
-static void tg_update_topmost_limited_ancestor(struct task_group *tg)
-{
-}
-static void tg_limit_toggled(struct task_group *tg)
-{
-}
static void tg_update_cpu_limit(struct task_group *tg)
{
}
@@ -10257,13 +10187,9 @@ static struct cftype cpu_legacy_files[] = {
},
#endif
#ifdef CONFIG_CFS_CPULIMIT
- {
- .name = "rate",
- .read_u64 = cpu_rate_read_u64,
- .write_u64 = cpu_rate_write_u64,
- },
{
.name = "nr_cpus",
+ .flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = nr_cpus_read_u64,
.write_u64 = nr_cpus_write_u64,
},
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5879d9a999089..f8d9d9ac0e83e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -129,10 +129,6 @@ static unsigned int sysctl_sched_cfs_bandwidth_slice
= 5000UL;
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
-#ifdef CONFIG_CFS_CPULIMIT
-unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
-#endif
-
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -155,16 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
.extra1 = SYSCTL_ZERO,
},
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_CFS_CPULIMIT
- {
- .procname = "sched_vcpu_hotslice",
- .data = &sysctl_sched_vcpu_hotslice,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif
};
static int __init sched_fair_sysctl_init(void)
@@ -530,88 +516,6 @@ static int se_is_idle(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_CFS_CPULIMIT
-static int cfs_rq_active(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->active;
-}
-
-static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
-{
- /* if we canceled delayed dec, there is no need to do inc */
- if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
- atomic_inc(&cfs_rq->tg->nr_cpus_active);
- cfs_rq->active = 1;
-}
-
-static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
-{
- if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
- postpone = 0;
-
- if (!postpone) {
- cfs_rq->active = 0;
- atomic_dec(&cfs_rq->tg->nr_cpus_active);
- } else {
- hrtimer_start_range_ns(&cfs_rq->active_timer,
- ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
- HRTIMER_MODE_REL_PINNED);
- }
-}
-
-static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
-{
- struct cfs_rq *cfs_rq =
- container_of(timer, struct cfs_rq, active_timer);
- struct rq *rq = rq_of(cfs_rq);
- unsigned long flags;
-
- raw_spin_rq_lock_irqsave(rq, flags);
- cfs_rq->active = !list_empty(&cfs_rq->tasks);
- raw_spin_rq_unlock_irqrestore(rq, flags);
-
- atomic_dec(&cfs_rq->tg->nr_cpus_active);
-
- return HRTIMER_NORESTART;
-}
-
-static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
-{
- int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
- int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
-
- nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
- min_t(int, nr_cpus_limit, tg->nr_cpus) :
- max_t(int, nr_cpus_limit, tg->nr_cpus);
-
- if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
- return 1;
-
- if (nr_cpus_active > nr_cpus_limit)
- return -1;
-
- return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
-}
-#else /* !CONFIG_CFS_CPULIMIT */
-static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
-{
-}
-
-static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
-{
-}
-
-static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer
*timer)
-{
- return 0;
-}
-
-static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
-{
- return 1;
-}
-#endif /* CONFIG_CFS_CPULIMIT */
-
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -3771,9 +3675,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct
sched_entity *se)
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
-#ifdef CONFIG_CFS_CPULIMIT
- list_add(&se->cfs_rq_node, &cfs_rq->tasks);
-#endif
}
#endif
cfs_rq->nr_running++;
@@ -3789,9 +3690,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct
sched_entity *se)
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
-#ifdef CONFIG_CFS_CPULIMIT
- list_del(&se->cfs_rq_node);
-#endif
}
#endif
cfs_rq->nr_running--;
@@ -5393,8 +5291,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity
*se, int flags)
{
bool curr = cfs_rq->curr == se;
- if (!cfs_rq->load.weight)
- inc_nr_active_cfs_rqs(cfs_rq);
/*
* If we're the current task, we must renormalise before calling
* update_curr().
@@ -5600,9 +5496,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity
*se, int flags)
if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
- if (!cfs_rq->load.weight)
- dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
-
return true;
}
@@ -6648,10 +6541,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#ifdef CONFIG_CFS_CPULIMIT
- hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_rq->active_timer.function = sched_cfs_active_timer;
-#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7294,9 +7183,6 @@ static bool dequeue_task_fair(struct rq *rq, struct
task_struct *p, int flags)
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
-#ifdef CONFIG_CFS_CPULIMIT
-static DEFINE_PER_CPU(struct balance_callback, cpulimit_cb_head);
-#endif
#ifdef CONFIG_NO_HZ_COMMON
@@ -8656,38 +8542,6 @@ static int find_energy_efficient_cpu(struct task_struct
*p, int prev_cpu)
return target;
}
-static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
-{
-#ifdef CONFIG_CFS_CPULIMIT
- struct task_group *tg;
- struct sched_domain *sd;
- int prev_cpu = task_cpu(p);
- int cpu;
-
- tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
- if (check_cpulimit_spread(tg, *new_cpu) > 0)
- return false;
-
- if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
- return true;
-
- if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
- *new_cpu = prev_cpu;
- return true;
- }
-
- for_each_domain(*new_cpu, sd) {
- for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
- if (cfs_rq_active(tg->cfs_rq[cpu])) {
- *new_cpu = cpu;
- return true;
- }
- }
- }
-#endif
- return false;
-}
-
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
@@ -8756,9 +8610,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu,
int wake_flags)
break;
}
- if (select_runnable_cpu(p, &new_cpu))
- goto unlock;
-
if (unlikely(sd)) {
/* Slow path */
new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu,
sd_flag);
@@ -8766,7 +8617,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu,
int wake_flags)
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
}
-unlock:
rcu_read_unlock();
return new_cpu;
@@ -8992,51 +8842,6 @@ static struct task_struct *pick_task_fair(struct rq *rq)
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool
first);
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool
first);
-#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
-static int cpulimit_balance_cpu_stop(void *data);
-
-static void trigger_cpulimit_balance(struct rq *this_rq)
-{
- struct task_struct *p = this_rq->curr;
- struct task_group *tg;
- int this_cpu, cpu, target_cpu = -1;
- struct sched_domain *sd;
-
- this_cpu = cpu_of(this_rq);
-
- if (!p->se.on_rq || this_rq->active_balance)
- return;
-
- tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
- if (check_cpulimit_spread(tg, this_cpu) >= 0)
- return;
-
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- for_each_cpu_and(cpu, sched_domain_span(sd),
- p->cpus_ptr) {
- if (cpu != this_cpu &&
- cfs_rq_active(tg->cfs_rq[cpu])) {
- target_cpu = cpu;
- goto unlock;
- }
- }
- }
-unlock:
- rcu_read_unlock();
-
- if (target_cpu >= 0) {
- this_rq->active_balance = 1;
- this_rq->push_cpu = target_cpu;
- raw_spin_rq_unlock(this_rq);
- stop_one_cpu_nowait(this_rq->cpu,
- cpulimit_balance_cpu_stop, this_rq,
- &this_rq->active_balance_work);
- raw_spin_rq_lock(this_rq);
- }
-}
-#endif
-
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags
*rf)
{
@@ -9091,20 +8896,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct
*prev, struct rq_flags *rf
__set_next_task_fair(rq, p, true);
}
-#ifdef CONFIG_CFS_CPULIMIT
- queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
trigger_cpulimit_balance);
-#endif
-
return p;
simple:
#endif
put_prev_set_next_task(rq, prev, p);
-#ifdef CONFIG_CFS_CPULIMIT
- queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
trigger_cpulimit_balance);
-#endif
-
return p;
idle:
@@ -9529,37 +9326,6 @@ static inline int migrate_degrades_locality(struct
task_struct *p,
}
#endif
-static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
-{
-#ifdef CONFIG_CFS_CPULIMIT
- struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
-
- if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
- int cpu;
-
- schedstat_inc(p->stats.nr_failed_migrations_cpulimit);
-
- env->flags |= LBF_SOME_PINNED;
-
- if (check_cpulimit_spread(tg, env->src_cpu) != 0)
- return 0;
-
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
- return 0;
-
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cfs_rq_active(tg->cfs_rq[cpu])) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
- }
- return 0;
- }
-#endif
- return 1;
-}
-
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
@@ -9570,8 +9336,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env
*env)
lockdep_assert_rq_held(env->src_rq);
- if (!can_migrate_task_cpulimit(p, env))
- return 0;
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -9935,161 +9699,6 @@ static inline void update_blocked_load_tick(struct rq
*rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
{}
#endif
-#ifdef CONFIG_CFS_CPULIMIT
-static unsigned long entity_h_load(struct sched_entity *se);
-
-static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
-{
- struct sched_entity *se;
- struct task_struct *p;
-
- list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
- p = task_of(se);
- if (task_curr(p) ||
- !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
- return 0;
- }
- env->flags &= ~LBF_ALL_PINNED;
- return 1;
-}
-
-static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
-{
- struct sched_entity *se, *tmp;
- int moved = 0;
-
- list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
- struct task_struct *p = task_of(se);
- detach_task(p, env);
- attach_task(env->dst_rq, p);
- moved++;
- }
- return moved;
-}
-
-static int move_task_groups(struct lb_env *env)
-{
- struct cfs_rq *cfs_rq, *pos;
- struct task_group *tg;
- unsigned long load;
- int cur_pulled, pulled = 0;
-
- if (env->imbalance <= 0)
- return 0;
-
- for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
- if (cfs_rq->tg == &root_task_group)
- continue;
- /*
- * A child always goes before its parent in a leaf_cfs_rq_list.
- * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
- * we could not migrate the child and therefore we should not
- * even try to migrate the parent.
- */
- if (cfs_rq->nr_running != cfs_rq->h_nr_running)
- continue;
-
- tg = cfs_rq->tg->topmost_limited_ancestor;
-
- if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
- cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
- continue;
-
- load = entity_h_load(tg->se[env->src_cpu]);
- if ((load / 2) > env->imbalance)
- continue;
-
- if (!can_migrate_task_group(cfs_rq, env))
- continue;
-
- cur_pulled = move_task_group(cfs_rq, env);
- pulled += cur_pulled;
- env->imbalance -= load;
-
- env->loop += cur_pulled;
- if (env->loop > env->loop_max)
- break;
-
- if (env->imbalance <= 0)
- break;
- }
- return pulled;
-}
-
-static int do_cpulimit_balance(struct lb_env *env)
-{
- struct cfs_rq *cfs_rq, *pos;
- struct task_group *tg;
- int pushed = 0;
-
- for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
- if (cfs_rq->tg == &root_task_group)
- continue;
- /* see move_task_groups for why we skip such groups */
- if (cfs_rq->nr_running != cfs_rq->h_nr_running)
- continue;
- tg = cfs_rq->tg->topmost_limited_ancestor;
- if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
- cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
- can_migrate_task_group(cfs_rq, env))
- pushed += move_task_group(cfs_rq, env);
- }
- return pushed;
-}
-
-static int cpulimit_balance_cpu_stop(void *data)
-{
- struct rq *rq = data;
- int cpu = cpu_of(rq);
- int target_cpu = rq->push_cpu;
- struct rq *target_rq = cpu_rq(target_cpu);
- struct sched_domain *sd;
-
- raw_spin_rq_lock_irq(rq);
-
- if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
- !cpu_online(target_cpu)))
- goto out_unlock;
-
- if (unlikely(!rq->nr_running))
- goto out_unlock;
-
- BUG_ON(rq == target_rq);
-
- double_lock_balance(rq, target_rq);
- rcu_read_lock();
- for_each_domain(target_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
- break;
- }
- if (likely(sd)) {
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = cpu,
- .src_rq = rq,
- };
-
- schedstat_inc(sd->clb_count);
-
- update_rq_clock(rq);
- update_rq_clock(target_rq);
- if (do_cpulimit_balance(&env))
- schedstat_inc(sd->clb_pushed);
- else
- schedstat_inc(sd->clb_failed);
- }
- rcu_read_unlock();
- double_unlock_balance(rq, target_rq);
-
-out_unlock:
- rq->active_balance = 0;
- raw_spin_rq_unlock_irq(rq);
- return 0;
-}
-#endif /* CONFIG_CFS_CPULIMIT */
-
static bool __update_blocked_others(struct rq *rq, bool *done)
{
bool updated;
@@ -12126,20 +11735,6 @@ static int sched_balance_rq(int this_cpu, struct rq
*this_rq,
local_irq_restore(rf.flags);
-#ifdef CONFIG_CFS_CPULIMIT
- if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
- env.loop = 0;
- local_irq_save(rf.flags);
- double_rq_lock(env.dst_rq, busiest);
- rq_repin_lock(busiest, &rf);
- update_rq_clock(env.dst_rq);
- cur_ld_moved = ld_moved = move_task_groups(&env);
- rq_unpin_lock(busiest, &rf);
- double_rq_unlock(env.dst_rq, busiest);
- local_irq_restore(rf.flags);
- }
-#endif
-
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
@@ -13640,9 +13235,6 @@ static void set_next_task_fair(struct rq *rq, struct
task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
-#ifdef CONFIG_CFS_CPULIMIT
- INIT_LIST_HEAD(&cfs_rq->tasks);
-#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0d3ff69581990..dd5f33d978d3e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -515,9 +515,6 @@ struct task_group {
#define MAX_CPU_RATE 1024
unsigned long cpu_rate;
unsigned int nr_cpus;
- atomic_t nr_cpus_active;
- struct task_group *topmost_limited_ancestor; /* self if none of the
- ancestors is limited */
#endif
};
@@ -696,9 +693,6 @@ struct cfs_rq {
#endif
struct rb_root_cached tasks_timeline;
-#ifdef CONFIG_CFS_CPULIMIT
- struct list_head tasks;
-#endif
/*
* 'curr' points to currently running entity on this cfs_rq.
@@ -781,10 +775,6 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
#endif /* CONFIG_CFS_BANDWIDTH */
-#ifdef CONFIG_CFS_CPULIMIT
- int active;
- struct hrtimer active_timer;
-#endif /* CONFIG_CFS_CPULIMIT */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel