Looks good.
Some more things to improve:
1) we can remove cfs_rq.tasks, as it looks unused after this patch, along
with its initialization in init_cfs_rq()
2) remove sched_domain.clb_{count,failed,pushed}, which also look unused
On 3/16/26 16:17, Dmitry Sepp wrote:
> The idea behind the change is to transition from the existing spatial
> vCPU handling approach that introduces costly modification to the
> scheduling logic to ensure the requested CPU count is obeyed (10%+
> performance drop in some tests) to temporal isolation that can be
> provided by the cgroup2 cpu.max.
>
> Drop the legacy unneeded vCPU handling code. Remove the 'cpu.rate'
> control in favor of the internal calculation based on 'quota' and
> 'period' from 'cpu.max'.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-124385
>
> Signed-off-by: Dmitry Sepp <[email protected]>
> ---
> include/linux/sched.h | 6 -
> kernel/sched/core.c | 89 +---------
> kernel/sched/fair.c | 405 ------------------------------------------
> kernel/sched/sched.h | 3 -
> 4 files changed, 3 insertions(+), 500 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0f7892c449d2..493073a97f02 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -527,9 +527,6 @@ struct sched_statistics {
> u64 nr_migrations_cold;
> u64 nr_failed_migrations_affine;
> u64 nr_failed_migrations_running;
> -#ifdef CONFIG_CFS_CPULIMIT
> - u64 nr_failed_migrations_cpulimit;
> -#endif
> u64 nr_failed_migrations_hot;
> u64 nr_forced_migrations;
>
> @@ -558,9 +555,6 @@ struct sched_entity {
> u64 min_slice;
>
> struct list_head group_node;
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct list_head cfs_rq_node;
> -#endif
> unsigned char on_rq;
> unsigned char sched_delayed;
> unsigned char rel_deadline;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 28fb5d0ecd89..f66ee9d07387 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8710,9 +8710,6 @@ void __init sched_init(void)
> INIT_LIST_HEAD(&root_task_group.children);
> INIT_LIST_HEAD(&root_task_group.siblings);
> autogroup_init(&init_task);
> -#ifdef CONFIG_CFS_CPULIMIT
> - root_task_group.topmost_limited_ancestor = &root_task_group;
> -#endif
> #endif /* CONFIG_CGROUP_SCHED */
>
> for_each_possible_cpu(i) {
> @@ -9149,8 +9146,6 @@ struct task_group *sched_create_group(struct task_group
> *parent)
> return ERR_PTR(-ENOMEM);
> }
>
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg);
> -
> void sched_online_group(struct task_group *tg, struct task_group *parent)
> {
> unsigned long flags;
> @@ -9164,9 +9159,6 @@ void sched_online_group(struct task_group *tg, struct
> task_group *parent)
> tg->parent = parent;
> INIT_LIST_HEAD(&tg->children);
> list_add_rcu(&tg->siblings, &parent->children);
> -#ifdef CONFIG_CFS_BANDWIDTH
> - tg_update_topmost_limited_ancestor(tg);
> -#endif
> spin_unlock_irqrestore(&task_group_lock, flags);
>
> online_fair_sched_group(tg);
> @@ -9650,7 +9642,6 @@ static const u64 min_cfs_quota_period = 1 *
> NSEC_PER_MSEC; /* 1ms */
> static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>
> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
> -static void tg_limit_toggled(struct task_group *tg);
>
> static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64
> quota,
> u64 burst)
> @@ -9730,10 +9721,6 @@ static int __tg_set_cfs_bandwidth(struct task_group
> *tg, u64 period, u64 quota,
> if (cfs_rq->throttled)
> unthrottle_cfs_rq(cfs_rq);
> }
> -
> - if (runtime_enabled != runtime_was_enabled)
> - tg_limit_toggled(tg);
> -
> if (runtime_was_enabled && !runtime_enabled)
> cfs_bandwidth_usage_dec();
>
> @@ -10002,49 +9989,6 @@ static int cpu_cfs_local_stat_show(struct seq_file
> *sf, void *v)
> }
>
> #ifdef CONFIG_CFS_CPULIMIT
> -static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void
> *unused)
> -{
> - struct task_group *parent = tg->parent;
> -
> - /*
> - * Parent and none of its uncestors is limited? The task group should
> - * become a topmost limited uncestor then, provided it has a limit set.
> - * Otherwise inherit topmost limited ancestor from the parent.
> - */
> - if (parent->topmost_limited_ancestor == parent &&
> - parent->cfs_bandwidth.quota == RUNTIME_INF)
> - tg->topmost_limited_ancestor = tg;
> - else
> - tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
> - return 0;
> -}
> -
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> - __tg_update_topmost_limited_ancestor(tg, NULL);
> -}
> -
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> - if (tg->topmost_limited_ancestor != tg) {
> - /*
> - * This task group is not a topmost limited ancestor, so both
> - * it and all its children must already point to their topmost
> - * limited ancestor, and we have nothing to do.
> - */
> - return;
> - }
> -
> - /*
> - * This task group is a topmost limited ancestor. Walk over all its
> - * children and update their pointers to the topmost limited ancestor.
> - */
> -
> - spin_lock_irq(&task_group_lock);
> - walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop,
> NULL);
> - spin_unlock_irq(&task_group_lock);
> -}
> -
> static void tg_update_cpu_limit(struct task_group *tg)
> {
> long quota, period;
> @@ -10059,14 +10003,13 @@ static void tg_update_cpu_limit(struct task_group
> *tg)
> }
>
> tg->cpu_rate = rate;
> - tg->nr_cpus = 0;
> }
>
> -static int tg_set_cpu_limit(struct task_group *tg,
> - unsigned long cpu_rate, unsigned int nr_cpus)
> +static int tg_set_cpu_limit(struct task_group *tg, unsigned int nr_cpus)
> {
> int ret;
> unsigned long rate;
> + unsigned long cpu_rate = tg->cpu_rate;
> u64 quota = RUNTIME_INF;
> u64 burst = tg_get_cfs_burst(tg);
> u64 period = default_cfs_period();
> @@ -10090,21 +10033,6 @@ static int tg_set_cpu_limit(struct task_group *tg,
> return ret;
> }
>
> -static u64 cpu_rate_read_u64(struct cgroup_subsys_state *css, struct cftype
> *cft)
> -{
> - return css_tg(css)->cpu_rate;
> -}
> -
> -static int cpu_rate_write_u64(struct cgroup_subsys_state *css,
> - struct cftype *cftype, u64 rate)
> -{
> - struct task_group *tg = css_tg(css);
> -
> - if (rate > num_online_cpus() * MAX_CPU_RATE)
> - rate = num_online_cpus() * MAX_CPU_RATE;
> - return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
> -}
> -
> static u64 nr_cpus_read_u64(struct cgroup_subsys_state *css, struct cftype
> *cft)
> {
> return css_tg(css)->nr_cpus;
> @@ -10117,15 +10045,9 @@ static int nr_cpus_write_u64(struct
> cgroup_subsys_state *css,
>
> if (nr_cpus > num_online_cpus())
> nr_cpus = num_online_cpus();
> - return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
> + return tg_set_cpu_limit(tg, nr_cpus);
> }
> #else
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> -}
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> -}
> static void tg_update_cpu_limit(struct task_group *tg)
> {
> }
> @@ -10257,11 +10179,6 @@ static struct cftype cpu_legacy_files[] = {
> },
> #endif
> #ifdef CONFIG_CFS_CPULIMIT
> - {
> - .name = "rate",
> - .read_u64 = cpu_rate_read_u64,
> - .write_u64 = cpu_rate_write_u64,
> - },
> {
> .name = "nr_cpus",
> .read_u64 = nr_cpus_read_u64,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5879d9a99908..21a3981a89f1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -129,10 +129,6 @@ static unsigned int sysctl_sched_cfs_bandwidth_slice
> = 5000UL;
> static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
> #endif
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
> -#endif
> -
> #ifdef CONFIG_SYSCTL
> static struct ctl_table sched_fair_sysctls[] = {
> #ifdef CONFIG_CFS_BANDWIDTH
> @@ -155,16 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
> .extra1 = SYSCTL_ZERO,
> },
> #endif /* CONFIG_NUMA_BALANCING */
> -#ifdef CONFIG_CFS_CPULIMIT
> - {
> - .procname = "sched_vcpu_hotslice",
> - .data = &sysctl_sched_vcpu_hotslice,
> - .maxlen = sizeof(unsigned int),
> - .mode = 0644,
> - .proc_handler = proc_dointvec_minmax,
> - .extra1 = SYSCTL_ZERO,
> - },
> -#endif
> };
>
> static int __init sched_fair_sysctl_init(void)
> @@ -530,88 +516,6 @@ static int se_is_idle(struct sched_entity *se)
>
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -static int cfs_rq_active(struct cfs_rq *cfs_rq)
> -{
> - return cfs_rq->active;
> -}
> -
> -static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> - /* if we canceled delayed dec, there is no need to do inc */
> - if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
> - atomic_inc(&cfs_rq->tg->nr_cpus_active);
> - cfs_rq->active = 1;
> -}
> -
> -static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> - if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
> - postpone = 0;
> -
> - if (!postpone) {
> - cfs_rq->active = 0;
> - atomic_dec(&cfs_rq->tg->nr_cpus_active);
> - } else {
> - hrtimer_start_range_ns(&cfs_rq->active_timer,
> - ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
> - HRTIMER_MODE_REL_PINNED);
> - }
> -}
> -
> -static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
> -{
> - struct cfs_rq *cfs_rq =
> - container_of(timer, struct cfs_rq, active_timer);
> - struct rq *rq = rq_of(cfs_rq);
> - unsigned long flags;
> -
> - raw_spin_rq_lock_irqsave(rq, flags);
> - cfs_rq->active = !list_empty(&cfs_rq->tasks);
> - raw_spin_rq_unlock_irqrestore(rq, flags);
> -
> - atomic_dec(&cfs_rq->tg->nr_cpus_active);
> -
> - return HRTIMER_NORESTART;
> -}
> -
> -static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
> -{
> - int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
> - int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
> -
> - nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
> - min_t(int, nr_cpus_limit, tg->nr_cpus) :
> - max_t(int, nr_cpus_limit, tg->nr_cpus);
> -
> - if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
> - return 1;
> -
> - if (nr_cpus_active > nr_cpus_limit)
> - return -1;
> -
> - return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
> -}
> -#else /* !CONFIG_CFS_CPULIMIT */
> -static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> -}
> -
> -static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> -}
> -
> -static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer
> *timer)
> -{
> - return 0;
> -}
> -
> -static inline int check_cpulimit_spread(struct task_group *tg, int
> target_cpu)
> -{
> - return 1;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
> static __always_inline
> void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
>
> @@ -3771,9 +3675,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct
> sched_entity *se)
>
> account_numa_enqueue(rq, task_of(se));
> list_add(&se->group_node, &rq->cfs_tasks);
> -#ifdef CONFIG_CFS_CPULIMIT
> - list_add(&se->cfs_rq_node, &cfs_rq->tasks);
> -#endif
> }
> #endif
> cfs_rq->nr_running++;
> @@ -3789,9 +3690,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct
> sched_entity *se)
> if (entity_is_task(se)) {
> account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> list_del_init(&se->group_node);
> -#ifdef CONFIG_CFS_CPULIMIT
> - list_del(&se->cfs_rq_node);
> -#endif
> }
> #endif
> cfs_rq->nr_running--;
> @@ -5393,8 +5291,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> {
> bool curr = cfs_rq->curr == se;
>
> - if (!cfs_rq->load.weight)
> - inc_nr_active_cfs_rqs(cfs_rq);
> /*
> * If we're the current task, we must renormalise before calling
> * update_curr().
> @@ -5600,9 +5496,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> if (cfs_rq->nr_running == 0)
> update_idle_cfs_rq_clock_pelt(cfs_rq);
>
> - if (!cfs_rq->load.weight)
> - dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
> -
> return true;
> }
>
> @@ -6648,10 +6541,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> cfs_rq->runtime_enabled = 0;
> INIT_LIST_HEAD(&cfs_rq->throttled_list);
> INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> -#ifdef CONFIG_CFS_CPULIMIT
> - hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> - cfs_rq->active_timer.function = sched_cfs_active_timer;
> -#endif
> }
>
> void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> @@ -7294,9 +7183,6 @@ static bool dequeue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
> static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> -#ifdef CONFIG_CFS_CPULIMIT
> -static DEFINE_PER_CPU(struct balance_callback, cpulimit_cb_head);
> -#endif
>
> #ifdef CONFIG_NO_HZ_COMMON
>
> @@ -8656,38 +8542,6 @@ static int find_energy_efficient_cpu(struct
> task_struct *p, int prev_cpu)
> return target;
> }
>
> -static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct task_group *tg;
> - struct sched_domain *sd;
> - int prev_cpu = task_cpu(p);
> - int cpu;
> -
> - tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, *new_cpu) > 0)
> - return false;
> -
> - if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
> - return true;
> -
> - if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
> - *new_cpu = prev_cpu;
> - return true;
> - }
> -
> - for_each_domain(*new_cpu, sd) {
> - for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
> - if (cfs_rq_active(tg->cfs_rq[cpu])) {
> - *new_cpu = cpu;
> - return true;
> - }
> - }
> - }
> -#endif
> - return false;
> -}
> -
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8756,9 +8610,6 @@ select_task_rq_fair(struct task_struct *p, int
> prev_cpu, int wake_flags)
> break;
> }
>
> - if (select_runnable_cpu(p, &new_cpu))
> - goto unlock;
> -
> if (unlikely(sd)) {
> /* Slow path */
> new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu,
> sd_flag);
> @@ -8766,7 +8617,6 @@ select_task_rq_fair(struct task_struct *p, int
> prev_cpu, int wake_flags)
> /* Fast path */
> new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
> }
> -unlock:
> rcu_read_unlock();
>
> return new_cpu;
> @@ -8992,51 +8842,6 @@ static struct task_struct *pick_task_fair(struct rq
> *rq)
> static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool
> first);
> static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool
> first);
>
> -#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
> -static int cpulimit_balance_cpu_stop(void *data);
> -
> -static void trigger_cpulimit_balance(struct rq *this_rq)
> -{
> - struct task_struct *p = this_rq->curr;
> - struct task_group *tg;
> - int this_cpu, cpu, target_cpu = -1;
> - struct sched_domain *sd;
> -
> - this_cpu = cpu_of(this_rq);
> -
> - if (!p->se.on_rq || this_rq->active_balance)
> - return;
> -
> - tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, this_cpu) >= 0)
> - return;
> -
> - rcu_read_lock();
> - for_each_domain(this_cpu, sd) {
> - for_each_cpu_and(cpu, sched_domain_span(sd),
> - p->cpus_ptr) {
> - if (cpu != this_cpu &&
> - cfs_rq_active(tg->cfs_rq[cpu])) {
> - target_cpu = cpu;
> - goto unlock;
> - }
> - }
> - }
> -unlock:
> - rcu_read_unlock();
> -
> - if (target_cpu >= 0) {
> - this_rq->active_balance = 1;
> - this_rq->push_cpu = target_cpu;
> - raw_spin_rq_unlock(this_rq);
> - stop_one_cpu_nowait(this_rq->cpu,
> - cpulimit_balance_cpu_stop, this_rq,
> - &this_rq->active_balance_work);
> - raw_spin_rq_lock(this_rq);
> - }
> -}
> -#endif
> -
> struct task_struct *
> pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags
> *rf)
> {
> @@ -9091,20 +8896,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct
> *prev, struct rq_flags *rf
> __set_next_task_fair(rq, p, true);
> }
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
> trigger_cpulimit_balance);
> -#endif
> -
> return p;
>
> simple:
> #endif
> put_prev_set_next_task(rq, prev, p);
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
> trigger_cpulimit_balance);
> -#endif
> -
> return p;
>
> idle:
> @@ -9529,37 +9326,6 @@ static inline int migrate_degrades_locality(struct
> task_struct *p,
> }
> #endif
>
> -static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env
> *env)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> -
> - if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
> - int cpu;
> -
> - schedstat_inc(p->stats.nr_failed_migrations_cpulimit);
> -
> - env->flags |= LBF_SOME_PINNED;
> -
> - if (check_cpulimit_spread(tg, env->src_cpu) != 0)
> - return 0;
> -
> - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
> - return 0;
> -
> - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
> - if (cfs_rq_active(tg->cfs_rq[cpu])) {
> - env->flags |= LBF_DST_PINNED;
> - env->new_dst_cpu = cpu;
> - break;
> - }
> - }
> - return 0;
> - }
> -#endif
> - return 1;
> -}
> -
> /*
> * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
> */
> @@ -9570,8 +9336,6 @@ int can_migrate_task(struct task_struct *p, struct
> lb_env *env)
>
> lockdep_assert_rq_held(env->src_rq);
>
> - if (!can_migrate_task_cpulimit(p, env))
> - return 0;
> /*
> * We do not migrate tasks that are:
> * 1) throttled_lb_pair, or
> @@ -9935,161 +9699,6 @@ static inline void update_blocked_load_tick(struct rq
> *rq) {}
> static inline void update_blocked_load_status(struct rq *rq, bool
> has_blocked) {}
> #endif
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -static unsigned long entity_h_load(struct sched_entity *se);
> -
> -static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> - struct sched_entity *se;
> - struct task_struct *p;
> -
> - list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
> - p = task_of(se);
> - if (task_curr(p) ||
> - !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
> - return 0;
> - }
> - env->flags &= ~LBF_ALL_PINNED;
> - return 1;
> -}
> -
> -static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> - struct sched_entity *se, *tmp;
> - int moved = 0;
> -
> - list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
> - struct task_struct *p = task_of(se);
> - detach_task(p, env);
> - attach_task(env->dst_rq, p);
> - moved++;
> - }
> - return moved;
> -}
> -
> -static int move_task_groups(struct lb_env *env)
> -{
> - struct cfs_rq *cfs_rq, *pos;
> - struct task_group *tg;
> - unsigned long load;
> - int cur_pulled, pulled = 0;
> -
> - if (env->imbalance <= 0)
> - return 0;
> -
> - for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> - if (cfs_rq->tg == &root_task_group)
> - continue;
> - /*
> - * A child always goes before its parent in a leaf_cfs_rq_list.
> - * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
> - * we could not migrate the child and therefore we should not
> - * even try to migrate the parent.
> - */
> - if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> - continue;
> -
> - tg = cfs_rq->tg->topmost_limited_ancestor;
> -
> - if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
> - cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
> - continue;
> -
> - load = entity_h_load(tg->se[env->src_cpu]);
> - if ((load / 2) > env->imbalance)
> - continue;
> -
> - if (!can_migrate_task_group(cfs_rq, env))
> - continue;
> -
> - cur_pulled = move_task_group(cfs_rq, env);
> - pulled += cur_pulled;
> - env->imbalance -= load;
> -
> - env->loop += cur_pulled;
> - if (env->loop > env->loop_max)
> - break;
> -
> - if (env->imbalance <= 0)
> - break;
> - }
> - return pulled;
> -}
> -
> -static int do_cpulimit_balance(struct lb_env *env)
> -{
> - struct cfs_rq *cfs_rq, *pos;
> - struct task_group *tg;
> - int pushed = 0;
> -
> - for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> - if (cfs_rq->tg == &root_task_group)
> - continue;
> - /* see move_task_groups for why we skip such groups */
> - if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> - continue;
> - tg = cfs_rq->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
> - cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
> - can_migrate_task_group(cfs_rq, env))
> - pushed += move_task_group(cfs_rq, env);
> - }
> - return pushed;
> -}
> -
> -static int cpulimit_balance_cpu_stop(void *data)
> -{
> - struct rq *rq = data;
> - int cpu = cpu_of(rq);
> - int target_cpu = rq->push_cpu;
> - struct rq *target_rq = cpu_rq(target_cpu);
> - struct sched_domain *sd;
> -
> - raw_spin_rq_lock_irq(rq);
> -
> - if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
> - !cpu_online(target_cpu)))
> - goto out_unlock;
> -
> - if (unlikely(!rq->nr_running))
> - goto out_unlock;
> -
> - BUG_ON(rq == target_rq);
> -
> - double_lock_balance(rq, target_rq);
> - rcu_read_lock();
> - for_each_domain(target_cpu, sd) {
> - if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
> - break;
> - }
> - if (likely(sd)) {
> - struct lb_env env = {
> - .sd = sd,
> - .dst_cpu = target_cpu,
> - .dst_rq = target_rq,
> - .src_cpu = cpu,
> - .src_rq = rq,
> - };
> -
> - schedstat_inc(sd->clb_count);
> -
> - update_rq_clock(rq);
> - update_rq_clock(target_rq);
> - if (do_cpulimit_balance(&env))
> - schedstat_inc(sd->clb_pushed);
> - else
> - schedstat_inc(sd->clb_failed);
> - }
> - rcu_read_unlock();
> - double_unlock_balance(rq, target_rq);
> -
> -out_unlock:
> - rq->active_balance = 0;
> - raw_spin_rq_unlock_irq(rq);
> - return 0;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
> static bool __update_blocked_others(struct rq *rq, bool *done)
> {
> bool updated;
> @@ -12126,20 +11735,6 @@ static int sched_balance_rq(int this_cpu, struct rq
> *this_rq,
>
> local_irq_restore(rf.flags);
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
> - env.loop = 0;
> - local_irq_save(rf.flags);
> - double_rq_lock(env.dst_rq, busiest);
> - rq_repin_lock(busiest, &rf);
> - update_rq_clock(env.dst_rq);
> - cur_ld_moved = ld_moved = move_task_groups(&env);
> - rq_unpin_lock(busiest, &rf);
> - double_rq_unlock(env.dst_rq, busiest);
> - local_irq_restore(rf.flags);
> - }
> -#endif
> -
> if (env.flags & LBF_NEED_BREAK) {
> env.flags &= ~LBF_NEED_BREAK;
> goto more_balance;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0d3ff6958199..b4ad44e3a3ab 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -515,9 +515,6 @@ struct task_group {
> #define MAX_CPU_RATE 1024
> unsigned long cpu_rate;
> unsigned int nr_cpus;
> - atomic_t nr_cpus_active;
> - struct task_group *topmost_limited_ancestor; /* self if none of the
> - ancestors is limited */
> #endif
> };
>
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel