Looks good.
Some more things to improve:
1) we can remove cfs_rq.tasks, as it looks unused after this patch, along
with its initialization in init_cfs_rq()
2) remove sched_domain.clb_{count,failed,pushed}, which also look unused
On 3/16/26 16:17, Dmitry Sepp wrote:
> The idea behind the change is to transition from the existing spatial
> vCPU handling approach that introduces costly modification to the
> scheduling logic to ensure the requested CPU count is obeyed (10%+
> performance drop in some tests) to temporal isolation that can be
> provided by the cgroup2 cpu.max.
>
> Drop the legacy unneeded vCPU handling code. Remove the 'cpu.rate'
> control in favor of the internal calculation based on 'quota' and
> 'period' from 'cpu.max'.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-124385
>
> Signed-off-by: Dmitry Sepp <[email protected]>
> ---
> include/linux/sched.h | 6 -
> kernel/sched/core.c | 89 +---------
> kernel/sched/fair.c | 405 ------------------------------------------
> kernel/sched/sched.h | 3 -
> 4 files changed, 3 insertions(+), 500 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0f7892c449d2..493073a97f02 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -527,9 +527,6 @@ struct sched_statistics {
> u64 nr_migrations_cold;
> u64 nr_failed_migrations_affine;
> u64 nr_failed_migrations_running;
> -#ifdef CONFIG_CFS_CPULIMIT
> - u64 nr_failed_migrations_cpulimit;
> -#endif
> u64 nr_failed_migrations_hot;
> u64 nr_forced_migrations;
>
> @@ -558,9 +555,6 @@ struct sched_entity {
> u64 min_slice;
>
> struct list_head group_node;
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct list_head cfs_rq_node;
> -#endif
> unsigned char on_rq;
> unsigned char sched_delayed;
> unsigned char rel_deadline;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 28fb5d0ecd89..f66ee9d07387 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8710,9 +8710,6 @@ void __init sched_init(void)
> INIT_LIST_HEAD(&root_task_group.children);
> INIT_LIST_HEAD(&root_task_group.siblings);
> autogroup_init(&init_task);
> -#ifdef CONFIG_CFS_CPULIMIT
> - root_task_group.topmost_limited_ancestor = &root_task_group;
> -#endif
> #endif /* CONFIG_CGROUP_SCHED */
>
> for_each_possible_cpu(i) {
> @@ -9149,8 +9146,6 @@ struct task_group *sched_create_group(struct task_group
> *parent)
> return ERR_PTR(-ENOMEM);
> }
>
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg);
> -
> void sched_online_group(struct task_group *tg, struct task_group *parent)
> {
> unsigned long flags;
> @@ -9164,9 +9159,6 @@ void sched_online_group(struct task_group *tg, struct
> task_group *parent)
> tg->parent = parent;
> INIT_LIST_HEAD(&tg->children);
> list_add_rcu(&tg->siblings, &parent->children);
> -#ifdef CONFIG_CFS_BANDWIDTH
> - tg_update_topmost_limited_ancestor(tg);
> -#endif
> spin_unlock_irqrestore(&task_group_lock, flags);
>
> online_fair_sched_group(tg);
> @@ -9650,7 +9642,6 @@ static const u64 min_cfs_quota_period = 1 *
> NSEC_PER_MSEC; /* 1ms */
> static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>
> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
> -static void tg_limit_toggled(struct task_group *tg);
>
> static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64
> quota,
> u64 burst)
> @@ -9730,10 +9721,6 @@ static int __tg_set_cfs_bandwidth(struct task_group
> *tg, u64 period, u64 quota,
> if (cfs_rq->throttled)
> unthrottle_cfs_rq(cfs_rq);
> }
> -
> - if (runtime_enabled != runtime_was_enabled)
> - tg_limit_toggled(tg);
> -
> if (runtime_was_enabled && !runtime_enabled)
> cfs_bandwidth_usage_dec();
>
> @@ -10002,49 +9989,6 @@ static int cpu_cfs_local_stat_show(struct seq_file
> *sf, void *v)
> }
>
> #ifdef CONFIG_CFS_CPULIMIT
> -static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void
> *unused)
> -{
> - struct task_group *parent = tg->parent;
> -
> - /*
> - * Parent and none of its uncestors is limited? The task group should
> - * become a topmost limited uncestor then, provided it has a limit set.
> - * Otherwise inherit topmost limited ancestor from the parent.
> - */
> - if (parent->topmost_limited_ancestor == parent &&
> - parent->cfs_bandwidth.quota == RUNTIME_INF)
> - tg->topmost_limited_ancestor = tg;
> - else
> - tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
> - return 0;
> -}
> -
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> - __tg_update_topmost_limited_ancestor(tg, NULL);
> -}
> -
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> - if (tg->topmost_limited_ancestor != tg) {
> - /*
> - * This task group is not a topmost limited ancestor, so both
> - * it and all its children must already point to their topmost
> - * limited ancestor, and we have nothing to do.
> - */
> - return;
> - }
> -
> - /*
> - * This task group is a topmost limited ancestor. Walk over all its
> - * children and update their pointers to the topmost limited ancestor.
> - */
> -
> - spin_lock_irq(&task_group_lock);
> - walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop,
> NULL);
> - spin_unlock_irq(&task_group_lock);
> -}
> -
> static void tg_update_cpu_limit(struct task_group *tg)
> {
> long quota, period;
> @@ -10059,14 +10003,13 @@ static void tg_update_cpu_limit(struct task_group
> *tg)
> }
>
> tg->cpu_rate = rate;
> - tg->nr_cpus = 0;
> }
>
> -static int tg_set_cpu_limit(struct task_group *tg,
> - unsigned long cpu_rate, unsigned int nr_cpus)
> +static int tg_set_cpu_limit(struct task_group *tg, unsigned int nr_cpus)
> {
> int ret;
> unsigned long rate;
> + unsigned long cpu_rate = tg->cpu_rate;
> u64 quota = RUNTIME_INF;
> u64 burst = tg_get_cfs_burst(tg);
> u64 period = default_cfs_period();
> @@ -10090,21 +10033,6 @@ static int tg_set_cpu_limit(struct task_group *tg,
> return ret;
> }
>
> -static u64 cpu_rate_read_u64(struct cgroup_subsys_state *css, struct cftype
> *cft)
> -{
> - return css_tg(css)->cpu_rate;
> -}
> -
> -static int cpu_rate_write_u64(struct cgroup_subsys_state *css,
> - struct cftype *cftype, u64 rate)
> -{
> - struct task_group *tg = css_tg(css);
> -
> - if (rate > num_online_cpus() * MAX_CPU_RATE)
> - rate = num_online_cpus() * MAX_CPU_RATE;
> - return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
> -}
> -
> static u64 nr_cpus_read_u64(struct cgroup_subsys_state *css, struct cftype
> *cft)
> {
> return css_tg(css)->nr_cpus;
> @@ -10117,15 +10045,9 @@ static int nr_cpus_write_u64(struct
> cgroup_subsys_state *css,
>
> if (nr_cpus > num_online_cpus())
> nr_cpus = num_online_cpus();
> - return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
> + return tg_set_cpu_limit(tg, nr_cpus);
> }
> #else
> -static void tg_update_topmost_limited_ancestor(struct task_group *tg)
> -{
> -}
> -static void tg_limit_toggled(struct task_group *tg)
> -{
> -}
> static void tg_update_cpu_limit(struct task_group *tg)
> {
> }
> @@ -10257,11 +10179,6 @@ static struct cftype cpu_legacy_files[] = {
> },
> #endif
> #ifdef CONFIG_CFS_CPULIMIT
> - {
> - .name = "rate",
> - .read_u64 = cpu_rate_read_u64,
> - .write_u64 = cpu_rate_write_u64,
> - },
> {
> .name = "nr_cpus",
> .read_u64 = nr_cpus_read_u64,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5879d9a99908..21a3981a89f1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -129,10 +129,6 @@ static unsigned int sysctl_sched_cfs_bandwidth_slice
> = 5000UL;
> static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
> #endif
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
> -#endif
> -
> #ifdef CONFIG_SYSCTL
> static struct ctl_table sched_fair_sysctls[] = {
> #ifdef CONFIG_CFS_BANDWIDTH
> @@ -155,16 +151,6 @@ static struct ctl_table sched_fair_sysctls[] = {
> .extra1 = SYSCTL_ZERO,
> },
> #endif /* CONFIG_NUMA_BALANCING */
> -#ifdef CONFIG_CFS_CPULIMIT
> - {
> - .procname = "sched_vcpu_hotslice",
> - .data = &sysctl_sched_vcpu_hotslice,
> - .maxlen = sizeof(unsigned int),
> - .mode = 0644,
> - .proc_handler = proc_dointvec_minmax,
> - .extra1 = SYSCTL_ZERO,
> - },
> -#endif
> };
>
> static int __init sched_fair_sysctl_init(void)
> @@ -530,88 +516,6 @@ static int se_is_idle(struct sched_entity *se)
>
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -static int cfs_rq_active(struct cfs_rq *cfs_rq)
> -{
> - return cfs_rq->active;
> -}
> -
> -static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> - /* if we canceled delayed dec, there is no need to do inc */
> - if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
> - atomic_inc(&cfs_rq->tg->nr_cpus_active);
> - cfs_rq->active = 1;
> -}
> -
> -static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> - if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
> - postpone = 0;
> -
> - if (!postpone) {
> - cfs_rq->active = 0;
> - atomic_dec(&cfs_rq->tg->nr_cpus_active);
> - } else {
> - hrtimer_start_range_ns(&cfs_rq->active_timer,
> - ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
> - HRTIMER_MODE_REL_PINNED);
> - }
> -}
> -
> -static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
> -{
> - struct cfs_rq *cfs_rq =
> - container_of(timer, struct cfs_rq, active_timer);
> - struct rq *rq = rq_of(cfs_rq);
> - unsigned long flags;
> -
> - raw_spin_rq_lock_irqsave(rq, flags);
> - cfs_rq->active = !list_empty(&cfs_rq->tasks);
> - raw_spin_rq_unlock_irqrestore(rq, flags);
> -
> - atomic_dec(&cfs_rq->tg->nr_cpus_active);
> -
> - return HRTIMER_NORESTART;
> -}
> -
> -static int check_cpulimit_spread(struct task_group *tg, int target_cpu)
> -{
> - int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
> - int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
> -
> - nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
> - min_t(int, nr_cpus_limit, tg->nr_cpus) :
> - max_t(int, nr_cpus_limit, tg->nr_cpus);
> -
> - if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
> - return 1;
> -
> - if (nr_cpus_active > nr_cpus_limit)
> - return -1;
> -
> - return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
> -}
> -#else /* !CONFIG_CFS_CPULIMIT */
> -static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
> -{
> -}
> -
> -static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
> -{
> -}
> -
> -static inline enum hrtimer_restart sched_cfs_active_timer(struct hrtimer
> *timer)
> -{
> - return 0;
> -}
> -
> -static inline int check_cpulimit_spread(struct task_group *tg, int
> target_cpu)
> -{
> - return 1;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
> static __always_inline
> void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
>
> @@ -3771,9 +3675,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct
> sched_entity *se)
>
> account_numa_enqueue(rq, task_of(se));
> list_add(&se->group_node, &rq->cfs_tasks);
> -#ifdef CONFIG_CFS_CPULIMIT
> - list_add(&se->cfs_rq_node, &cfs_rq->tasks);
> -#endif
> }
> #endif
> cfs_rq->nr_running++;
> @@ -3789,9 +3690,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct
> sched_entity *se)
> if (entity_is_task(se)) {
> account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> list_del_init(&se->group_node);
> -#ifdef CONFIG_CFS_CPULIMIT
> - list_del(&se->cfs_rq_node);
> -#endif
> }
> #endif
> cfs_rq->nr_running--;
> @@ -5393,8 +5291,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> {
> bool curr = cfs_rq->curr == se;
>
> - if (!cfs_rq->load.weight)
> - inc_nr_active_cfs_rqs(cfs_rq);
> /*
> * If we're the current task, we must renormalise before calling
> * update_curr().
> @@ -5600,9 +5496,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct
> sched_entity *se, int flags)
> if (cfs_rq->nr_running == 0)
> update_idle_cfs_rq_clock_pelt(cfs_rq);
>
> - if (!cfs_rq->load.weight)
> - dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
> -
> return true;
> }
>
> @@ -6648,10 +6541,6 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> cfs_rq->runtime_enabled = 0;
> INIT_LIST_HEAD(&cfs_rq->throttled_list);
> INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
> -#ifdef CONFIG_CFS_CPULIMIT
> - hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> - cfs_rq->active_timer.function = sched_cfs_active_timer;
> -#endif
> }
>
> void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> @@ -7294,9 +7183,6 @@ static bool dequeue_task_fair(struct rq *rq, struct
> task_struct *p, int flags)
> static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
> static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
> static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
> -#ifdef CONFIG_CFS_CPULIMIT
> -static DEFINE_PER_CPU(struct balance_callback, cpulimit_cb_head);
> -#endif
>
> #ifdef CONFIG_NO_HZ_COMMON
>
> @@ -8656,38 +8542,6 @@ static int find_energy_efficient_cpu(struct
> task_struct *p, int prev_cpu)
> return target;
> }
>
> -static bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct task_group *tg;
> - struct sched_domain *sd;
> - int prev_cpu = task_cpu(p);
> - int cpu;
> -
> - tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, *new_cpu) > 0)
> - return false;
> -
> - if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
> - return true;
> -
> - if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
> - *new_cpu = prev_cpu;
> - return true;
> - }
> -
> - for_each_domain(*new_cpu, sd) {
> - for_each_cpu_and(cpu, sched_domain_span(sd), p->cpus_ptr) {
> - if (cfs_rq_active(tg->cfs_rq[cpu])) {
> - *new_cpu = cpu;
> - return true;
> - }
> - }
> - }
> -#endif
> - return false;
> -}
> -
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -8756,9 +8610,6 @@ select_task_rq_fair(struct task_struct *p, int
> prev_cpu, int wake_flags)
> break;
> }
>
> - if (select_runnable_cpu(p, &new_cpu))
> - goto unlock;
> -
> if (unlikely(sd)) {
> /* Slow path */
> new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu,
> sd_flag);
> @@ -8766,7 +8617,6 @@ select_task_rq_fair(struct task_struct *p, int
> prev_cpu, int wake_flags)
> /* Fast path */
> new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
> }
> -unlock:
> rcu_read_unlock();
>
> return new_cpu;
> @@ -8992,51 +8842,6 @@ static struct task_struct *pick_task_fair(struct rq
> *rq)
> static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool
> first);
> static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool
> first);
>
> -#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
> -static int cpulimit_balance_cpu_stop(void *data);
> -
> -static void trigger_cpulimit_balance(struct rq *this_rq)
> -{
> - struct task_struct *p = this_rq->curr;
> - struct task_group *tg;
> - int this_cpu, cpu, target_cpu = -1;
> - struct sched_domain *sd;
> -
> - this_cpu = cpu_of(this_rq);
> -
> - if (!p->se.on_rq || this_rq->active_balance)
> - return;
> -
> - tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, this_cpu) >= 0)
> - return;
> -
> - rcu_read_lock();
> - for_each_domain(this_cpu, sd) {
> - for_each_cpu_and(cpu, sched_domain_span(sd),
> - p->cpus_ptr) {
> - if (cpu != this_cpu &&
> - cfs_rq_active(tg->cfs_rq[cpu])) {
> - target_cpu = cpu;
> - goto unlock;
> - }
> - }
> - }
> -unlock:
> - rcu_read_unlock();
> -
> - if (target_cpu >= 0) {
> - this_rq->active_balance = 1;
> - this_rq->push_cpu = target_cpu;
> - raw_spin_rq_unlock(this_rq);
> - stop_one_cpu_nowait(this_rq->cpu,
> - cpulimit_balance_cpu_stop, this_rq,
> - &this_rq->active_balance_work);
> - raw_spin_rq_lock(this_rq);
> - }
> -}
> -#endif
> -
> struct task_struct *
> pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags
> *rf)
> {
> @@ -9091,20 +8896,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct
> *prev, struct rq_flags *rf
> __set_next_task_fair(rq, p, true);
> }
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
> trigger_cpulimit_balance);
> -#endif
> -
> return p;
>
> simple:
> #endif
> put_prev_set_next_task(rq, prev, p);
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - queue_balance_callback(rq, &per_cpu(cpulimit_cb_head, rq->cpu),
> trigger_cpulimit_balance);
> -#endif
> -
> return p;
>
> idle:
> @@ -9529,37 +9326,6 @@ static inline int migrate_degrades_locality(struct
> task_struct *p,
> }
> #endif
>
> -static int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env
> *env)
> -{
> -#ifdef CONFIG_CFS_CPULIMIT
> - struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
> -
> - if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
> - int cpu;
> -
> - schedstat_inc(p->stats.nr_failed_migrations_cpulimit);
> -
> - env->flags |= LBF_SOME_PINNED;
> -
> - if (check_cpulimit_spread(tg, env->src_cpu) != 0)
> - return 0;
> -
> - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
> - return 0;
> -
> - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
> - if (cfs_rq_active(tg->cfs_rq[cpu])) {
> - env->flags |= LBF_DST_PINNED;
> - env->new_dst_cpu = cpu;
> - break;
> - }
> - }
> - return 0;
> - }
> -#endif
> - return 1;
> -}
> -
> /*
> * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
> */
> @@ -9570,8 +9336,6 @@ int can_migrate_task(struct task_struct *p, struct
> lb_env *env)
>
> lockdep_assert_rq_held(env->src_rq);
>
> - if (!can_migrate_task_cpulimit(p, env))
> - return 0;
> /*
> * We do not migrate tasks that are:
> * 1) throttled_lb_pair, or
> @@ -9935,161 +9699,6 @@ static inline void update_blocked_load_tick(struct rq
> *rq) {}
> static inline void update_blocked_load_status(struct rq *rq, bool
> has_blocked) {}
> #endif
>
> -#ifdef CONFIG_CFS_CPULIMIT
> -static unsigned long entity_h_load(struct sched_entity *se);
> -
> -static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> - struct sched_entity *se;
> - struct task_struct *p;
> -
> - list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
> - p = task_of(se);
> - if (task_curr(p) ||
> - !cpumask_test_cpu(env->dst_cpu, p->cpus_ptr))
> - return 0;
> - }
> - env->flags &= ~LBF_ALL_PINNED;
> - return 1;
> -}
> -
> -static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
> -{
> - struct sched_entity *se, *tmp;
> - int moved = 0;
> -
> - list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
> - struct task_struct *p = task_of(se);
> - detach_task(p, env);
> - attach_task(env->dst_rq, p);
> - moved++;
> - }
> - return moved;
> -}
> -
> -static int move_task_groups(struct lb_env *env)
> -{
> - struct cfs_rq *cfs_rq, *pos;
> - struct task_group *tg;
> - unsigned long load;
> - int cur_pulled, pulled = 0;
> -
> - if (env->imbalance <= 0)
> - return 0;
> -
> - for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> - if (cfs_rq->tg == &root_task_group)
> - continue;
> - /*
> - * A child always goes before its parent in a leaf_cfs_rq_list.
> - * Therefore, if we encounter a cfs_rq that has a child cfs_rq,
> - * we could not migrate the child and therefore we should not
> - * even try to migrate the parent.
> - */
> - if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> - continue;
> -
> - tg = cfs_rq->tg->topmost_limited_ancestor;
> -
> - if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
> - cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
> - continue;
> -
> - load = entity_h_load(tg->se[env->src_cpu]);
> - if ((load / 2) > env->imbalance)
> - continue;
> -
> - if (!can_migrate_task_group(cfs_rq, env))
> - continue;
> -
> - cur_pulled = move_task_group(cfs_rq, env);
> - pulled += cur_pulled;
> - env->imbalance -= load;
> -
> - env->loop += cur_pulled;
> - if (env->loop > env->loop_max)
> - break;
> -
> - if (env->imbalance <= 0)
> - break;
> - }
> - return pulled;
> -}
> -
> -static int do_cpulimit_balance(struct lb_env *env)
> -{
> - struct cfs_rq *cfs_rq, *pos;
> - struct task_group *tg;
> - int pushed = 0;
> -
> - for_each_leaf_cfs_rq_safe(env->src_rq, cfs_rq, pos) {
> - if (cfs_rq->tg == &root_task_group)
> - continue;
> - /* see move_task_groups for why we skip such groups */
> - if (cfs_rq->nr_running != cfs_rq->h_nr_running)
> - continue;
> - tg = cfs_rq->tg->topmost_limited_ancestor;
> - if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
> - cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
> - can_migrate_task_group(cfs_rq, env))
> - pushed += move_task_group(cfs_rq, env);
> - }
> - return pushed;
> -}
> -
> -static int cpulimit_balance_cpu_stop(void *data)
> -{
> - struct rq *rq = data;
> - int cpu = cpu_of(rq);
> - int target_cpu = rq->push_cpu;
> - struct rq *target_rq = cpu_rq(target_cpu);
> - struct sched_domain *sd;
> -
> - raw_spin_rq_lock_irq(rq);
> -
> - if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
> - !cpu_online(target_cpu)))
> - goto out_unlock;
> -
> - if (unlikely(!rq->nr_running))
> - goto out_unlock;
> -
> - BUG_ON(rq == target_rq);
> -
> - double_lock_balance(rq, target_rq);
> - rcu_read_lock();
> - for_each_domain(target_cpu, sd) {
> - if (cpumask_test_cpu(cpu, sched_domain_span(sd)))
> - break;
> - }
> - if (likely(sd)) {
> - struct lb_env env = {
> - .sd = sd,
> - .dst_cpu = target_cpu,
> - .dst_rq = target_rq,
> - .src_cpu = cpu,
> - .src_rq = rq,
> - };
> -
> - schedstat_inc(sd->clb_count);
> -
> - update_rq_clock(rq);
> - update_rq_clock(target_rq);
> - if (do_cpulimit_balance(&env))
> - schedstat_inc(sd->clb_pushed);
> - else
> - schedstat_inc(sd->clb_failed);
> - }
> - rcu_read_unlock();
> - double_unlock_balance(rq, target_rq);
> -
> -out_unlock:
> - rq->active_balance = 0;
> - raw_spin_rq_unlock_irq(rq);
> - return 0;
> -}
> -#endif /* CONFIG_CFS_CPULIMIT */
> -
> static bool __update_blocked_others(struct rq *rq, bool *done)
> {
> bool updated;
> @@ -12126,20 +11735,6 @@ static int sched_balance_rq(int this_cpu, struct rq
> *this_rq,
>
> local_irq_restore(rf.flags);
>
> -#ifdef CONFIG_CFS_CPULIMIT
> - if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
> - env.loop = 0;
> - local_irq_save(rf.flags);
> - double_rq_lock(env.dst_rq, busiest);
> - rq_repin_lock(busiest, &rf);
> - update_rq_clock(env.dst_rq);
> - cur_ld_moved = ld_moved = move_task_groups(&env);
> - rq_unpin_lock(busiest, &rf);
> - double_rq_unlock(env.dst_rq, busiest);
> - local_irq_restore(rf.flags);
> - }
> -#endif
> -
> if (env.flags & LBF_NEED_BREAK) {
> env.flags &= ~LBF_NEED_BREAK;
> goto more_balance;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 0d3ff6958199..b4ad44e3a3ab 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -515,9 +515,6 @@ struct task_group {
> #define MAX_CPU_RATE 1024
> unsigned long cpu_rate;
> unsigned int nr_cpus;
> - atomic_t nr_cpus_active;
> - struct task_group *topmost_limited_ancestor; /* self if none of the
> - ancestors is limited */
> #endif
> };
>
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel