Hi people, after a long time of silence, here's a second iteration of the patch. I've addressed a few of the concerns voiced here:

* Process lookup and friends now run in O(log n). I achieved that by using RB-trees as priority queues, since all of the relevant operations (insert, remove, minimum lookup) are O(log n) on them.
* The algorithm for calculating a new deadline for a given process has been simplified and is now documented a bit better. It also derives the deadline offset from the value of hz (via rrticks_init), as suggested by Miod (?).
* CPU rlimits are honored again. The relevant code did not change; the new patch simply no longer removes rlimit enforcement.
* Timeslices are now 20ms long instead of 10ms. This solves the issue of 0ms timeslices on machines with hz < 100.

(Two small userland sketches illustrating the RB-tree ordering and the deadline arithmetic follow below the patch.)

With the recent improvements in the mainline scheduler, and especially in rthreads, the performance of the patched scheduler and mainline is now roughly similar, at least as far as throughput is concerned. I have the feeling that the system behaves "snappier" with my patch, but that might be some sort of placebo effect; I haven't yet come up with a reliable way to benchmark interactivity other than actually using the machine and doing things on it. It is interesting to note, however, that the patched scheduler reaches performance similar to the default one without any of the machinery for estimating how expensive it is to move a process from one CPU to another, or the related methods for preserving cache locality. I use the patched scheduler exclusively on my Core2Duo machine with an MP build.

The balance of removed versus added lines has shifted towards "more added lines", but the patch still removes 173 lines more than it adds.

Once again, comments, rants, insults, everything is welcome :)

--
Gregor Best

Index: sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v retrieving revision 1.149 diff -u -r1.149 proc.h --- sys/proc.h 7 Jan 2012 05:38:12 -0000 1.149 +++ sys/proc.h 17 Jan 2012 16:10:09 -0000 @@ -43,6 +43,7 @@ #include <machine/proc.h> /* Machine-dependent proc substruct. */ #include <sys/selinfo.h> /* For struct selinfo */ #include <sys/queue.h> +#include <sys/tree.h> #include <sys/timeout.h> /* For struct timeout */ #include <sys/event.h> /* For struct klist */ #include <sys/mutex.h> /* For struct mutex */ @@ -210,7 +211,9 @@ #define PS_SINGLEUNWIND _P_SINGLEUNWIND struct proc { - TAILQ_ENTRY(proc) p_runq; + RB_ENTRY(proc) p_runq; + RB_ENTRY(proc) p_expq; + TAILQ_ENTRY(proc) p_slpq; LIST_ENTRY(proc) p_list; /* List of all processes. */ struct process *p_p; /* The process of this thread. */ @@ -251,6 +254,9 @@ #endif /* scheduling */ + struct timeval p_deadline; /* virtual deadline used for scheduling */ + struct timeval p_deadline_set; /* time at which the deadline was set */ + int p_rrticks; /* number of ticks this process is allowed to stay on a processor */ u_int p_estcpu; /* Time averaged value of p_cpticks. */ int p_cpticks; /* Ticks of cpu time. */ fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ Index: sys/sched.h =================================================================== RCS file: /cvs/src/sys/sys/sched.h,v retrieving revision 1.30 diff -u -r1.30 sched.h --- sys/sched.h 16 Nov 2011 20:50:19 -0000 1.30 +++ sys/sched.h 17 Jan 2012 16:10:09 -0000 @@ -87,8 +87,6 @@ #define CP_IDLE 4 #define CPUSTATES 5 -#define SCHED_NQS 32 /* 32 run queues. */ - /* * Per-CPU scheduler state. * XXX - expose to userland for now. 
@@ -99,7 +97,6 @@ u_int spc_schedticks; /* ticks for schedclock() */ u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */ u_char spc_curpriority; /* usrpri of curproc */ - int spc_rrticks; /* ticks until roundrobin() */ int spc_pscnt; /* prof/stat counter */ int spc_psdiv; /* prof/stat divisor */ struct proc *spc_idleproc; /* idle proc for this cpu */ @@ -107,9 +104,6 @@ u_int spc_nrun; /* procs on the run queues */ fixpt_t spc_ldavg; /* shortest load avg. for this cpu */ - TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS]; - volatile uint32_t spc_whichqs; - #ifdef notyet struct proc *spc_reaper; /* dead proc reaper */ #endif @@ -119,18 +113,16 @@ #ifdef _KERNEL /* spc_flags */ -#define SPCF_SEENRR 0x0001 /* process has seen roundrobin() */ -#define SPCF_SHOULDYIELD 0x0002 /* process should yield the CPU */ -#define SPCF_SWITCHCLEAR (SPCF_SEENRR|SPCF_SHOULDYIELD) -#define SPCF_SHOULDHALT 0x0004 /* CPU should be vacated */ -#define SPCF_HALTED 0x0008 /* CPU has been halted */ +#define SPCF_SHOULDYIELD 0x0001 /* process should yield the CPU */ +#define SPCF_SHOULDHALT 0x0002 /* CPU should be vacated */ +#define SPCF_HALTED 0x0004 /* CPU has been halted */ -#define SCHED_PPQ (128 / SCHED_NQS) /* priorities per queue */ #define NICE_WEIGHT 2 /* priorities per nice level */ -#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ) +#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX) extern int schedhz; /* ideally: 16 */ extern int rrticks_init; /* ticks per roundrobin() */ +extern struct cpuset sched_idle_cpus; struct proc; void schedclock(struct proc *); @@ -147,18 +139,20 @@ void cpu_switchto(struct proc *, struct proc *); struct proc *sched_chooseproc(void); struct cpu_info *sched_choosecpu(struct proc *); -struct cpu_info *sched_choosecpu_fork(struct proc *parent, int); +struct cpu_info *sched_choosecpu_fork(struct proc *parent); void cpu_idle_enter(void); void cpu_idle_cycle(void); void cpu_idle_leave(void); void sched_peg_curproc(struct cpu_info *ci); +void generate_deadline(struct proc *, char); + #ifdef MULTIPROCESSOR void sched_start_secondary_cpus(void); void sched_stop_secondary_cpus(void); #endif -#define curcpu_is_idle() (curcpu()->ci_schedstate.spc_whichqs == 0) +#define curcpu_is_idle() (cpuset_isset(&sched_idle_cpus, curcpu())) void sched_init_runqueues(void); void setrunqueue(struct proc *); Index: kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v retrieving revision 1.72 diff -u -r1.72 kern_clock.c --- kern/kern_clock.c 7 Mar 2011 07:07:13 -0000 1.72 +++ kern/kern_clock.c 17 Jan 2012 16:10:10 -0000 @@ -241,7 +241,11 @@ if (stathz == 0) statclock(frame); - if (--ci->ci_schedstate.spc_rrticks <= 0) + /* + * If the currently running process went over its round robin tick quota, + * preempt it. 
+ */ + if (p && (--(p->p_rrticks) <= 0)) roundrobin(ci); /* Index: kern/kern_fork.c =================================================================== RCS file: /cvs/src/sys/kern/kern_fork.c,v retrieving revision 1.133 diff -u -r1.133 kern_fork.c --- kern/kern_fork.c 14 Dec 2011 07:32:16 -0000 1.133 +++ kern/kern_fork.c 17 Jan 2012 16:10:10 -0000 @@ -225,6 +225,9 @@ atomic_setbits_int(&pr->ps_flags, PS_CONTROLT); p->p_p = pr; + + /* Pretend we preempted this new process */ + generate_deadline(p, 0); } /* print the 'table full' message once per 10 seconds */ @@ -485,7 +488,7 @@ getmicrotime(&p->p_stats->p_start); p->p_acflag = AFORK; p->p_stat = SRUN; - p->p_cpu = sched_choosecpu_fork(curp, flags); + p->p_cpu = sched_choosecpu_fork(curp); setrunqueue(p); SCHED_UNLOCK(s); Index: kern/kern_proc.c =================================================================== RCS file: /cvs/src/sys/kern/kern_proc.c,v retrieving revision 1.47 diff -u -r1.47 kern_proc.c --- kern/kern_proc.c 18 Sep 2011 23:20:54 -0000 1.47 +++ kern/kern_proc.c 17 Jan 2012 16:10:10 -0000 @@ -398,8 +398,6 @@ p->p_comm, p->p_pid, pst, p->p_flag, P_BITS); (*pr)(" pri=%u, usrpri=%u, nice=%d\n", p->p_priority, p->p_usrpri, p->p_p->ps_nice); - (*pr)(" forw=%p, list=%p,%p\n", - TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev); (*pr)(" process=%p user=%p, vmspace=%p\n", p->p_p, p->p_addr, p->p_vmspace); (*pr)(" estcpu=%u, cpticks=%d, pctcpu=%u.%u, swtime=%u\n", Index: kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v retrieving revision 1.24 diff -u -r1.24 kern_sched.c --- kern/kern_sched.c 12 Oct 2011 18:30:09 -0000 1.24 +++ kern/kern_sched.c 17 Jan 2012 16:10:10 -0000 @@ -19,6 +19,7 @@ #include <sys/sched.h> #include <sys/proc.h> +#include <sys/tree.h> #include <sys/kthread.h> #include <sys/systm.h> #include <sys/resourcevar.h> @@ -32,9 +33,11 @@ void sched_kthreads_create(void *); -int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); struct proc *sched_steal_proc(struct cpu_info *); +RB_HEAD(runqueue, proc) sched_runqueue; +RB_HEAD(expqueue, proc) sched_expqueue; + /* * To help choosing which cpu should run which process we keep track * of cpus which are currently idle and which cpus have processes @@ -61,6 +64,27 @@ * interrupts during the context switch. */ +static int +sched_cmp_proc(struct proc *a, struct proc *b) { + if (a == b) + return 0; + if (timercmp(&(a->p_deadline), &(b->p_deadline), <)) + return -1; + return 1; +} + +static int +sched_cmp_proc_exp(struct proc *a, struct proc *b) { + if (a == b) + return 0; + if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <)) + return -1; + return 1; +} + +RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc); +RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp); + /* * sched_init_cpu is called from main() for the boot cpu, then it's the * responsibility of the MD code to call it for all other cpus. 
@@ -69,10 +93,6 @@ sched_init_cpu(struct cpu_info *ci) { struct schedstate_percpu *spc = &ci->ci_schedstate; - int i; - - for (i = 0; i < SCHED_NQS; i++) - TAILQ_INIT(&spc->spc_qs[i]); spc->spc_idleproc = NULL; @@ -106,6 +126,7 @@ { struct schedstate_percpu *spc; struct proc *p = curproc; + struct proc *dead; struct cpu_info *ci = v; int s; @@ -120,7 +141,6 @@ SCHED_LOCK(s); cpuset_add(&sched_idle_cpus, ci); p->p_stat = SSLEEP; - p->p_cpu = ci; atomic_setbits_int(&p->p_flag, P_CPUPEG); mi_switch(); cpuset_del(&sched_idle_cpus, ci); @@ -130,38 +150,27 @@ KASSERT(curproc == spc->spc_idleproc); while (1) { - while (!curcpu_is_idle()) { - struct proc *dead; - - SCHED_LOCK(s); - p->p_stat = SSLEEP; - mi_switch(); - SCHED_UNLOCK(s); - - while ((dead = LIST_FIRST(&spc->spc_deadproc))) { + while ((dead = LIST_FIRST(&spc->spc_deadproc))) { LIST_REMOVE(dead, p_hash); exit2(dead); - } } + if ((spc->spc_schedflags & SPCF_SHOULDHALT) && !(spc->spc_schedflags & SPCF_HALTED)) + atomic_setbits_int(&spc->spc_schedflags, SPCF_HALTED); splassert(IPL_NONE); cpuset_add(&sched_idle_cpus, ci); + cpu_idle_enter(); - while (spc->spc_whichqs == 0) { - if (spc->spc_schedflags & SPCF_SHOULDHALT && - (spc->spc_schedflags & SPCF_HALTED) == 0) { - cpuset_del(&sched_idle_cpus, ci); - SCHED_LOCK(s); - atomic_setbits_int(&spc->spc_schedflags, - spc->spc_whichqs ? 0 : SPCF_HALTED); - SCHED_UNLOCK(s); - wakeup(spc); - } - cpu_idle_cycle(); - } + cpu_idle_cycle(); cpu_idle_leave(); + cpuset_del(&sched_idle_cpus, ci); + + SCHED_LOCK(s); + p->p_stat = SSLEEP; + mi_switch(); + SCHED_UNLOCK(s); } } @@ -206,63 +215,35 @@ void sched_init_runqueues(void) { + RB_INIT(&sched_runqueue); + RB_INIT(&sched_expqueue); } void setrunqueue(struct proc *p) { - struct schedstate_percpu *spc; - int queue = p->p_priority >> 2; - SCHED_ASSERT_LOCKED(); - spc = &p->p_cpu->ci_schedstate; - spc->spc_nrun++; - - TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); - spc->spc_whichqs |= (1 << queue); - cpuset_add(&sched_queued_cpus, p->p_cpu); - - if (cpuset_isset(&sched_idle_cpus, p->p_cpu)) - cpu_unidle(p->p_cpu); + RB_INSERT(runqueue, &sched_runqueue, p); } void remrunqueue(struct proc *p) { - struct schedstate_percpu *spc; - int queue = p->p_priority >> 2; - SCHED_ASSERT_LOCKED(); - spc = &p->p_cpu->ci_schedstate; - spc->spc_nrun--; - - TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq); - if (TAILQ_EMPTY(&spc->spc_qs[queue])) { - spc->spc_whichqs &= ~(1 << queue); - if (spc->spc_whichqs == 0) - cpuset_del(&sched_queued_cpus, p->p_cpu); - } + if (!(RB_REMOVE(runqueue, &sched_runqueue, p))) + RB_REMOVE(expqueue, &sched_expqueue, p); } struct proc * sched_chooseproc(void) { struct schedstate_percpu *spc = &curcpu()->ci_schedstate; - struct proc *p; - int queue; + struct proc *p = NULL; + struct timeval now; SCHED_ASSERT_LOCKED(); if (spc->spc_schedflags & SPCF_SHOULDHALT) { - if (spc->spc_whichqs) { - for (queue = 0; queue < SCHED_NQS; queue++) { - TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { - remrunqueue(p); - p->p_cpu = sched_choosecpu(p); - setrunqueue(p); - } - } - } p = spc->spc_idleproc; KASSERT(p); KASSERT(p->p_wchan == NULL); @@ -270,16 +251,30 @@ return (p); } -again: - if (spc->spc_whichqs) { - queue = ffs(spc->spc_whichqs) - 1; - p = TAILQ_FIRST(&spc->spc_qs[queue]); - remrunqueue(p); - KASSERT(p->p_stat == SRUN); - } else if ((p = sched_steal_proc(curcpu())) == NULL) { - p = spc->spc_idleproc; - if (p == NULL) { - int s; + if ((!RB_EMPTY(&sched_runqueue)) || (!RB_EMPTY(&sched_expqueue))) { + /* + * Move expired processes to the expired 
runqueue where they are sorted by the time they got their + * deadline set instead of the deadline itself. + * */ + microuptime(&now); + while(!RB_EMPTY(&sched_runqueue)) { + p = RB_MIN(runqueue, &sched_runqueue); + if (!p) break; + if (!timercmp(&(p->p_deadline), &now, <)) break; + RB_INSERT(expqueue, &sched_expqueue, p); + RB_REMOVE(runqueue, &sched_runqueue, p); + } + + if (!RB_EMPTY(&sched_expqueue)) { + p = RB_MIN(expqueue, &sched_expqueue); + } else { + p = RB_MIN(runqueue, &sched_runqueue); + } + if (p) + remrunqueue(p); + } else { + while ((p = spc->spc_idleproc) == NULL) { + int s; /* * We get here if someone decides to switch during * boot before forking kthreads, bleh. @@ -291,13 +286,15 @@ spl0(); delay(10); SCHED_LOCK(s); - goto again; - } + } KASSERT(p); + } + + if (p) { p->p_stat = SRUN; + p->p_cpu = curcpu(); } - KASSERT(p->p_wchan == NULL); return (p); } @@ -310,30 +307,13 @@ uint64_t sched_nomigrations; struct cpu_info * -sched_choosecpu_fork(struct proc *parent, int flags) +sched_choosecpu_fork(struct proc *parent) { struct cpu_info *choice = NULL; fixpt_t load, best_load = ~0; - int run, best_run = INT_MAX; struct cpu_info *ci; struct cpuset set; -#if 0 - /* - * XXX - * Don't do this until we have a painless way to move the cpu in exec. - * Preferably when nuking the old pmap and getting a new one on a - * new cpu. - */ - /* - * PPWAIT forks are simple. We know that the parent will not - * run until we exec and choose another cpu, so we just steal its - * cpu. - */ - if (flags & FORK_PPWAIT) - return (parent->p_cpu); -#endif - /* * Look at all cpus that are currently idle and have nothing queued. * If there are none, pick the one with least queued procs first, @@ -347,13 +327,10 @@ cpuset_del(&set, ci); load = ci->ci_schedstate.spc_ldavg; - run = ci->ci_schedstate.spc_nrun; - if (choice == NULL || run < best_run || - (run == best_run &&load < best_load)) { + if (choice == NULL || load < best_load) { choice = ci; best_load = load; - best_run = run; } } @@ -364,9 +341,6 @@ sched_choosecpu(struct proc *p) { struct cpu_info *choice = NULL; - int last_cost = INT_MAX; - struct cpu_info *ci; - struct cpuset set; /* * If pegged to a cpu, don't allow it to move. @@ -374,169 +348,19 @@ if (p->p_flag & P_CPUPEG) return (p->p_cpu); - sched_choose++; - - /* - * Look at all cpus that are currently idle and have nothing queued. - * If there are none, pick the cheapest of those. - * (idle + queued could mean that the cpu is handling an interrupt - * at this moment and haven't had time to leave idle yet). - */ - cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus); - /* - * First, just check if our current cpu is in that set, if it is, - * this is simple. - * Also, our cpu might not be idle, but if it's the current cpu - * and it has nothing else queued and we're curproc, take it. + * Else, just pretend we forked a new process. */ - if (cpuset_isset(&set, p->p_cpu) || - (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 && - curproc == p)) { - sched_wasidle++; - return (p->p_cpu); - } - - if (cpuset_first(&set) == NULL) - cpuset_copy(&set, &sched_all_cpus); - - while ((ci = cpuset_first(&set)) != NULL) { - int cost = sched_proc_to_cpu_cost(ci, p); + sched_choose++; - if (choice == NULL || cost < last_cost) { - choice = ci; - last_cost = cost; - } - cpuset_del(&set, ci); - } + choice = sched_choosecpu_fork(p); if (p->p_cpu != choice) sched_nmigrations++; else sched_nomigrations++; - return (choice); -} - -/* - * Attempt to steal a proc from some cpu. 
- */ -struct proc * -sched_steal_proc(struct cpu_info *self) -{ - struct schedstate_percpu *spc; - struct proc *best = NULL; - int bestcost = INT_MAX; - struct cpu_info *ci; - struct cpuset set; - - cpuset_copy(&set, &sched_queued_cpus); - - while ((ci = cpuset_first(&set)) != NULL) { - struct proc *p; - int queue; - int cost; - - cpuset_del(&set, ci); - - spc = &ci->ci_schedstate; - - queue = ffs(spc->spc_whichqs) - 1; - TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) { - if (p->p_flag & P_CPUPEG) - continue; - - cost = sched_proc_to_cpu_cost(self, p); - - if (best == NULL || cost < bestcost) { - best = p; - bestcost = cost; - } - } - } - if (best == NULL) - return (NULL); - - spc = &best->p_cpu->ci_schedstate; - remrunqueue(best); - best->p_cpu = self; - - sched_stolen++; - - return (best); -} - -/* - * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know). - */ -static int -log2(unsigned int i) -{ - int ret = 0; - - while (i >>= 1) - ret++; - - return (ret); -} - -/* - * Calculate the cost of moving the proc to this cpu. - * - * What we want is some guesstimate of how much "performance" it will - * cost us to move the proc here. Not just for caches and TLBs and NUMA - * memory, but also for the proc itself. A highly loaded cpu might not - * be the best candidate for this proc since it won't get run. - * - * Just total guesstimates for now. - */ - -int sched_cost_load = 1; -int sched_cost_priority = 1; -int sched_cost_runnable = 3; -int sched_cost_resident = 1; - -int -sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p) -{ - struct schedstate_percpu *spc; - int l2resident = 0; - int cost; - - spc = &ci->ci_schedstate; - - cost = 0; - - /* - * First, account for the priority of the proc we want to move. - * More willing to move, the lower the priority of the destination - * and the higher the priority of the proc. - */ - if (!cpuset_isset(&sched_idle_cpus, ci)) { - cost += (p->p_priority - spc->spc_curpriority) * - sched_cost_priority; - cost += sched_cost_runnable; - } - if (cpuset_isset(&sched_queued_cpus, ci)) { - cost += spc->spc_nrun * sched_cost_runnable; - } - - /* - * Higher load on the destination means we don't want to go there. - */ - cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT); - - /* - * If the proc is on this cpu already, lower the cost by how much - * it has been running and an estimate of its footprint. - */ - if (p->p_cpu == ci && p->p_slptime == 0) { - l2resident = - log2(pmap_resident_count(p->p_vmspace->vm_map.pmap)); - cost -= l2resident * sched_cost_resident; - } - - return (cost); + return choice; } /* @@ -560,7 +384,6 @@ } #ifdef MULTIPROCESSOR - void sched_start_secondary_cpus(void) { @@ -583,6 +406,9 @@ { CPU_INFO_ITERATOR cii; struct cpu_info *ci; + int s; + + SCHED_LOCK(s); /* * Make sure we stop the secondary CPUs. 
@@ -601,14 +427,17 @@ if (CPU_IS_PRIMARY(ci)) continue; - while ((spc->spc_schedflags & SPCF_HALTED) == 0) { + + SCHED_UNLOCK(s); + while (!(spc->spc_schedflags & SPCF_HALTED)) { sleep_setup(&sls, spc, PZERO, "schedstate"); - sleep_finish(&sls, - (spc->spc_schedflags & SPCF_HALTED) == 0); + sleep_setup_timeout(&sls, 10); + sleep_finish(&sls, 1); } + SCHED_LOCK(s); } + SCHED_UNLOCK(s); } - #endif /* Index: kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v retrieving revision 1.99 diff -u -r1.99 kern_synch.c --- kern/kern_synch.c 17 Jan 2012 02:34:18 -0000 1.99 +++ kern/kern_synch.c 17 Jan 2012 16:10:10 -0000 @@ -205,7 +205,7 @@ p->p_wmesg = wmesg; p->p_slptime = 0; p->p_priority = prio & PRIMASK; - TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq); + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq); } void @@ -342,7 +342,7 @@ unsleep(struct proc *p) { if (p->p_wchan) { - TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq); + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq); p->p_wchan = NULL; } } @@ -361,7 +361,7 @@ SCHED_LOCK(s); qp = &slpque[LOOKUP(ident)]; for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) { - pnext = TAILQ_NEXT(p, p_runq); + pnext = TAILQ_NEXT(p, p_slpq); #ifdef DIAGNOSTIC if (p->p_stat != SSLEEP && p->p_stat != SSTOP) panic("wakeup: p_stat is %d", (int)p->p_stat); @@ -369,7 +369,7 @@ if (p->p_wchan == ident) { --n; p->p_wchan = 0; - TAILQ_REMOVE(qp, p, p_runq); + TAILQ_REMOVE(qp, p, p_slpq); if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ if (p->p_slptime > 1) Index: kern/sched_bsd.c =================================================================== RCS file: /cvs/src/sys/kern/sched_bsd.c,v retrieving revision 1.27 diff -u -r1.27 sched_bsd.c --- kern/sched_bsd.c 7 Jul 2011 18:00:33 -0000 1.27 +++ kern/sched_bsd.c 17 Jan 2012 16:10:10 -0000 @@ -77,37 +77,23 @@ timeout_set(&schedcpu_to, schedcpu, &schedcpu_to); - rrticks_init = hz / 10; + rrticks_init = hz / 50; schedcpu(&schedcpu_to); } /* - * Force switch among equal priority processes every 100ms. + * Force switch among equal priority processes every 20ms. */ void roundrobin(struct cpu_info *ci) { struct schedstate_percpu *spc = &ci->ci_schedstate; - spc->spc_rrticks = rrticks_init; - if (ci->ci_curproc != NULL) { - if (spc->spc_schedflags & SPCF_SEENRR) { - /* - * The process has already been through a roundrobin - * without switching and may be hogging the CPU. - * Indicate that the process should yield. - */ - atomic_setbits_int(&spc->spc_schedflags, - SPCF_SHOULDYIELD); - } else { - atomic_setbits_int(&spc->spc_schedflags, - SPCF_SEENRR); - } + atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD); } - if (spc->spc_nrun) - need_resched(ci); + need_resched(ci); } /* @@ -204,7 +190,6 @@ struct timeout *to = (struct timeout *)arg; fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct proc *p; - int s; unsigned int newcpu; int phz; @@ -233,7 +218,6 @@ */ if (p->p_slptime > 1) continue; - SCHED_LOCK(s); /* * p_pctcpu is only for ps. 
*/ @@ -250,17 +234,6 @@ newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu); p->p_estcpu = newcpu; resetpriority(p); - if (p->p_priority >= PUSER) { - if (p->p_stat == SRUN && - (p->p_priority / SCHED_PPQ) != - (p->p_usrpri / SCHED_PPQ)) { - remrunqueue(p); - p->p_priority = p->p_usrpri; - setrunqueue(p); - } else - p->p_priority = p->p_usrpri; - } - SCHED_UNLOCK(s); } uvm_meter(); wakeup(&lbolt); @@ -278,8 +251,6 @@ unsigned int newcpu = p->p_estcpu; fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); - SCHED_ASSERT_LOCKED(); - if (p->p_slptime > 5 * loadfac) p->p_estcpu = 0; else { @@ -304,6 +275,7 @@ SCHED_LOCK(s); p->p_priority = p->p_usrpri; p->p_stat = SRUN; + generate_deadline(p, 1); setrunqueue(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); @@ -332,6 +304,7 @@ p->p_priority = p->p_usrpri; p->p_stat = SRUN; p->p_cpu = sched_choosecpu(p); + generate_deadline(p, 0); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); @@ -372,14 +345,7 @@ * process was running, and add that to its total so far. */ microuptime(&tv); - if (timercmp(&tv, &spc->spc_runtime, <)) { -#if 0 - printf("uptime is not monotonic! " - "tv=%lu.%06lu, runtime=%lu.%06lu\n", - tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec, - spc->spc_runtime.tv_usec); -#endif - } else { + if (timercmp(&tv, &spc->spc_runtime, >=)) { timersub(&tv, &spc->spc_runtime, &tv); timeradd(&p->p_rtime, &tv, &p->p_rtime); } @@ -403,7 +369,7 @@ * Process is about to yield the CPU; clear the appropriate * scheduling flags. */ - atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR); + atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD); nextproc = sched_chooseproc(); @@ -435,9 +401,8 @@ * be running on a new CPU now, so don't use the cache'd * schedstate_percpu pointer. */ - KASSERT(p->p_cpu == curcpu()); - - microuptime(&p->p_cpu->ci_schedstate.spc_runtime); + if (p->p_cpu == curcpu()) + microuptime(&p->p_cpu->ci_schedstate.spc_runtime); #ifdef MULTIPROCESSOR /* @@ -515,30 +480,56 @@ } p->p_stat = SRUN; p->p_cpu = sched_choosecpu(p); - setrunqueue(p); if (p->p_slptime > 1) updatepri(p); + setrunqueue(p); p->p_slptime = 0; resched_proc(p, p->p_priority); } /* * Compute the priority of a process when running in user mode. - * Arrange to reschedule if the resulting priority is better - * than that of the current process. */ void resetpriority(struct proc *p) { - unsigned int newpriority; - - SCHED_ASSERT_LOCKED(); + p->p_usrpri = min(((NZERO + (p->p_p->ps_nice))) << 1, MAXPRI); +} - newpriority = PUSER + p->p_estcpu + - NICE_WEIGHT * (p->p_p->ps_nice - NZERO); - newpriority = min(newpriority, MAXPRI); - p->p_usrpri = newpriority; - resched_proc(p, p->p_usrpri); +/* + * Generate a new virtual deadline for a process. The deadline is a soft + * one and has no purpose besides being used for choosing the next process + * to run. Also resets the number of round robin ticks available to the + * process if the previous timeslice expired and the process had to be preempted. + */ +void +generate_deadline(struct proc *p, char voluntary) { + /* + * For nice values between 0 and 39 inclusively, the offset lies between + * 32 and 1280 milliseconds for a machine with hz=100. That means that + * processes with nice value=0 (i.e. -20 in userland) will be executed + * 32 milliseconds in the future at the latest. Processes with very + * little priority will be executed 1.28 seconds in the future at the very + * latest. 
The shift is done to ensure that the lowest possible offset is + * larger than the timeslice, in order to make sure that the scheduler does + * not degenerate to round robin behaviour when more than just a few processes + * with high priority are started. + * + * If the process voluntarily yielded its CPU, we reward it by halving its + * deadline offset. + */ + unsigned int offset_msec = ((p->p_p->ps_nice + 1) * rrticks_init) << (voluntary ? 2 : 3); + struct timeval offset = { + .tv_sec = offset_msec / 1000, + .tv_usec = (offset_msec % 1000) * 1000 + }; + struct timeval now; + microuptime(&now); + bcopy(&now, &(p->p_deadline_set), sizeof(struct timeval)); + + timeradd(&now, &offset, &(p->p_deadline)); + if (!voluntary) + p->p_rrticks = rrticks_init; } /* @@ -559,12 +550,8 @@ void schedclock(struct proc *p) { - int s; - - SCHED_LOCK(s); p->p_estcpu = ESTCPULIM(p->p_estcpu + 1); resetpriority(p); if (p->p_priority >= PUSER) p->p_priority = p->p_usrpri; - SCHED_UNLOCK(s); }
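
To make the "RB-tree as priority queue" point above a bit more concrete, here is a minimal userland sketch. It is not part of the patch, and the names (fakeproc, fp_deadline, deadline_cmp and so on) are invented for the example; it assumes the RB macros from OpenBSD's <sys/tree.h> are usable from userland. It orders dummy processes by their virtual deadline with the same kind of timercmp()-based comparator the patch uses, so RB_MIN() always returns the process with the earliest deadline, and insert, remove and minimum lookup are all O(log n).

/*
 * Illustrative only: a miniature deadline-ordered "runqueue" built on the
 * RB-tree macros from <sys/tree.h>.  Insert, remove and minimum lookup all
 * run in O(log n), which is why the tree can serve as a priority queue.
 */
#include <sys/time.h>
#include <sys/tree.h>

#include <stdio.h>

struct fakeproc {                       /* hypothetical stand-in for struct proc */
        RB_ENTRY(fakeproc) fp_runq;     /* tree linkage, like p_runq in the patch */
        struct timeval fp_deadline;     /* virtual deadline, like p_deadline */
        int fp_pid;
};

static int
deadline_cmp(struct fakeproc *a, struct fakeproc *b)
{
        if (a == b)
                return 0;
        /* earlier deadline sorts first; equal deadlines keep insertion order */
        if (timercmp(&a->fp_deadline, &b->fp_deadline, <))
                return -1;
        return 1;
}

RB_HEAD(fakerunq, fakeproc) runq = RB_INITIALIZER(&runq);
RB_GENERATE_STATIC(fakerunq, fakeproc, fp_runq, deadline_cmp);

int
main(void)
{
        struct fakeproc p[] = {
                { .fp_deadline = { 2, 0 },      .fp_pid = 1 },
                { .fp_deadline = { 0, 500000 }, .fp_pid = 2 },
                { .fp_deadline = { 1, 0 },      .fp_pid = 3 },
        };
        struct fakeproc *next;
        size_t i;

        for (i = 0; i < sizeof(p) / sizeof(p[0]); i++)
                RB_INSERT(fakerunq, &runq, &p[i]);      /* setrunqueue() analogue */

        /* sched_chooseproc() analogue: always pick the earliest deadline */
        while ((next = RB_MIN(fakerunq, &runq)) != NULL) {
                printf("run pid %d (deadline %lld.%06lds)\n", next->fp_pid,
                    (long long)next->fp_deadline.tv_sec,
                    (long)next->fp_deadline.tv_usec);
                RB_REMOVE(fakerunq, &runq, next);       /* remrunqueue() analogue */
        }
        return 0;
}

In the patch, setrunqueue(), remrunqueue() and sched_chooseproc() boil down to RB_INSERT(), RB_REMOVE() and RB_MIN() on trees like this one, plus the second tree of expired processes keyed by p_deadline_set.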
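
The deadline offset itself is just arithmetic on the nice value and rrticks_init. The following throwaway program is again not part of the patch; the variable names merely mirror generate_deadline(), and hz = 100 is an assumption for the example. It prints the offsets the formula produces so the effect of the nice value and of a voluntary yield is easy to see: with hz = 100 and rrticks_init = hz / 50 = 2, the preempted case ranges from 16 ms at ps_nice = 0 to 640 ms at ps_nice = 39, and a voluntary yield halves that.

/*
 * Illustrative only: evaluate the deadline-offset formula from
 * generate_deadline() in userland.  A smaller nice value (higher priority)
 * yields an earlier deadline; voluntary yielders get half the offset.
 */
#include <stdio.h>

int
main(void)
{
        int hz = 100;                   /* assumed timer frequency */
        int rrticks_init = hz / 50;     /* 20 ms timeslices, as in the patch */
        int nice, voluntary;

        for (nice = 0; nice <= 39; nice++) {            /* kernel ps_nice range */
                for (voluntary = 0; voluntary <= 1; voluntary++) {
                        unsigned int offset_msec =
                            ((nice + 1) * rrticks_init) << (voluntary ? 2 : 3);
                        printf("nice %2d %-11s -> deadline offset %4u ms\n",
                            nice, voluntary ? "(voluntary)" : "(preempted)",
                            offset_msec);
                }
        }
        return 0;
}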