On Sun, Dec 08, 2019 at 02:07:46AM -0800, Pratik Vyas wrote: > Hi! > > This is an attempt to address 'thundering herd' problem when a lot of > vms are configured in vm.conf. A lot of vms booting in parallel can > overload the host and also mess up tsc calibration in openbsd guests as > it uses PIT which doesn't fire reliably if the host is overloaded. > > > This diff makes vmd start vms in a staggered fashion with default parallelism > of > number of cpus on the host and a delay of 30s. Default can be overridden with > a line like following in vm.conf > > staggered start parallel 4 delay 30 > > > Every non-disabled vm starts in waiting state. If you are eager to > start a vm that is way further in the list, you can vmctl start it. > > Discussed the idea with ori@, mlarkin@ and phessler@. > > Comments / ok? > > -- > Pratik >
See below. Other than the nits below, ok mlarkin when you are ready. -ml > Index: usr.sbin/vmctl/vmctl.c > =================================================================== > RCS file: /home/cvs/src/usr.sbin/vmctl/vmctl.c,v > retrieving revision 1.71 > diff -u -p -a -u -r1.71 vmctl.c > --- usr.sbin/vmctl/vmctl.c 7 Sep 2019 09:11:14 -0000 1.71 > +++ usr.sbin/vmctl/vmctl.c 8 Dec 2019 09:29:39 -0000 > @@ -716,6 +716,8 @@ vm_state(unsigned int mask) > { > if (mask & VM_STATE_PAUSED) > return "paused"; > + else if (mask & VM_STATE_WAITING) > + return "waiting"; > else if (mask & VM_STATE_RUNNING) > return "running"; > else if (mask & VM_STATE_SHUTDOWN) > Index: usr.sbin/vmd/parse.y > =================================================================== > RCS file: /home/cvs/src/usr.sbin/vmd/parse.y,v > retrieving revision 1.52 > diff -u -p -a -u -r1.52 parse.y > --- usr.sbin/vmd/parse.y 14 May 2019 06:05:45 -0000 1.52 > +++ usr.sbin/vmd/parse.y 8 Dec 2019 09:29:39 -0000 > @@ -122,7 +122,8 @@ typedef struct { > %token INCLUDE ERROR > %token ADD ALLOW BOOT CDROM DEVICE DISABLE DISK DOWN ENABLE FORMAT > GROUP > %token INET6 INSTANCE INTERFACE LLADDR LOCAL LOCKED MEMORY NET NIFS > OWNER > -%token PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID > +%token PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID STAGGERED > START > +%token PARALLEL DELAY > %token <v.number> NUMBER > %token <v.string> STRING > %type <v.lladdr> lladdr > @@ -217,6 +218,11 @@ main : LOCAL INET6 { > env->vmd_ps.ps_csock.cs_uid = $3.uid; > env->vmd_ps.ps_csock.cs_gid = $3.gid == -1 ? 0 : $3.gid; > } > + | STAGGERED START PARALLEL NUMBER DELAY NUMBER { > + env->vmd_cfg.cfg_flags |= VMD_CFG_STAGGERED_START; > + env->vmd_cfg.delay.tv_sec = $6; > + env->vmd_cfg.parallelism = $4; > + } > ; > > switch : SWITCH string { > @@ -368,6 +374,8 @@ vm : VM string vm_instance { > } else { > if (vcp_disable) > vm->vm_state |= > VM_STATE_DISABLED; > + else > + vm->vm_state |= > VM_STATE_WAITING; > log_debug("%s:%d: vm \"%s\" " > "registered (%s)", > file->name, yylval.lineno, > @@ -766,6 +774,7 @@ lookup(char *s) > { "allow", ALLOW }, > { "boot", BOOT }, > { "cdrom", CDROM }, > + { "delay", DELAY }, > { "device", DEVICE }, > { "disable", DISABLE }, > { "disk", DISK }, > @@ -785,10 +794,13 @@ lookup(char *s) > { "memory", MEMORY }, > { "net", NET }, > { "owner", OWNER }, > + { "parallel", PARALLEL }, > { "prefix", PREFIX }, > { "rdomain", RDOMAIN }, > { "size", SIZE }, > { "socket", SOCKET }, > + { "staggered", STAGGERED }, > + { "start", START }, > { "switch", SWITCH }, > { "up", UP }, > { "vm", VM } > Index: usr.sbin/vmd/vm.conf.5 > =================================================================== > RCS file: /home/cvs/src/usr.sbin/vmd/vm.conf.5,v > retrieving revision 1.44 > diff -u -p -a -u -r1.44 vm.conf.5 > --- usr.sbin/vmd/vm.conf.5 14 May 2019 12:47:17 -0000 1.44 > +++ usr.sbin/vmd/vm.conf.5 8 Dec 2019 09:29:39 -0000 > @@ -91,6 +91,16 @@ vm "vm1.example.com" { > .Sh GLOBAL CONFIGURATION > The following setting can be configured globally: > .Bl -tag -width Ds > +.It Ic staggered start parallel Ar parallelism Ic delay Ar seconds > +Start all configured vms in staggered fashion with > +.Ar parallelism > +instances in parallel every > +.Ar delay > +seconds. Defaults to > +.Ar parallelism > +equal to number of cpus and > +.Ar delay > +of 30 seconds. > .It Ic local prefix Ar address Ns Li / Ns Ar prefix > Set the network prefix that is used to allocate subnets for > local interfaces, see > Index: usr.sbin/vmd/vmd.c > =================================================================== > RCS file: /home/cvs/src/usr.sbin/vmd/vmd.c,v > retrieving revision 1.116 > diff -u -p -a -u -r1.116 vmd.c > --- usr.sbin/vmd/vmd.c 4 Sep 2019 07:02:03 -0000 1.116 > +++ usr.sbin/vmd/vmd.c 8 Dec 2019 09:29:39 -0000 > @@ -21,6 +21,7 @@ > #include <sys/wait.h> > #include <sys/cdefs.h> > #include <sys/stat.h> > +#include <sys/sysctl.h> > #include <sys/tty.h> > #include <sys/ttycom.h> > #include <sys/ioctl.h> > @@ -63,6 +64,7 @@ int vm_instance(struct privsep *, struc > struct vmop_create_params *, uid_t); > int vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t); > int vm_claimid(const char *, int, uint32_t *); > +void start_vm_batch(int, short, void*); > > struct vmd *env; > > @@ -73,6 +75,8 @@ static struct privsep_proc procs[] = { > { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown }, > }; > > +struct event staggered_start_timer; > + > /* For the privileged process */ > static struct privsep_proc *proc_priv = &procs[0]; > static struct passwd proc_privpw; > @@ -854,11 +858,40 @@ main(int argc, char **argv) > return (0); > } > > +void > +start_vm_batch(int fd, short type, void *args) > +{ > + int i = 0; > + struct vmd_vm *vm; Can you add a blank line here? > + log_debug("%s: starting batch of %d vms", __func__, > + env->vmd_cfg.parallelism); > + TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { > + if (!(vm->vm_state & VM_STATE_WAITING)) { > + log_debug("%s: not creating vm %s (disabled)", "creating"? should it be "starting"? (I realize this was moved from elsewhere but we can fix it now) > + __func__, > + vm->vm_params.vmc_params.vcp_name); > + continue; > + } > + i++; > + if (i > env->vmd_cfg.parallelism) { > + evtimer_add(&staggered_start_timer, > + &env->vmd_cfg.delay); > + break; > + } > + vm->vm_state &= ~VM_STATE_WAITING; > + config_setvm(&env->vmd_ps, vm, -1, > + vm->vm_params.vmc_owner.uid); You can join the 2 lines above, I think (I think it fits) > + } > + log_debug("%s: done starting vms", __func__); > +} > + > int > vmd_configure(void) > { > - struct vmd_vm *vm; > + int ncpus; > struct vmd_switch *vsw; > + int ncpu_mib[] = {CTL_HW, HW_NCPU}; Would NCPUONLINE be a better fit here? > + size_t ncpus_sz = sizeof(ncpus); > > if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1) > fatal("open %s", PATH_PTMDEV); > @@ -906,17 +939,21 @@ vmd_configure(void) > } > } > > - TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) { > - if (vm->vm_state & VM_STATE_DISABLED) { > - log_debug("%s: not creating vm %s (disabled)", > - __func__, > - vm->vm_params.vmc_params.vcp_name); > - continue; > - } > - if (config_setvm(&env->vmd_ps, vm, > - -1, vm->vm_params.vmc_owner.uid) == -1) > - return (-1); > + if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) { > + env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY; > + if (sysctl(ncpu_mib, 2, &ncpus, &ncpus_sz, NULL, 0) == -1) > + ncpus = 1; > + env->vmd_cfg.parallelism = ncpus; > + log_debug("%s: setting staggered start configuration to " > + "parallelism: %d and delay: %lld", > + __func__, ncpus, env->vmd_cfg.delay.tv_sec); > + > } > + > + log_debug("%s: starting vms in staggered fashion", __func__); > + evtimer_set(&staggered_start_timer, start_vm_batch, NULL); > + /* start first batch */ > + start_vm_batch(0, 0, NULL); > > return (0); > } > Index: usr.sbin/vmd/vmd.h > =================================================================== > RCS file: /home/cvs/src/usr.sbin/vmd/vmd.h,v > retrieving revision 1.97 > diff -u -p -a -u -r1.97 vmd.h > --- usr.sbin/vmd/vmd.h 7 Sep 2019 09:11:14 -0000 1.97 > +++ usr.sbin/vmd/vmd.h 8 Dec 2019 09:29:39 -0000 > @@ -56,6 +56,8 @@ > #define VMD_SWITCH_TYPE "bridge" > #define VM_DEFAULT_MEMORY 512 > > +#define VMD_DEFAULT_STAGGERED_START_DELAY 30 > + > /* Rate-limit fast reboots */ > #define VM_START_RATE_SEC 6 /* min. seconds since last reboot */ > #define VM_START_RATE_LIMIT 3 /* max. number of fast reboots */ > @@ -280,6 +282,7 @@ struct vmd_vm { > #define VM_STATE_SHUTDOWN 0x04 > #define VM_STATE_RECEIVED 0x08 > #define VM_STATE_PAUSED 0x10 > +#define VM_STATE_WAITING 0x20 > > /* For rate-limiting */ > struct timeval vm_start_tv; > @@ -319,7 +322,10 @@ struct vmd_config { > unsigned int cfg_flags; > #define VMD_CFG_INET6 0x01 > #define VMD_CFG_AUTOINET6 0x02 > +#define VMD_CFG_STAGGERED_START 0x04 > > + struct timeval delay; > + int parallelism; > struct address cfg_localprefix; > struct address cfg_localprefix6; > }; >