> On 8 Dec 2019, at 11:08, Pratik Vyas <m...@pd.io> wrote:
>
> Hi!
>
> This is an attempt to address the 'thundering herd' problem when a lot of
> vms are configured in vm.conf. A lot of vms booting in parallel can
> overload the host and also mess up TSC calibration in OpenBSD guests, as
> the calibration uses the PIT, which doesn't fire reliably if the host is overloaded.
>
>
> This diff makes vmd start vms in a staggered fashion with a default
> parallelism equal to the number of cpus on the host and a delay of 30s.
> The defaults can be overridden with a line like the following in vm.conf:
>
> staggered start parallel 4 delay 30
>
>
> Every non-disabled vm starts in the waiting state. If you are eager to
> start a vm that is much further down the list, you can start it manually with vmctl start.
>
> Discussed the idea with ori@, mlarkin@ and phessler@.
>
> Comments / ok?
Great addition to stop -w. Like it!
Mischa
> --
> Pratik
>
> Index: usr.sbin/vmctl/vmctl.c
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmctl/vmctl.c,v
> retrieving revision 1.71
> diff -u -p -a -u -r1.71 vmctl.c
> --- usr.sbin/vmctl/vmctl.c 7 Sep 2019 09:11:14 -0000 1.71
> +++ usr.sbin/vmctl/vmctl.c 8 Dec 2019 09:29:39 -0000
> @@ -716,6 +716,8 @@ vm_state(unsigned int mask)
> {
> if (mask & VM_STATE_PAUSED)
> return "paused";
> + else if (mask & VM_STATE_WAITING)
> + return "waiting";
> else if (mask & VM_STATE_RUNNING)
> return "running";
> else if (mask & VM_STATE_SHUTDOWN)
> Index: usr.sbin/vmd/parse.y
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/parse.y,v
> retrieving revision 1.52
> diff -u -p -a -u -r1.52 parse.y
> --- usr.sbin/vmd/parse.y 14 May 2019 06:05:45 -0000 1.52
> +++ usr.sbin/vmd/parse.y 8 Dec 2019 09:29:39 -0000
> @@ -122,7 +122,8 @@ typedef struct {
> %token INCLUDE ERROR
> %token ADD ALLOW BOOT CDROM DEVICE DISABLE DISK DOWN ENABLE FORMAT GROUP
> %token INET6 INSTANCE INTERFACE LLADDR LOCAL LOCKED MEMORY NET NIFS OWNER
> -%token PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID
> +%token PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID STAGGERED START
> +%token PARALLEL DELAY
> %token <v.number> NUMBER
> %token <v.string> STRING
> %type <v.lladdr> lladdr
> @@ -217,6 +218,11 @@ main : LOCAL INET6 {
> env->vmd_ps.ps_csock.cs_uid = $3.uid;
> env->vmd_ps.ps_csock.cs_gid = $3.gid == -1 ? 0 : $3.gid;
> }
> + | STAGGERED START PARALLEL NUMBER DELAY NUMBER {
> + env->vmd_cfg.cfg_flags |= VMD_CFG_STAGGERED_START;
> + env->vmd_cfg.delay.tv_sec = $6;
> + env->vmd_cfg.parallelism = $4;
> + }
> ;
> switch : SWITCH string {
> @@ -368,6 +374,8 @@ vm : VM string vm_instance {
> } else {
> if (vcp_disable)
> vm->vm_state |= VM_STATE_DISABLED;
> + else
> + vm->vm_state |= VM_STATE_WAITING;
> log_debug("%s:%d: vm \"%s\" "
> "registered (%s)",
> file->name, yylval.lineno,
> @@ -766,6 +774,7 @@ lookup(char *s)
> { "allow", ALLOW },
> { "boot", BOOT },
> { "cdrom", CDROM },
> + { "delay", DELAY },
> { "device", DEVICE },
> { "disable", DISABLE },
> { "disk", DISK },
> @@ -785,10 +794,13 @@ lookup(char *s)
> { "memory", MEMORY },
> { "net", NET },
> { "owner", OWNER },
> + { "parallel", PARALLEL },
> { "prefix", PREFIX },
> { "rdomain", RDOMAIN },
> { "size", SIZE },
> { "socket", SOCKET },
> + { "staggered", STAGGERED },
> + { "start", START },
> { "switch", SWITCH },
> { "up", UP },
> { "vm", VM }
> Index: usr.sbin/vmd/vm.conf.5
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vm.conf.5,v
> retrieving revision 1.44
> diff -u -p -a -u -r1.44 vm.conf.5
> --- usr.sbin/vmd/vm.conf.5 14 May 2019 12:47:17 -0000 1.44
> +++ usr.sbin/vmd/vm.conf.5 8 Dec 2019 09:29:39 -0000
> @@ -91,6 +91,16 @@ vm "vm1.example.com" {
> .Sh GLOBAL CONFIGURATION
> The following setting can be configured globally:
> .Bl -tag -width Ds
> +.It Ic staggered start parallel Ar parallelism Ic delay Ar seconds
> +Start all configured vms in a staggered fashion with
> +.Ar parallelism
> +instances in parallel every
> +.Ar delay
> +seconds. Defaults to
> +.Ar parallelism
> +equal to the number of cpus and a
> +.Ar delay
> +of 30 seconds.
> .It Ic local prefix Ar address Ns Li / Ns Ar prefix
> Set the network prefix that is used to allocate subnets for
> local interfaces, see
> Index: usr.sbin/vmd/vmd.c
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.116
> diff -u -p -a -u -r1.116 vmd.c
> --- usr.sbin/vmd/vmd.c 4 Sep 2019 07:02:03 -0000 1.116
> +++ usr.sbin/vmd/vmd.c 8 Dec 2019 09:29:39 -0000
> @@ -21,6 +21,7 @@
> #include <sys/wait.h>
> #include <sys/cdefs.h>
> #include <sys/stat.h>
> +#include <sys/sysctl.h>
> #include <sys/tty.h>
> #include <sys/ttycom.h>
> #include <sys/ioctl.h>
> @@ -63,6 +64,7 @@ int vm_instance(struct privsep *, struc
> struct vmop_create_params *, uid_t);
> int vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
> int vm_claimid(const char *, int, uint32_t *);
> +void start_vm_batch(int, short, void*);
> struct vmd *env;
> @@ -73,6 +75,8 @@ static struct privsep_proc procs[] = {
> { "vmm", PROC_VMM, vmd_dispatch_vmm, vmm, vmm_shutdown },
> };
> +struct event staggered_start_timer;
> +
> /* For the privileged process */
> static struct privsep_proc *proc_priv = &procs[0];
> static struct passwd proc_privpw;
> @@ -854,11 +858,40 @@ main(int argc, char **argv)
> return (0);
> }
> +void
> +start_vm_batch(int fd, short type, void *args)
> +{
> + int i = 0;
> + struct vmd_vm *vm;
> + log_debug("%s: starting batch of %d vms", __func__,
> + env->vmd_cfg.parallelism);
> + TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
> + if (!(vm->vm_state & VM_STATE_WAITING)) {
> + log_debug("%s: not creating vm %s (disabled)",
> + __func__,
> + vm->vm_params.vmc_params.vcp_name);
> + continue;
> + }
> + i++;
> + if (i > env->vmd_cfg.parallelism) {
> + evtimer_add(&staggered_start_timer,
> + &env->vmd_cfg.delay);
> + break;
> + }
> + vm->vm_state &= ~VM_STATE_WAITING;
> + config_setvm(&env->vmd_ps, vm, -1,
> + vm->vm_params.vmc_owner.uid);
> + }
> + log_debug("%s: done starting vms", __func__);
> +}
> +
> int
> vmd_configure(void)
> {
> - struct vmd_vm *vm;
> + int ncpus;
> struct vmd_switch *vsw;
> + int ncpu_mib[] = {CTL_HW, HW_NCPU};
> + size_t ncpus_sz = sizeof(ncpus);
> if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
> fatal("open %s", PATH_PTMDEV);
> @@ -906,17 +939,21 @@ vmd_configure(void)
> }
> }
> - TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
> - if (vm->vm_state & VM_STATE_DISABLED) {
> - log_debug("%s: not creating vm %s (disabled)",
> - __func__,
> - vm->vm_params.vmc_params.vcp_name);
> - continue;
> - }
> - if (config_setvm(&env->vmd_ps, vm,
> - -1, vm->vm_params.vmc_owner.uid) == -1)
> - return (-1);
> + if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
> + env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
> + if (sysctl(ncpu_mib, 2, &ncpus, &ncpus_sz, NULL, 0) == -1)
> + ncpus = 1;
> + env->vmd_cfg.parallelism = ncpus;
> + log_debug("%s: setting staggered start configuration to "
> + "parallelism: %d and delay: %lld",
> + __func__, ncpus, env->vmd_cfg.delay.tv_sec);
> +
> }
> +
> + log_debug("%s: starting vms in staggered fashion", __func__);
> + evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
> + /* start first batch */
> + start_vm_batch(0, 0, NULL);
> return (0);
> }
> Index: usr.sbin/vmd/vmd.h
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vmd.h,v
> retrieving revision 1.97
> diff -u -p -a -u -r1.97 vmd.h
> --- usr.sbin/vmd/vmd.h 7 Sep 2019 09:11:14 -0000 1.97
> +++ usr.sbin/vmd/vmd.h 8 Dec 2019 09:29:39 -0000
> @@ -56,6 +56,8 @@
> #define VMD_SWITCH_TYPE "bridge"
> #define VM_DEFAULT_MEMORY 512
> +#define VMD_DEFAULT_STAGGERED_START_DELAY 30
> +
> /* Rate-limit fast reboots */
> #define VM_START_RATE_SEC 6 /* min. seconds since last reboot */
> #define VM_START_RATE_LIMIT 3 /* max. number of fast reboots */
> @@ -280,6 +282,7 @@ struct vmd_vm {
> #define VM_STATE_SHUTDOWN 0x04
> #define VM_STATE_RECEIVED 0x08
> #define VM_STATE_PAUSED 0x10
> +#define VM_STATE_WAITING 0x20
> /* For rate-limiting */
> struct timeval vm_start_tv;
> @@ -319,7 +322,10 @@ struct vmd_config {
> unsigned int cfg_flags;
> #define VMD_CFG_INET6 0x01
> #define VMD_CFG_AUTOINET6 0x02
> +#define VMD_CFG_STAGGERED_START 0x04
> + struct timeval delay;
> + int parallelism;
> struct address cfg_localprefix;
> struct address cfg_localprefix6;
> };
>