On Sun, Dec 08, 2019 at 02:07:46AM -0800, Pratik Vyas wrote:
> Hi!
> 
> This is an attempt to address 'thundering herd' problem when a lot of
> vms are configured in vm.conf.  A lot of vms booting in parallel can
> overload the host and also mess up tsc calibration in openbsd guests as
> it uses PIT which doesn't fire reliably if the host is overloaded.
> 
> 
> This diff makes vmd start vms in a staggered fashion with default parallelism 
> of
> number of cpus on the host and a delay of 30s.  Default can be overridden with
> a line like following in vm.conf
> 
> staggered start parallel 4 delay 30
> 
> 
> Every non-disabled vm starts in waiting state.  If you are eager to
> start a vm that is way further in the list, you can vmctl start it.
> 
> Discussed the idea with ori@, mlarkin@ and phessler@.
> 
> Comments / ok?
> 
> --
> Pratik
> 

See below. Other than the nits below, ok mlarkin when you are ready.

-ml

> Index: usr.sbin/vmctl/vmctl.c
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmctl/vmctl.c,v
> retrieving revision 1.71
> diff -u -p -a -u -r1.71 vmctl.c
> --- usr.sbin/vmctl/vmctl.c    7 Sep 2019 09:11:14 -0000       1.71
> +++ usr.sbin/vmctl/vmctl.c    8 Dec 2019 09:29:39 -0000
> @@ -716,6 +716,8 @@ vm_state(unsigned int mask)
> {
>       if (mask & VM_STATE_PAUSED)
>               return "paused";
> +     else if (mask & VM_STATE_WAITING)
> +             return "waiting";
>       else if (mask & VM_STATE_RUNNING)
>               return "running";
>       else if (mask & VM_STATE_SHUTDOWN)
> Index: usr.sbin/vmd/parse.y
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/parse.y,v
> retrieving revision 1.52
> diff -u -p -a -u -r1.52 parse.y
> --- usr.sbin/vmd/parse.y      14 May 2019 06:05:45 -0000      1.52
> +++ usr.sbin/vmd/parse.y      8 Dec 2019 09:29:39 -0000
> @@ -122,7 +122,8 @@ typedef struct {
> %token        INCLUDE ERROR
> %token        ADD ALLOW BOOT CDROM DEVICE DISABLE DISK DOWN ENABLE FORMAT 
> GROUP
> %token        INET6 INSTANCE INTERFACE LLADDR LOCAL LOCKED MEMORY NET NIFS 
> OWNER
> -%token       PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID
> +%token       PATH PREFIX RDOMAIN SIZE SOCKET SWITCH UP VM VMID STAGGERED 
> START
> +%token  PARALLEL DELAY
> %token        <v.number>      NUMBER
> %token        <v.string>      STRING
> %type <v.lladdr>      lladdr
> @@ -217,6 +218,11 @@ main             : LOCAL INET6 {
>                       env->vmd_ps.ps_csock.cs_uid = $3.uid;
>                       env->vmd_ps.ps_csock.cs_gid = $3.gid == -1 ? 0 : $3.gid;
>               }
> +             | STAGGERED START PARALLEL NUMBER DELAY NUMBER {
> +                     env->vmd_cfg.cfg_flags |= VMD_CFG_STAGGERED_START;
> +                     env->vmd_cfg.delay.tv_sec = $6;
> +                     env->vmd_cfg.parallelism = $4;
> +             }
>               ;
> 
> switch                : SWITCH string                 {
> @@ -368,6 +374,8 @@ vm                : VM string vm_instance         {
>                               } else {
>                                       if (vcp_disable)
>                                               vm->vm_state |= 
> VM_STATE_DISABLED;
> +                                     else
> +                                             vm->vm_state |= 
> VM_STATE_WAITING;
>                                       log_debug("%s:%d: vm \"%s\" "
>                                           "registered (%s)",
>                                           file->name, yylval.lineno,
> @@ -766,6 +774,7 @@ lookup(char *s)
>               { "allow",              ALLOW },
>               { "boot",               BOOT },
>               { "cdrom",              CDROM },
> +             { "delay",              DELAY },
>               { "device",             DEVICE },
>               { "disable",            DISABLE },
>               { "disk",               DISK },
> @@ -785,10 +794,13 @@ lookup(char *s)
>               { "memory",             MEMORY },
>               { "net",                NET },
>               { "owner",              OWNER },
> +             { "parallel",           PARALLEL },
>               { "prefix",             PREFIX },
>               { "rdomain",            RDOMAIN },
>               { "size",               SIZE },
>               { "socket",             SOCKET },
> +             { "staggered",          STAGGERED },
> +             { "start",              START  },
>               { "switch",             SWITCH },
>               { "up",                 UP },
>               { "vm",                 VM }
> Index: usr.sbin/vmd/vm.conf.5
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vm.conf.5,v
> retrieving revision 1.44
> diff -u -p -a -u -r1.44 vm.conf.5
> --- usr.sbin/vmd/vm.conf.5    14 May 2019 12:47:17 -0000      1.44
> +++ usr.sbin/vmd/vm.conf.5    8 Dec 2019 09:29:39 -0000
> @@ -91,6 +91,16 @@ vm "vm1.example.com" {
> .Sh GLOBAL CONFIGURATION
> The following setting can be configured globally:
> .Bl -tag -width Ds
> +.It Ic staggered start parallel Ar parallelism Ic delay Ar seconds
> +Start all configured vms in staggered fashion with
> +.Ar parallelism
> +instances in parallel every
> +.Ar delay
> +seconds.  Defaults to
> +.Ar parallelism
> +equal to number of cpus and
> +.Ar delay
> +of 30 seconds.
> .It Ic local prefix Ar address Ns Li / Ns Ar prefix
> Set the network prefix that is used to allocate subnets for
> local interfaces, see
> Index: usr.sbin/vmd/vmd.c
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vmd.c,v
> retrieving revision 1.116
> diff -u -p -a -u -r1.116 vmd.c
> --- usr.sbin/vmd/vmd.c        4 Sep 2019 07:02:03 -0000       1.116
> +++ usr.sbin/vmd/vmd.c        8 Dec 2019 09:29:39 -0000
> @@ -21,6 +21,7 @@
> #include <sys/wait.h>
> #include <sys/cdefs.h>
> #include <sys/stat.h>
> +#include <sys/sysctl.h>
> #include <sys/tty.h>
> #include <sys/ttycom.h>
> #include <sys/ioctl.h>
> @@ -63,6 +64,7 @@ int  vm_instance(struct privsep *, struc
>           struct vmop_create_params *, uid_t);
> int    vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
> int    vm_claimid(const char *, int, uint32_t *);
> +void  start_vm_batch(int, short, void*);
> 
> struct vmd    *env;
> 
> @@ -73,6 +75,8 @@ static struct privsep_proc procs[] = {
>       { "vmm",        PROC_VMM,       vmd_dispatch_vmm, vmm, vmm_shutdown },
> };
> 
> +struct event staggered_start_timer;
> +
> /* For the privileged process */
> static struct privsep_proc *proc_priv = &procs[0];
> static struct passwd proc_privpw;
> @@ -854,11 +858,40 @@ main(int argc, char **argv)
>       return (0);
> }
> 
> +void
> +start_vm_batch(int fd, short type, void *args)
> +{
> +     int             i = 0;
> +     struct vmd_vm   *vm;

Can you add a blank line here?

> +     log_debug("%s: starting batch of %d vms", __func__,
> +         env->vmd_cfg.parallelism);
> +     TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
> +             if (!(vm->vm_state & VM_STATE_WAITING)) {
> +                     log_debug("%s: not creating vm %s (disabled)",

"creating"? should it be "starting"? (I realize this was moved from
elsewhere but we can fix it now)

> +                         __func__,
> +                         vm->vm_params.vmc_params.vcp_name);
> +                     continue;
> +             }
> +             i++;
> +             if (i > env->vmd_cfg.parallelism) {
> +                     evtimer_add(&staggered_start_timer,
> +                         &env->vmd_cfg.delay);
> +                     break;
> +             }
> +             vm->vm_state &= ~VM_STATE_WAITING;
> +             config_setvm(&env->vmd_ps, vm, -1,
> +                 vm->vm_params.vmc_owner.uid);

You can join the 2 lines above, I think (I think it fits)

> +     }
> +     log_debug("%s: done starting vms", __func__);
> +}
> +
> int
> vmd_configure(void)
> {
> -     struct vmd_vm           *vm;
> +     int                     ncpus;
>       struct vmd_switch       *vsw;
> +     int ncpu_mib[] = {CTL_HW, HW_NCPU};

Would NCPUONLINE be a better fit here?

> +     size_t ncpus_sz = sizeof(ncpus);
> 
>       if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
>               fatal("open %s", PATH_PTMDEV);
> @@ -906,17 +939,21 @@ vmd_configure(void)
>               }
>       }
> 
> -     TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
> -             if (vm->vm_state & VM_STATE_DISABLED) {
> -                     log_debug("%s: not creating vm %s (disabled)",
> -                         __func__,
> -                         vm->vm_params.vmc_params.vcp_name);
> -                     continue;
> -             }
> -             if (config_setvm(&env->vmd_ps, vm,
> -                 -1, vm->vm_params.vmc_owner.uid) == -1)
> -                     return (-1);
> +     if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
> +             env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
> +             if (sysctl(ncpu_mib, 2, &ncpus, &ncpus_sz, NULL, 0) == -1)
> +                     ncpus = 1;
> +             env->vmd_cfg.parallelism = ncpus;
> +             log_debug("%s: setting staggered start configuration to "
> +                 "parallelism: %d and delay: %lld",
> +                 __func__, ncpus, env->vmd_cfg.delay.tv_sec);
> +
>       }
> +
> +     log_debug("%s: starting vms in staggered fashion", __func__);
> +     evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
> +     /* start first batch */
> +     start_vm_batch(0, 0, NULL);
> 
>       return (0);
> }
> Index: usr.sbin/vmd/vmd.h
> ===================================================================
> RCS file: /home/cvs/src/usr.sbin/vmd/vmd.h,v
> retrieving revision 1.97
> diff -u -p -a -u -r1.97 vmd.h
> --- usr.sbin/vmd/vmd.h        7 Sep 2019 09:11:14 -0000       1.97
> +++ usr.sbin/vmd/vmd.h        8 Dec 2019 09:29:39 -0000
> @@ -56,6 +56,8 @@
> #define VMD_SWITCH_TYPE               "bridge"
> #define VM_DEFAULT_MEMORY     512
> 
> +#define VMD_DEFAULT_STAGGERED_START_DELAY 30
> +
> /* Rate-limit fast reboots */
> #define VM_START_RATE_SEC     6       /* min. seconds since last reboot */
> #define VM_START_RATE_LIMIT   3       /* max. number of fast reboots */
> @@ -280,6 +282,7 @@ struct vmd_vm {
> #define VM_STATE_SHUTDOWN     0x04
> #define VM_STATE_RECEIVED     0x08
> #define VM_STATE_PAUSED               0x10
> +#define VM_STATE_WAITING     0x20
> 
>       /* For rate-limiting */
>       struct timeval           vm_start_tv;
> @@ -319,7 +322,10 @@ struct vmd_config {
>       unsigned int             cfg_flags;
> #define VMD_CFG_INET6         0x01
> #define VMD_CFG_AUTOINET6     0x02
> +#define VMD_CFG_STAGGERED_START      0x04
> 
> +     struct timeval           delay;
> +     int                      parallelism;
>       struct address           cfg_localprefix;
>       struct address           cfg_localprefix6;
> };
> 

Reply via email to