On Sun, Jun 05, 2022 at 09:25:34AM -0400, Dave Voutila wrote: > tech@ friends: > > tl;dr: testers wanted for fixing Linux guest reboot. If you've got > Linux guests that no longer reboot properly, please test! For other > vmd users, please check for any regressions. > > Our port of SeaBIOS is configured to enable QEMU features to simplify > its working with vmd(8). This generally works well. > > SeaBIOS provides a reboot routine specifically for QEMU environments. > One of the reasons is to provide some extra logic for refreshing the > copy of the BIOS in memory (as if reading from ROM) before attempting > the reset (first via PCI and falling back to triple-faulting). The way > SeaBIOS does this appears to be that it assumes there's a "pristine copy" > of the BIOS loaded by the host's emulator to just below the 4GB mark > in physical memory. (See src/fw/shadow.c in the SeaBIOS source tree.) > > This hasn't been a problem until recent Linux kernel changes started > calling into the BIOS as a way to reboot the guest. (I know at least > the 5.15 kernel shipped with Alpine does this.) > > Since vmd/vmm doesn't create a mapping for that area just below 4GB, > guests experience a page fault vm-exit and a resulting failure as we > consider that address part of the MMIO hole and reserved. > > This change to vmd(8) loads a second copy of the BIOS, ending at the > 4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is > adjusted to end 2MB below 4GB to accommodate SeaBIOS and future > firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is > larger than 1MB...haven't looked in a while.) > > Along the way, I adjusted the use of hardcoded values for 1 MB and 4 > GB to use a more human-readable version via #defines. > > For testers: > 0. apply patch > 1. build, install updated kernel, boot new kernel > 2. copy or symlink sys/arch/amd64/include/vmmvar.h to > /usr/include/amd64/ > 3. build and install vmd(8) > 4. test! > > ~dv >
Does qemu load 2 copies of the bios or just rely on A20 tricks to make the bios appear at two addresses? -ml > > diff refs/heads/master refs/heads/vmd-bios4g > blob - fea4ab52e6db7eff12b913ecde30abf970da0b54 > blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8 > --- sys/arch/amd64/include/vmmvar.h > +++ sys/arch/amd64/include/vmmvar.h > @@ -35,7 +35,7 @@ > #define VMM_MAX_NICS_PER_VM 4 > > #define VMM_PCI_MMIO_BAR_BASE 0xF0000000ULL > -#define VMM_PCI_MMIO_BAR_END 0xFFFFFFFFULL > +#define VMM_PCI_MMIO_BAR_END 0xFFDFFFFFULL /* 2 MiB below 4 GiB */ > #define VMM_PCI_MMIO_BAR_SIZE 0x00010000 > #define VMM_PCI_IO_BAR_BASE 0x1000 > #define VMM_PCI_IO_BAR_END 0xFFFF > blob - d952ba4d8d0bff700fc09c066ffc284909150417 > blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e > --- usr.sbin/vmd/vm.c > +++ usr.sbin/vmd/vm.c > @@ -65,6 +65,10 @@ > #include "vmd.h" > #include "vmm.h" > > +#define _1_MB (1UL * 1024 * 1024) > +#define _2_MB (2UL * 1024 * 1024) > +#define _4_GB (4UL * 1024 * 1024 * 1024) > + > io_fn_t ioports_map[MAX_PORTS]; > > int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, > @@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s > return (-1); > > /* The BIOS image must end at 1MB */ > - if ((off = 1048576 - size) < 0) > + if ((off = _1_MB - size) < 0) > return (-1); > > /* Read BIOS image into memory */ > @@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s > return (-1); > } > > + if (gzseek(fp, 0, SEEK_SET) == -1) > + return (-1); > + > + /* Read a second BIOS copy into memory ending at 4GB */ > + off = _4_GB - size; > + if (mread(fp, off, size) != (size_t)size) { > + errno = EIO; > + return (-1); > + } > + > log_debug("%s: loaded BIOS image", __func__); > > return (0); > @@ -872,6 +886,7 @@ void > create_memory_map(struct vm_create_params *vcp) > { > size_t len, mem_bytes; > + size_t above_1m = 0, above_4g = 0; > > mem_bytes = vcp->vcp_memranges[0].vmr_size; > vcp->vcp_nmemranges = 0; > @@ -893,29 +908,47 @@ 
create_memory_map(struct vm_create_params *vcp) > * we need to make sure that vmm(4) permits accesses > * to it. So allocate guest memory for it. > */ > - len = 0x100000 - LOWMEM_KB * 1024; > + len = _1_MB - (LOWMEM_KB * 1024); > vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; > vcp->vcp_memranges[1].vmr_size = len; > mem_bytes -= len; > > - /* Make sure that we do not place physical memory into MMIO ranges. */ > - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) > - len = VMM_PCI_MMIO_BAR_BASE - 0x100000; > - else > - len = mem_bytes; > - > - /* Third memory region: 1MB - (1MB + len) */ > - vcp->vcp_memranges[2].vmr_gpa = 0x100000; > - vcp->vcp_memranges[2].vmr_size = len; > - mem_bytes -= len; > - > - if (mem_bytes > 0) { > - /* Fourth memory region for the remaining memory (if any) */ > - vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; > - vcp->vcp_memranges[3].vmr_size = mem_bytes; > - vcp->vcp_nmemranges = 4; > - } else > + /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ > + if (mem_bytes <= _2_MB) { > + vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; > + vcp->vcp_memranges[2].vmr_size = _2_MB; > vcp->vcp_nmemranges = 3; > + return; > + } > + > + /* > + * Calculate the how to split any remaining memory across the 4GB > + * boundary while making sure we do not place physical memory into > + * MMIO ranges. 
> + */ > + if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - _1_MB) { > + above_1m = VMM_PCI_MMIO_BAR_BASE - _1_MB; > + above_4g = mem_bytes - above_1m; > + } else { > + above_1m = mem_bytes; > + above_4g = 0; > + } > + > + /* Third memory region: area above 1MB to MMIO region */ > + vcp->vcp_memranges[2].vmr_gpa = _1_MB; > + vcp->vcp_memranges[2].vmr_size = above_1m; > + > + /* Fourth region: 2nd copy of BIOS above MMIO ending at 4GB */ > + vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; > + vcp->vcp_memranges[3].vmr_size = _2_MB; > + > + /* Fifth region: any remainder above 4GB */ > + if (above_4g > 0) { > + vcp->vcp_memranges[4].vmr_gpa = _4_GB; > + vcp->vcp_memranges[4].vmr_size = above_4g; > + vcp->vcp_nmemranges = 5; > + } else > + vcp->vcp_nmemranges = 4; > } > > /* > @@ -1015,17 +1048,19 @@ init_emulated_hw(struct vmop_create_params *vmc, int c > int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) > { > struct vm_create_params *vcp = &vmc->vmc_params; > - int i; > + size_t i; > uint64_t memlo, memhi; > > /* Calculate memory size for NVRAM registers */ > memlo = memhi = 0; > - if (vcp->vcp_nmemranges > 2) > - memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000; > + for (i = 0; i < vcp->vcp_nmemranges; i++) { > + if (vcp->vcp_memranges[i].vmr_gpa == _1_MB && > + vcp->vcp_memranges[i].vmr_size > (15 * _1_MB)) > + memlo = vcp->vcp_memranges[i].vmr_size - (15 * _1_MB); > + else if (vcp->vcp_memranges[i].vmr_gpa == _4_GB) > + memhi = vcp->vcp_memranges[i].vmr_size; > + } > > - if (vcp->vcp_nmemranges > 3) > - memhi = vcp->vcp_memranges[3].vmr_size; > - > /* Reset the IO port map */ > memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS); >