tech@ friends: tl;dr: testers wanted for fixing Linux guest reboot. If you've got Linux guests that no longer reboot properly, please test! For other vmd users, please check for any regressions.
Our port of SeaBIOS is configured to enable QEMU features to simplify its working with vmd(8). This generally works well. SeaBIOS provides a reboot routine specifically for QEMU environments. One of the reasons is to provide some extra logic for refreshing the copy of the BIOS in memory (as if reading from ROM) before attempting the reset (first via PCI and falling back to triple-faulting). SeaBIOS appears to do this by assuming there's a "pristine copy" of the BIOS loaded by the host's emulator just below the 4GB mark in physical memory. (See src/fw/shadow.c in the SeaBIOS source tree.) This wasn't a problem until recent Linux kernel changes started calling into the BIOS as a way to reboot the guest. (I know at least the 5.15 kernel shipped with Alpine does this.) Since vmd/vmm doesn't create a mapping for that area just below 4GB, guests experience a page fault vm-exit and a resulting failure, as we consider that address part of the MMIO hole and reserved. This change to vmd(8) loads a second copy of the BIOS, ending at the 4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is adjusted to end 2MB below 4GB to accommodate SeaBIOS and future firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is larger than 1MB...haven't looked in a while.) Along the way, I replaced the hardcoded values for 1 MB and 4 GB with more human-readable #defines. For testers: 0. apply patch 1. build, install updated kernel, boot new kernel 2. copy or symlink sys/arch/amd64/include/vmmvar.h to /usr/include/amd64/ 3. build and install vmd(8) 4. test! 
~dv diff refs/heads/master refs/heads/vmd-bios4g blob - fea4ab52e6db7eff12b913ecde30abf970da0b54 blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8 --- sys/arch/amd64/include/vmmvar.h +++ sys/arch/amd64/include/vmmvar.h @@ -35,7 +35,7 @@ #define VMM_MAX_NICS_PER_VM 4 #define VMM_PCI_MMIO_BAR_BASE 0xF0000000ULL -#define VMM_PCI_MMIO_BAR_END 0xFFFFFFFFULL +#define VMM_PCI_MMIO_BAR_END 0xFFDFFFFFULL /* 2 MiB below 4 GiB */ #define VMM_PCI_MMIO_BAR_SIZE 0x00010000 #define VMM_PCI_IO_BAR_BASE 0x1000 #define VMM_PCI_IO_BAR_END 0xFFFF blob - d952ba4d8d0bff700fc09c066ffc284909150417 blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e --- usr.sbin/vmd/vm.c +++ usr.sbin/vmd/vm.c @@ -65,6 +65,10 @@ #include "vmd.h" #include "vmm.h" +#define _1_MB (1UL * 1024 * 1024) +#define _2_MB (2UL * 1024 * 1024) +#define _4_GB (4UL * 1024 * 1024 * 1024) + io_fn_t ioports_map[MAX_PORTS]; int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *, @@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s return (-1); /* The BIOS image must end at 1MB */ - if ((off = 1048576 - size) < 0) + if ((off = _1_MB - size) < 0) return (-1); /* Read BIOS image into memory */ @@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s return (-1); } + if (gzseek(fp, 0, SEEK_SET) == -1) + return (-1); + + /* Read a second BIOS copy into memory ending at 4GB */ + off = _4_GB - size; + if (mread(fp, off, size) != (size_t)size) { + errno = EIO; + return (-1); + } + log_debug("%s: loaded BIOS image", __func__); return (0); @@ -872,6 +886,7 @@ void create_memory_map(struct vm_create_params *vcp) { size_t len, mem_bytes; + size_t above_1m = 0, above_4g = 0; mem_bytes = vcp->vcp_memranges[0].vmr_size; vcp->vcp_nmemranges = 0; @@ -893,29 +908,47 @@ create_memory_map(struct vm_create_params *vcp) * we need to make sure that vmm(4) permits accesses * to it. So allocate guest memory for it. 
*/ - len = 0x100000 - LOWMEM_KB * 1024; + len = _1_MB - (LOWMEM_KB * 1024); vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024; vcp->vcp_memranges[1].vmr_size = len; mem_bytes -= len; - /* Make sure that we do not place physical memory into MMIO ranges. */ - if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000) - len = VMM_PCI_MMIO_BAR_BASE - 0x100000; - else - len = mem_bytes; - - /* Third memory region: 1MB - (1MB + len) */ - vcp->vcp_memranges[2].vmr_gpa = 0x100000; - vcp->vcp_memranges[2].vmr_size = len; - mem_bytes -= len; - - if (mem_bytes > 0) { - /* Fourth memory region for the remaining memory (if any) */ - vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; - vcp->vcp_memranges[3].vmr_size = mem_bytes; - vcp->vcp_nmemranges = 4; - } else + /* If we have less than 2MB remaining, still create a 2nd BIOS area. */ + if (mem_bytes <= _2_MB) { + vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END; + vcp->vcp_memranges[2].vmr_size = _2_MB; vcp->vcp_nmemranges = 3; + return; + } + + /* + * Calculate the how to split any remaining memory across the 4GB + * boundary while making sure we do not place physical memory into + * MMIO ranges. 
+ */ + if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - _1_MB) { + above_1m = VMM_PCI_MMIO_BAR_BASE - _1_MB; + above_4g = mem_bytes - above_1m; + } else { + above_1m = mem_bytes; + above_4g = 0; + } + + /* Third memory region: area above 1MB to MMIO region */ + vcp->vcp_memranges[2].vmr_gpa = _1_MB; + vcp->vcp_memranges[2].vmr_size = above_1m; + + /* Fourth region: 2nd copy of BIOS above MMIO ending at 4GB */ + vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1; + vcp->vcp_memranges[3].vmr_size = _2_MB; + + /* Fifth region: any remainder above 4GB */ + if (above_4g > 0) { + vcp->vcp_memranges[4].vmr_gpa = _4_GB; + vcp->vcp_memranges[4].vmr_size = above_4g; + vcp->vcp_nmemranges = 5; + } else + vcp->vcp_nmemranges = 4; } /* @@ -1015,17 +1048,19 @@ init_emulated_hw(struct vmop_create_params *vmc, int c int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) { struct vm_create_params *vcp = &vmc->vmc_params; - int i; + size_t i; uint64_t memlo, memhi; /* Calculate memory size for NVRAM registers */ memlo = memhi = 0; - if (vcp->vcp_nmemranges > 2) - memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000; + for (i = 0; i < vcp->vcp_nmemranges; i++) { + if (vcp->vcp_memranges[i].vmr_gpa == _1_MB && + vcp->vcp_memranges[i].vmr_size > (15 * _1_MB)) + memlo = vcp->vcp_memranges[i].vmr_size - (15 * _1_MB); + else if (vcp->vcp_memranges[i].vmr_gpa == _4_GB) + memhi = vcp->vcp_memranges[i].vmr_size; + } - if (vcp->vcp_nmemranges > 3) - memhi = vcp->vcp_memranges[3].vmr_size; - /* Reset the IO port map */ memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);