tech@ friends:

tl;dr: testers wanted for fixing Linux guest reboot. If you've got
Linux guests that no longer reboot properly, please test! For other
vmd users, please check for any regressions.

Our port of SeaBIOS is configured to enable QEMU features to simplify
its working with vmd(8). This generally works well.

SeaBIOS provides a reboot routine specifically for QEMU environments.
One of the reasons is to provide some extra logic for refreshing the
copy of the BIOS in memory (as if reading from ROM) before attempting
the reset (first via PCI and falling back to triple-faulting). SeaBIOS
appears to do this by assuming there's a "pristine copy" of the BIOS
loaded by the host's emulator just below the 4GB mark in physical
memory. (See src/fw/shadow.c in the SeaBIOS source tree.)

This hasn't been a problem until recent Linux kernel changes started
calling into the BIOS as a way to reboot the guest. (I know at least
the 5.15 kernel shipped with Alpine does this.)

Since vmd/vmm doesn't create a mapping for that area just below 4GB,
guests experience a page fault vm-exit and a resulting failure as we
consider that address part of the MMIO hole and reserved.

This change to vmd(8) loads a second copy of the BIOS, ending at the
4GB mark in guest memory. Consequently, vmm(4)'s MMIO memory hole is
adjusted to end 2MB below 4GB to accommodate SeaBIOS and future
firmware payloads that may be > 1MB in size. (I believe EDK-II UEFI is
larger than 1MB...haven't looked in a while.)

Along the way, I replaced the hardcoded values for 1 MB and 4 GB with
more human-readable #defines.

For testers:
  0. apply patch
  1. build, install updated kernel, boot new kernel
  2. copy or symlink sys/arch/amd64/include/vmmvar.h to
     /usr/include/amd64/
  3. build and install vmd(8)
  4. test!

~dv


diff refs/heads/master refs/heads/vmd-bios4g
blob - fea4ab52e6db7eff12b913ecde30abf970da0b54
blob + f06212b18f8ae19b5edc8fa8d64684d7163e35a8
--- sys/arch/amd64/include/vmmvar.h
+++ sys/arch/amd64/include/vmmvar.h
@@ -35,7 +35,7 @@
 #define VMM_MAX_NICS_PER_VM    4

 #define VMM_PCI_MMIO_BAR_BASE  0xF0000000ULL
-#define VMM_PCI_MMIO_BAR_END   0xFFFFFFFFULL
+#define VMM_PCI_MMIO_BAR_END   0xFFDFFFFFULL           /* 2 MiB below 4 GiB */
 #define VMM_PCI_MMIO_BAR_SIZE  0x00010000
 #define VMM_PCI_IO_BAR_BASE    0x1000
 #define VMM_PCI_IO_BAR_END     0xFFFF
blob - d952ba4d8d0bff700fc09c066ffc284909150417
blob + c36e17eb5ed4d1799f55fa1af5f7ca158923202e
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,6 +65,10 @@
 #include "vmd.h"
 #include "vmm.h"

+#define _1_MB  (1UL * 1024 * 1024)
+#define _2_MB  (2UL * 1024 * 1024)
+#define _4_GB  (4UL * 1024 * 1024 * 1024)
+
 io_fn_t ioports_map[MAX_PORTS];

 int run_vm(int, int[][VM_MAX_BASE_PER_DISK], int *,
@@ -234,7 +238,7 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
                return (-1);

        /* The BIOS image must end at 1MB */
-       if ((off = 1048576 - size) < 0)
+       if ((off = _1_MB - size) < 0)
                return (-1);

        /* Read BIOS image into memory */
@@ -243,6 +247,16 @@ loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_s
                return (-1);
        }

+       if (gzseek(fp, 0, SEEK_SET) == -1)
+               return (-1);
+
+       /* Read a second BIOS copy into memory ending at 4GB */
+       off = _4_GB - size;
+       if (mread(fp, off, size) != (size_t)size) {
+               errno = EIO;
+               return (-1);
+       }
+
        log_debug("%s: loaded BIOS image", __func__);

        return (0);
@@ -872,6 +886,7 @@ void
 create_memory_map(struct vm_create_params *vcp)
 {
        size_t len, mem_bytes;
+       size_t above_1m = 0, above_4g = 0;

        mem_bytes = vcp->vcp_memranges[0].vmr_size;
        vcp->vcp_nmemranges = 0;
@@ -893,29 +908,47 @@ create_memory_map(struct vm_create_params *vcp)
         * we need to make sure that vmm(4) permits accesses
         * to it. So allocate guest memory for it.
         */
-       len = 0x100000 - LOWMEM_KB * 1024;
+       len = _1_MB - (LOWMEM_KB * 1024);
        vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
        vcp->vcp_memranges[1].vmr_size = len;
        mem_bytes -= len;

-       /* Make sure that we do not place physical memory into MMIO ranges. */
-       if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
-               len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
-       else
-               len = mem_bytes;
-
-       /* Third memory region: 1MB - (1MB + len) */
-       vcp->vcp_memranges[2].vmr_gpa = 0x100000;
-       vcp->vcp_memranges[2].vmr_size = len;
-       mem_bytes -= len;
-
-       if (mem_bytes > 0) {
-               /* Fourth memory region for the remaining memory (if any) */
-               vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
-               vcp->vcp_memranges[3].vmr_size = mem_bytes;
-               vcp->vcp_nmemranges = 4;
-       } else
+       /* If we have less than 2MB remaining, still create a 2nd BIOS area. */
+       if (mem_bytes <= _2_MB) {
+               vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
+               vcp->vcp_memranges[2].vmr_size = _2_MB;
                vcp->vcp_nmemranges = 3;
+               return;
+       }
+
+       /*
+        * Calculate how to split any remaining memory across the 4GB
+        * boundary while making sure we do not place physical memory into
+        * MMIO ranges.
+        */
+       if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - _1_MB) {
+               above_1m = VMM_PCI_MMIO_BAR_BASE - _1_MB;
+               above_4g = mem_bytes - above_1m;
+       } else {
+               above_1m = mem_bytes;
+               above_4g = 0;
+       }
+
+       /* Third memory region: area above 1MB to MMIO region */
+       vcp->vcp_memranges[2].vmr_gpa = _1_MB;
+       vcp->vcp_memranges[2].vmr_size = above_1m;
+
+       /* Fourth region: 2nd copy of BIOS above MMIO ending at 4GB */
+       vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
+       vcp->vcp_memranges[3].vmr_size = _2_MB;
+
+       /* Fifth region: any remainder above 4GB */
+       if (above_4g > 0) {
+               vcp->vcp_memranges[4].vmr_gpa = _4_GB;
+               vcp->vcp_memranges[4].vmr_size = above_4g;
+               vcp->vcp_nmemranges = 5;
+       } else
+               vcp->vcp_nmemranges = 4;
 }

 /*
@@ -1015,17 +1048,19 @@ init_emulated_hw(struct vmop_create_params *vmc, int c
     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vm_create_params *vcp = &vmc->vmc_params;
-       int i;
+       size_t i;
        uint64_t memlo, memhi;

        /* Calculate memory size for NVRAM registers */
        memlo = memhi = 0;
-       if (vcp->vcp_nmemranges > 2)
-               memlo = vcp->vcp_memranges[2].vmr_size - 15 * 0x100000;
+       for (i = 0; i < vcp->vcp_nmemranges; i++) {
+               if (vcp->vcp_memranges[i].vmr_gpa == _1_MB &&
+                   vcp->vcp_memranges[i].vmr_size > (15 * _1_MB))
+                       memlo = vcp->vcp_memranges[i].vmr_size - (15 * _1_MB);
+               else if (vcp->vcp_memranges[i].vmr_gpa == _4_GB)
+                       memhi = vcp->vcp_memranges[i].vmr_size;
+       }

-       if (vcp->vcp_nmemranges > 3)
-               memhi = vcp->vcp_memranges[3].vmr_size;
-
        /* Reset the IO port map */
        memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

Reply via email to