dv@ suggested I come to the list to request testing for the pvclock(4) driver. Attached is a patch that corrects several bugs. Most of these changes only matter on a multiprocessor VM in the non-TSC_STABLE case, i.e. when the hypervisor does not set PVCLOCK_FLAG_TSC_STABLE.
Ideally, nothing should break.

- pvclock yields a 64-bit value. The BSD timecounter layer can only use the lower 32 bits, but internally we need to track the full 64-bit value so that the comparisons in the non-TSC_STABLE case are made against the full value. So make pvclock_lastcount a 64-bit quantity.
- In pvclock_get_timecount(), move rdtsc() up into the lockless read loop to get a more accurate timestamp.
- In pvclock_get_timecount(), use rdtsc_lfence(), not rdtsc().
- In pvclock_get_timecount(), check that our TSC value doesn't predate ti->ti_tsc_timestamp; otherwise the subtraction wraps around and we produce an enormous delta.
- In pvclock_get_timecount(), update pvclock_lastcount in the non-TSC_STABLE case with more care. On amd64 we can do this with an atomic_cas_ulong(9) loop because u_long is 64 bits. On i386 we need to introduce a mutex to protect our comparison and read/write.

Index: pvclock.c =================================================================== RCS file: /cvs/src/sys/dev/pv/pvclock.c,v retrieving revision 1.8 diff -u -p -r1.8 pvclock.c --- pvclock.c 5 Nov 2021 11:38:29 -0000 1.8 +++ pvclock.c 2 Sep 2022 22:54:08 -0000 @@ -27,6 +27,10 @@ #include <sys/timeout.h> #include <sys/malloc.h> #include <sys/atomic.h> +#include <sys/stdint.h> +#if defined(__i386__) +#include <sys/mutex.h> +#endif #include <machine/cpu.h> #include <machine/atomic.h> @@ -35,7 +39,12 @@ #include <dev/pv/pvvar.h> #include <dev/pv/pvreg.h> -uint pvclock_lastcount; +#if defined(__amd64__) +volatile u_long pvclock_lastcount; +#elif defined(__i386__) +struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_HIGH); +uint64_t pvclock_lastcount; +#endif struct pvclock_softc { struct device sc_dev; @@ -212,7 +221,7 @@ pvclock_get_timecount(struct timecounter { struct pvclock_softc *sc = tc->tc_priv; struct pvclock_time_info *ti; - uint64_t tsc_timestamp, system_time, delta, ctr; + uint64_t system_time, delta, ctr, tsc; uint32_t version, mul_frac; int8_t shift; uint8_t flags; @@ -220,8 +229,12 @@ pvclock_get_timecount(struct 
timecounter ti = sc->sc_time; do { version = pvclock_read_begin(ti); + tsc = rdtsc_lfence(); + if (ti->ti_tsc_timestamp < tsc) + delta = tsc - ti->ti_tsc_timestamp; + else + delta = 0; system_time = ti->ti_system_time; - tsc_timestamp = ti->ti_tsc_timestamp; mul_frac = ti->ti_tsc_to_system_mul; shift = ti->ti_tsc_shift; flags = ti->ti_flags; @@ -231,7 +244,6 @@ pvclock_get_timecount(struct timecounter * The algorithm is described in * linux/Documentation/virtual/kvm/msr.txt */ - delta = rdtsc() - tsc_timestamp; if (shift < 0) delta >>= -shift; else @@ -241,10 +253,20 @@ pvclock_get_timecount(struct timecounter if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0) return (ctr); - if (ctr < pvclock_lastcount) - return (pvclock_lastcount); - - atomic_swap_uint(&pvclock_lastcount, ctr); - +#if defined(__amd64__) + u_long last; + do { + last = pvclock_lastcount; + if (ctr < last) + return last; + } while (atomic_cas_ulong(&pvclock_lastcount, last, ctr) != last); +#elif defined(__i386__) + mtx_enter(&pvclock_mtx); + if (pvclock_lastcount < ctr) + pvclock_lastcount = ctr; + else + ctr = pvclock_lastcount; + mtx_leave(&pvclock_mtx); +#endif return (ctr); }