On Thu, Sep 08, 2022 at 10:17:12AM -0500, Scott Cheloha wrote: > > On Sep 8, 2022, at 9:05 AM, Mike Larkin <mlar...@nested.page> wrote: > > [...] > > > > You could compile this and then objdump -D it and see for yourself... > > I can't make heads or tails of it. Please explain what I am looking > at and why it is, or is not, atomic.
Or I guess I can just wing it. Okay, so this C code: volatile uint64_t pvclock_lastcount; /* [...] */ uint64_t pvclock_get_timecount(struct timecounter *tc) { uint64_t ctr, last; /* [...] */ do { last = pvclock_lastcount; if (ctr < last) return (last); } while (atomic_cas_64(&pvclock_lastcount, last, ctr) != last); return (ctr); } ... yields this amd64 disassembly (from the linked bsd binary): ffffffff81de242b: 75 2b jne ffffffff81de2458 <pvclock_get_timecount+0xb8> ffffffff81de242d: eb 01 jmp ffffffff81de2430 <pvclock_get_timecount+0x90> ffffffff81de242f: cc int3 ffffffff81de2430: 48 8b 0d 91 fc 6c 00 mov 7142545(%rip),%rcx # ffffffff824b20c8 <pvclock_lastcount> ffffffff81de2437: 48 87 d1 xchg %rdx,%rcx ffffffff81de243a: 48 39 d1 cmp %rdx,%rcx ffffffff81de243d: 48 87 d1 xchg %rdx,%rcx ffffffff81de2440: 72 13 jb ffffffff81de2455 <pvclock_get_timecount+0xb5> ffffffff81de2442: 48 89 c8 mov %rcx,%rax ffffffff81de2445: f0 48 0f b1 15 7a fc lock cmpxchg %rdx,7142522(%rip) # ffffffff824b20c8 <pvclock_lastcount> ffffffff81de244c: 6c 00 ffffffff81de244e: 48 39 c8 cmp %rcx,%rax ffffffff81de2451: 75 dd jne ffffffff81de2430 <pvclock_get_timecount+0x90> ffffffff81de2453: eb 03 jmp ffffffff81de2458 <pvclock_get_timecount+0xb8> ffffffff81de2455: 48 8b d1 mov %rcx,%rdx ffffffff81de2458: 89 d0 mov %edx,%eax ffffffff81de245a: 48 83 c4 08 add $0x8,%rsp ffffffff81de245e: 41 5e pop %r14 ffffffff81de2460: c9 leaveq ffffffff81de2461: c3 retq ... 
and also yields this i386 disassembly (from the pvclock.o object): 2c7: 75 2e jne 2f7 <pvclock_get_timecount+0xf7> 2c9: eb 05 jmp 2d0 <pvclock_get_timecount+0xd0> 2cb: cc int3 2cc: cc int3 2cd: cc int3 2ce: cc int3 2cf: cc int3 2d0: 8b 15 04 00 00 00 mov 0x4,%edx 2d6: a1 00 00 00 00 mov 0x0,%eax 2db: 87 d8 xchg %ebx,%eax 2dd: 39 d8 cmp %ebx,%eax 2df: 87 d8 xchg %ebx,%eax 2e1: 89 f1 mov %esi,%ecx 2e3: 19 d1 sbb %edx,%ecx 2e5: 72 0e jb 2f5 <pvclock_get_timecount+0xf5> 2e7: 89 f1 mov %esi,%ecx 2e9: f0 0f c7 0d 00 00 00 lock cmpxchg8b 0x0 2f0: 00 2f1: 75 dd jne 2d0 <pvclock_get_timecount+0xd0> 2f3: eb 02 jmp 2f7 <pvclock_get_timecount+0xf7> 2f5: 8b d8 mov %eax,%ebx 2f7: 89 d8 mov %ebx,%eax 2f9: 83 c4 1c add $0x1c,%esp 2fc: 5e pop %esi 2fd: 5f pop %edi 2fe: 5b pop %ebx 2ff: 5d pop %ebp 300: c3 ret If we isolate the pvclock_lastcount loads, on amd64 we have: ffffffff81de2430: 48 8b 0d 91 fc 6c 00 mov 7142545(%rip),%rcx # ffffffff824b20c8 <pvclock_lastcount> and on i386 we have: 2d0: 8b 15 04 00 00 00 mov 0x4,%edx 2d6: a1 00 00 00 00 mov 0x0,%eax so the 8-byte load is atomic on amd64 (one load) and non-atomic on i386 (two loads). I don't know what jsg@ meant when he said the ifdefs "seemed unnecessary", but near as I can tell they are necessary. I need an atomic 8-byte load and i386 can't, or won't, do it. So I guess we go back to my original patch. This resolves kettenis@'s atomic_cas_64() objections because we no longer need it. So, once again, the patch in brief: - Add rdtsc_lfence() to i386/include/cpufunc.h - Make pvclock_lastcount volatile uint64_t to fix the non-PVCLOCK_FLAG_TSC_STABLE case (see sub.). - Check for SSE2 support in pvclock_match(), we need it for LFENCE in pvclock_get_timecount(). - Do RDTSC as soon as possible in the lockless read loop to get a better timestamp. - Use rdtsc_lfence() instead of rdtsc() to get a better timestamp. - Check whether our TSC lags ti->ti_tsc_timestamp so we don't produce a bogus delta. 
- Fix the non-PVCLOCK_FLAG_TSC_STABLE case: + On amd64 we can do this with an atomic_cas_ulong(9) loop. We need to cast the pointer to (unsigned long *) or the compiler complains. This is safe because sizeof(long) equals sizeof(uint64_t) on amd64. + On i386 we need a mutex, so add pvclock_mtx. Set it to IPL_CLOCK because this is a timecounter driver. -- How are we looking? Index: arch/i386/include/cpufunc.h =================================================================== RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v retrieving revision 1.33 diff -u -p -r1.33 cpufunc.h --- arch/i386/include/cpufunc.h 13 Sep 2020 11:53:16 -0000 1.33 +++ arch/i386/include/cpufunc.h 9 Sep 2022 15:05:26 -0000 @@ -229,6 +229,15 @@ rdtsc(void) return (tsc); } +static inline uint64_t +rdtsc_lfence(void) +{ + uint64_t tsc; + + __asm volatile("lfence; rdtsc" : "=A" (tsc)); + return tsc; +} + static __inline void wrmsr(u_int msr, u_int64_t newval) { Index: dev/pv/pvclock.c =================================================================== RCS file: /cvs/src/sys/dev/pv/pvclock.c,v retrieving revision 1.8 diff -u -p -r1.8 pvclock.c --- dev/pv/pvclock.c 5 Nov 2021 11:38:29 -0000 1.8 +++ dev/pv/pvclock.c 9 Sep 2022 15:05:26 -0000 @@ -21,21 +21,28 @@ #endif #include <sys/param.h> +#include <sys/stdint.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/timetc.h> #include <sys/timeout.h> #include <sys/malloc.h> #include <sys/atomic.h> +#include <sys/mutex.h> #include <machine/cpu.h> #include <machine/atomic.h> +#include <machine/specialreg.h> + #include <uvm/uvm_extern.h> #include <dev/pv/pvvar.h> #include <dev/pv/pvreg.h> -uint pvclock_lastcount; +#if defined(__i386__) +struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_CLOCK); +#endif +volatile uint64_t pvclock_lastcount; struct pvclock_softc { struct device sc_dev; @@ -116,6 +123,12 @@ pvclock_match(struct device *parent, voi (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) == 0) return (0); + /* + * LFENCE is also required, so 
check for SSE2 support. + */ + if (!ISSET(curcpu()->ci_feature_flags, CPUID_SSE2)) + return (0); + return (1); } @@ -213,6 +226,7 @@ pvclock_get_timecount(struct timecounter struct pvclock_softc *sc = tc->tc_priv; struct pvclock_time_info *ti; uint64_t tsc_timestamp, system_time, delta, ctr; + uint64_t last, tsc; uint32_t version, mul_frac; int8_t shift; uint8_t flags; @@ -220,6 +234,7 @@ pvclock_get_timecount(struct timecounter ti = sc->sc_time; do { version = pvclock_read_begin(ti); + tsc = rdtsc_lfence(); system_time = ti->ti_system_time; tsc_timestamp = ti->ti_tsc_timestamp; mul_frac = ti->ti_tsc_to_system_mul; @@ -231,7 +246,10 @@ pvclock_get_timecount(struct timecounter * The algorithm is described in * linux/Documentation/virtual/kvm/msr.txt */ - delta = rdtsc() - tsc_timestamp; + if (tsc > tsc_timestamp) + delta = tsc - tsc_timestamp; + else + delta = 0; if (shift < 0) delta >>= -shift; else @@ -241,10 +259,22 @@ pvclock_get_timecount(struct timecounter if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0) return (ctr); - if (ctr < pvclock_lastcount) - return (pvclock_lastcount); - - atomic_swap_uint(&pvclock_lastcount, ctr); +#if defined(__amd64__) + do { + last = pvclock_lastcount; + if (ctr < last) + return (last); + } while (atomic_cas_ulong((unsigned long *)&pvclock_lastcount, last, + ctr) != last); +#elif defined (__i386__) + mtx_enter(&pvclock_mtx); + last = pvclock_lastcount; + if (ctr < last) + ctr = last; + else + pvclock_lastcount = ctr; + mtx_leave(&pvclock_mtx); +#endif return (ctr); }