On Thu, Sep 08, 2022 at 10:17:12AM -0500, Scott Cheloha wrote:
> > On Sep 8, 2022, at 9:05 AM, Mike Larkin <mlar...@nested.page> wrote:
> > [...]
> > 
> > You could compile this and then objdump -D it and see for yourself...
> 
> I can't make heads or tails of it.  Please explain what I am looking
> at and why it is, or is not, atomic.

Or I guess I can just wing it.

Okay, so this C code:

volatile uint64_t pvclock_lastcount;

/* [...] */

void
pvclock_get_timecount(struct timecounter *tc)
{
        uint64_t ctr, last;

        /* [...] */

        do {
                last = pvclock_lastcount;
                if (ctr < last)
                        return (last);
        } while (atomic_cas_64(&pvclock_lastcount, last, ctr) != last);

        return (ctr);
}

... yields this amd64 disassembly (from the linked bsd binary):

ffffffff81de242b:       75 2b                   jne    ffffffff81de2458 
<pvclock_get_timecount+0xb8>
ffffffff81de242d:       eb 01                   jmp    ffffffff81de2430 
<pvclock_get_timecount+0x90>
ffffffff81de242f:       cc                      int3   
ffffffff81de2430:       48 8b 0d 91 fc 6c 00    mov    7142545(%rip),%rcx       
 # ffffffff824b20c8 <pvclock_lastcount>
ffffffff81de2437:       48 87 d1                xchg   %rdx,%rcx
ffffffff81de243a:       48 39 d1                cmp    %rdx,%rcx
ffffffff81de243d:       48 87 d1                xchg   %rdx,%rcx
ffffffff81de2440:       72 13                   jb     ffffffff81de2455 
<pvclock_get_timecount+0xb5>
ffffffff81de2442:       48 89 c8                mov    %rcx,%rax
ffffffff81de2445:       f0 48 0f b1 15 7a fc    lock cmpxchg %rdx,7142522(%rip) 
       # ffffffff824b20c8 <pvclock_lastcount>
ffffffff81de244c:       6c 00 
ffffffff81de244e:       48 39 c8                cmp    %rcx,%rax
ffffffff81de2451:       75 dd                   jne    ffffffff81de2430 
<pvclock_get_timecount+0x90>
ffffffff81de2453:       eb 03                   jmp    ffffffff81de2458 
<pvclock_get_timecount+0xb8>
ffffffff81de2455:       48 8b d1                mov    %rcx,%rdx
ffffffff81de2458:       89 d0                   mov    %edx,%eax
ffffffff81de245a:       48 83 c4 08             add    $0x8,%rsp
ffffffff81de245e:       41 5e                   pop    %r14
ffffffff81de2460:       c9                      leaveq 
ffffffff81de2461:       c3                      retq

... and also yields this i386 disassembly (from the pvclock.o object):

 2c7:   75 2e                   jne    2f7 <pvclock_get_timecount+0xf7>
 2c9:   eb 05                   jmp    2d0 <pvclock_get_timecount+0xd0>
 2cb:   cc                      int3   
 2cc:   cc                      int3   
 2cd:   cc                      int3   
 2ce:   cc                      int3   
 2cf:   cc                      int3   
 2d0:   8b 15 04 00 00 00       mov    0x4,%edx
 2d6:   a1 00 00 00 00          mov    0x0,%eax
 2db:   87 d8                   xchg   %ebx,%eax
 2dd:   39 d8                   cmp    %ebx,%eax
 2df:   87 d8                   xchg   %ebx,%eax
 2e1:   89 f1                   mov    %esi,%ecx
 2e3:   19 d1                   sbb    %edx,%ecx
 2e5:   72 0e                   jb     2f5 <pvclock_get_timecount+0xf5>
 2e7:   89 f1                   mov    %esi,%ecx
 2e9:   f0 0f c7 0d 00 00 00    lock cmpxchg8b 0x0
 2f0:   00 
 2f1:   75 dd                   jne    2d0 <pvclock_get_timecount+0xd0>
 2f3:   eb 02                   jmp    2f7 <pvclock_get_timecount+0xf7>
 2f5:   8b d8                   mov    %eax,%ebx
 2f7:   89 d8                   mov    %ebx,%eax
 2f9:   83 c4 1c                add    $0x1c,%esp
 2fc:   5e                      pop    %esi
 2fd:   5f                      pop    %edi
 2fe:   5b                      pop    %ebx
 2ff:   5d                      pop    %ebp
 300:   c3                      ret    

If we isolate the pvclock_lastcount loads, on amd64 we have:

ffffffff81de2430:       48 8b 0d 91 fc 6c 00    mov    7142545(%rip),%rcx       
 # ffffffff824b20c8 <pvclock_lastcount>

and on i386 we have:

 2d0:   8b 15 04 00 00 00       mov    0x4,%edx
 2d6:   a1 00 00 00 00          mov    0x0,%eax

so the 8-byte load is atomic on amd64 (one load) and non-atomic on
i386 (two loads).

I don't know what jsg@ meant when he said the ifdefs "seemed
unnecessary", but near as I can tell they are necessary.  I need an
atomic 8-byte load and i386 can't, or won't, do it.

So I guess we go back to my original patch.

This resolves kettenis@'s atomic_cas_64() objections because we no
longer need it.

So, once again, the patch in brief:

- Add rdtsc_lfence() to i386/include/cpufunc.h

- Make pvclock_lastcount volatile uint64_t to fix the
  non-PVCLOCK_FLAG_TSC_STABLE case (see sub.).

- Check for SSE2 support in pvclock_match(), we need it for LFENCE
  in pvclock_get_timecount().

- Do RDTSC as soon as possible in the lockless read loop to get
  a better timestamp.

- Use rdtsc_lfence() instead of rdtsc() to get a better timestamp.

- Check whether our TSC lags ti->ti_tsc_timestamp so we don't
  produce a bogus delta.

- Fix the non-PVCLOCK_FLAG_TSC_STABLE case:

  + On amd64 we can do this with an atomic_cas_ulong(9) loop.  We need
    to cast the pointer to (unsigned long *) or the compiler complains.
    This is safe because sizeof(long) equals sizeof(uint64_t) on amd64.

  + On i386 we need a mutex, so add pvclock_mtx.  Set it to IPL_CLOCK
    because this is a timecounter driver.

--

How are we looking?

Index: arch/i386/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
retrieving revision 1.33
diff -u -p -r1.33 cpufunc.h
--- arch/i386/include/cpufunc.h 13 Sep 2020 11:53:16 -0000      1.33
+++ arch/i386/include/cpufunc.h 9 Sep 2022 15:05:26 -0000
@@ -229,6 +229,15 @@ rdtsc(void)
        return (tsc);
 }
 
+static inline uint64_t
+rdtsc_lfence(void)
+{
+       uint64_t tsc;
+
+       __asm volatile("lfence; rdtsc" : "=A" (tsc));
+       return tsc;
+}
+
 static __inline void
 wrmsr(u_int msr, u_int64_t newval)
 {
Index: dev/pv/pvclock.c
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvclock.c,v
retrieving revision 1.8
diff -u -p -r1.8 pvclock.c
--- dev/pv/pvclock.c    5 Nov 2021 11:38:29 -0000       1.8
+++ dev/pv/pvclock.c    9 Sep 2022 15:05:26 -0000
@@ -21,21 +21,28 @@
 #endif
 
 #include <sys/param.h>
+#include <sys/stdint.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/timetc.h>
 #include <sys/timeout.h>
 #include <sys/malloc.h>
 #include <sys/atomic.h>
+#include <sys/mutex.h>
 
 #include <machine/cpu.h>
 #include <machine/atomic.h>
+#include <machine/specialreg.h>
+
 #include <uvm/uvm_extern.h>
 
 #include <dev/pv/pvvar.h>
 #include <dev/pv/pvreg.h>
 
-uint pvclock_lastcount;
+#if defined(__i386__)
+struct mutex pvclock_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
+#endif
+volatile uint64_t pvclock_lastcount;
 
 struct pvclock_softc {
        struct device            sc_dev;
@@ -116,6 +123,12 @@ pvclock_match(struct device *parent, voi
                    (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) == 0)
                        return (0);
 
+               /*
+                * LFENCE is also required, so check for SSE2 support.
+                */
+               if (!ISSET(curcpu()->ci_feature_flags, CPUID_SSE2))
+                       return (0);
+
                return (1);
        }
 
@@ -213,6 +226,7 @@ pvclock_get_timecount(struct timecounter
        struct pvclock_softc            *sc = tc->tc_priv;
        struct pvclock_time_info        *ti;
        uint64_t                         tsc_timestamp, system_time, delta, ctr;
+       uint64_t                         last, tsc;
        uint32_t                         version, mul_frac;
        int8_t                           shift;
        uint8_t                          flags;
@@ -220,6 +234,7 @@ pvclock_get_timecount(struct timecounter
        ti = sc->sc_time;
        do {
                version = pvclock_read_begin(ti);
+               tsc = rdtsc_lfence();
                system_time = ti->ti_system_time;
                tsc_timestamp = ti->ti_tsc_timestamp;
                mul_frac = ti->ti_tsc_to_system_mul;
@@ -231,7 +246,10 @@ pvclock_get_timecount(struct timecounter
         * The algorithm is described in
         * linux/Documentation/virtual/kvm/msr.txt
         */
-       delta = rdtsc() - tsc_timestamp;
+       if (tsc > tsc_timestamp)
+               delta = tsc - tsc_timestamp;
+       else
+               delta = 0;
        if (shift < 0)
                delta >>= -shift;
        else
@@ -241,10 +259,22 @@ pvclock_get_timecount(struct timecounter
        if ((flags & PVCLOCK_FLAG_TSC_STABLE) != 0)
                return (ctr);
 
-       if (ctr < pvclock_lastcount)
-               return (pvclock_lastcount);
-
-       atomic_swap_uint(&pvclock_lastcount, ctr);
+#if defined(__amd64__)
+       do {
+               last = pvclock_lastcount;
+               if (ctr < last)
+                       return (last);
+       } while (atomic_cas_ulong((unsigned long *)&pvclock_lastcount, last,
+           ctr) != last);
+#elif defined (__i386__)
+       mtx_enter(&pvclock_mtx);
+       last = pvclock_lastcount;
+       if (ctr < last)
+               ctr = last;
+       else
+               pvclock_lastcount = ctr;
+       mtx_leave(&pvclock_mtx);
+#endif
 
        return (ctr);
 }

Reply via email to