Hello Scott,

With the patch, my machine on ESXi doesn't show any extra message.
*Without* the patch, the machine shows

% grep 'TSC.*skew' dmesg.current-tsc-debug
cpu1: disabling user TSC (skew=-2603)
cpu2: disabling user TSC (skew=-2959)
cpu3: disabling user TSC (skew=-3784)
%

and monotonic time goes backward (the test in
https://marc.info/?l=openbsd-tech&m=161699406119704&w=2 failed; a rough
sketch of that kind of check is at the end of this mail).

dmesg with the patch:

OpenBSD 7.1-current (GENERIC.MP) #24: Thu Jul 7 10:09:32 JST 2022
    yasu...@yasuoka-ob-c.tokyo.iiji.jp:/source/yasuoka/head/git/src/sys/arch/amd64/compile/GENERIC.MP
real mem = 2113290240 (2015MB)
avail mem = 2031927296 (1937MB)
random: good seed from bootblocks
mpath0 at root
scsibus0 at mpath0: 256 targets
mainbus0 at root
bios0 at mainbus0: SMBIOS rev. 2.7 @ 0xff7401f (161 entries)
bios0: vendor VMware, Inc. version "VMW71.00V.16460286.B64.2006250725" date 06/25/2020
bios0: VMware, Inc. VMware7,1
acpi0 at bios0: ACPI 4.0
acpi0: sleep states S0 S1 S4 S5
acpi0: tables DSDT SRAT FACP APIC MCFG HPET WAET WSMT
acpi0: wakeup devices PCI0(S3) USB_(S1) P2P0(S3) S1F0(S3) S2F0(S3) S8F0(S3) S16F(S3) S17F(S3) S18F(S3) S22F(S3) S23F(S3) S24F(S3) S25F(S3) PE40(S3) S1F0(S3) PE41(S3) [...]
acpitimer0 at acpi0: 3579545 Hz, 24 bits
acpimadt0 at acpi0 addr 0xfee00000: PC-AT compat
cpu0 at mainbus0: apid 0 (boot processor)
cpu0: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz, 2393.81 MHz, 06-55-07
cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SS,HTT,SSE3,PCLMUL,VMX,SSSE3,FMA3,CX16,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,AVX512F,AVX512DQ,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,AVX512CD,AVX512BW,AVX512VL,PKU,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu0: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 1MB 64b/line 16-way L2 cache, 13MB 64b/line 11-way L3 cache
cpu0: smt 0, core 0, package 0
mtrr: Pentium Pro MTRR support, 8 var ranges, 88 fixed ranges
cpu0: apic clock running at 65MHz
cpu1 at mainbus0: apid 1 (application processor)
cpu1: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz, 2393.61 MHz, 06-55-07
cpu1: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SS,HTT,SSE3,PCLMUL,VMX,SSSE3,FMA3,CX16,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,AVX512F,AVX512DQ,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,AVX512CD,AVX512BW,AVX512VL,PKU,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu1: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 1MB 64b/line 16-way L2 cache, 13MB 64b/line 11-way L3 cache
cpu1: smt 0, core 1, package 0
cpu2 at mainbus0: apid 2 (application processor)
cpu2: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz, 2393.62 MHz, 06-55-07
cpu2: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SS,HTT,SSE3,PCLMUL,VMX,SSSE3,FMA3,CX16,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,AVX512F,AVX512DQ,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,AVX512CD,AVX512BW,AVX512VL,PKU,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu2: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 1MB 64b/line 16-way L2 cache, 13MB 64b/line 11-way L3 cache
cpu2: smt 0, core 2, package 0
cpu3 at mainbus0: apid 3 (application processor)
cpu3: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz, 2393.62 MHz, 06-55-07
cpu3: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,MMX,FXSR,SSE,SSE2,SS,HTT,SSE3,PCLMUL,VMX,SSSE3,FMA3,CX16,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,HV,NXE,PAGE1GB,RDTSCP,LONG,LAHF,ABM,3DNOWP,PERF,ITSC,FSGSBASE,TSC_ADJUST,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,AVX512F,AVX512DQ,RDSEED,ADX,SMAP,CLFLUSHOPT,CLWB,AVX512CD,AVX512BW,AVX512VL,PKU,MD_CLEAR,IBRS,IBPB,STIBP,L1DF,SSBD,ARAT,XSAVEOPT,XSAVEC,XGETBV1,XSAVES
cpu3: 32KB 64b/line 8-way D-cache, 32KB 64b/line 8-way I-cache, 1MB 64b/line 16-way L2 cache, 13MB 64b/line 11-way L3 cache
cpu3: smt 0, core 3, package 0
ioapic0 at mainbus0: apid 4 pa 0xfec00000, version 20, 24 pins, remapped
acpimcfg0 at acpi0
acpimcfg0: addr 0xe0000000, bus 0-127
acpihpet0 at acpi0: 14318179 Hz
acpiprt0 at acpi0: bus 0 (PCI0)
acpipci0 at acpi0 PCI0: 0x00000000 0x00000011 0x00000001
acpicmos0 at acpi0
"PNP0A05" at acpi0 not configured
com0 at acpi0 COMA addr 0x3f8/0x8 irq 4: ns16550a, 16 byte fifo
com0: console
acpiac0 at acpi0: AC unit online
acpicpu0 at acpi0: C1(@1 halt!)
acpicpu1 at acpi0: C1(@1 halt!)
acpicpu2 at acpi0: C1(@1 halt!)
acpicpu3 at acpi0: C1(@1 halt!)
pvbus0 at mainbus0: VMware
vmt0 at pvbus0
pci0 at mainbus0 bus 0
0:16:0: rom address conflict 0xffffc000/0x4000
pchb0 at pci0 dev 0 function 0 "Intel 82443BX AGP" rev 0x01
ppb0 at pci0 dev 1 function 0 "Intel 82443BX AGP" rev 0x01
pci1 at ppb0 bus 1
pcib0 at pci0 dev 7 function 0 "Intel 82371AB PIIX4 ISA" rev 0x08
pciide0 at pci0 dev 7 function 1 "Intel 82371AB IDE" rev 0x01: DMA, channel 0 configured to compatibility, channel 1 configured to compatibility
pciide0: channel 0 disabled (no drives)
atapiscsi0 at pciide0 channel 1 drive 0
scsibus1 at atapiscsi0: 2 targets
cd0 at scsibus1 targ 0 lun 0: <NECVMWar, VMware IDE CDR10, 1.00> removable
cd0(pciide0:1:0): using PIO mode 4, Ultra-DMA mode 2
piixpm0 at pci0 dev 7 function 3 "Intel 82371AB Power" rev 0x08: SMBus disabled
"VMware VMCI" rev 0x10 at pci0 dev 7 function 7 not configured
"VMware SVGA II" rev 0x00 at pci0 dev 15 function 0 not configured
mpi0 at pci0 dev 16 function 0 "Symbios Logic 53c1030" rev 0x01: apic 4 int 17
mpi0: 0, firmware 1.3.41.32
scsibus2 at mpi0: 16 targets, initiator 7
sd0 at scsibus2 targ 0 lun 0: <VMware, Virtual disk, 2.0>
sd0: 16384MB, 512 bytes/sector, 33554432 sectors
sd1 at scsibus2 targ 1 lun 0: <VMware, Virtual disk, 2.0>
sd1: 81920MB, 512 bytes/sector, 167772160 sectors
mpi0: target 0 Sync at 160MHz width 16bit offset 127 QAS 1 DT 1 IU 1
mpi0: target 1 Sync at 160MHz width 16bit offset 127 QAS 1 DT 1 IU 1
ppb1 at pci0 dev 17 function 0 "VMware PCI" rev 0x02
pci2 at ppb1 bus 2
ppb2 at pci0 dev 21 function 0 "VMware PCIE" rev 0x01: msi
pci3 at ppb2 bus 3
vmx0 at pci3 dev 0 function 0 "VMware VMXNET3" rev 0x01: msix, 4 queues, address 00:0c:29:24:3d:0a
ppb3 at pci0 dev 21 function 1 "VMware PCIE" rev 0x01: msi
pci4 at ppb3 bus 4
ppb4 at pci0 dev 21 function 2 "VMware PCIE" rev 0x01: msi
pci5 at ppb4 bus 5
ppb5 at pci0 dev 21 function 3 "VMware PCIE" rev 0x01: msi
pci6 at ppb5 bus 6
ppb6 at pci0 dev 21 function 4 "VMware PCIE" rev 0x01: msi
pci7 at ppb6 bus 7
ppb7 at pci0 dev 21 function 5 "VMware PCIE" rev 0x01: msi
pci8 at ppb7 bus 8
ppb8 at pci0 dev 21 function 6 "VMware PCIE" rev 0x01: msi
pci9 at ppb8 bus 9
ppb9 at pci0 dev 21 function 7 "VMware PCIE" rev 0x01: msi
pci10 at ppb9 bus 10
ppb10 at pci0 dev 22 function 0 "VMware PCIE" rev 0x01: msi
pci11 at ppb10 bus 11
ppb11 at pci0 dev 22 function 1 "VMware PCIE" rev 0x01: msi
pci12 at ppb11 bus 12
ppb12 at pci0 dev 22 function 2 "VMware PCIE" rev 0x01: msi
pci13 at ppb12 bus 13
ppb13 at pci0 dev 22 function 3 "VMware PCIE" rev 0x01: msi
pci14 at ppb13 bus 14
ppb14 at pci0 dev 22 function 4 "VMware PCIE" rev 0x01: msi
pci15 at ppb14 bus 15
ppb15 at pci0 dev 22 function 5 "VMware PCIE" rev 0x01: msi
pci16 at ppb15 bus 16
ppb16 at pci0 dev 22 function 6 "VMware PCIE" rev 0x01: msi
pci17 at ppb16 bus 17
ppb17 at pci0 dev 22 function 7 "VMware PCIE" rev 0x01: msi
pci18 at ppb17 bus 18
ppb18 at pci0 dev 23 function 0 "VMware PCIE" rev 0x01: msi
pci19 at ppb18 bus 19
ppb19 at pci0 dev 23 function 1 "VMware PCIE" rev 0x01: msi
pci20 at ppb19 bus 20
ppb20 at pci0 dev 23 function 2 "VMware PCIE" rev 0x01: msi
pci21 at ppb20 bus 21
ppb21 at pci0 dev 23 function 3 "VMware PCIE" rev 0x01: msi
pci22 at ppb21 bus 22
ppb22 at pci0 dev 23 function 4 "VMware PCIE" rev 0x01: msi
pci23 at ppb22 bus 23
ppb23 at pci0 dev 23 function 5 "VMware PCIE" rev 0x01: msi
pci24 at ppb23 bus 24
ppb24 at pci0 dev 23 function 6 "VMware PCIE" rev 0x01: msi
pci25 at ppb24 bus 25
ppb25 at pci0 dev 23 function 7 "VMware PCIE" rev 0x01: msi
pci26 at ppb25 bus 26
ppb26 at pci0 dev 24 function 0 "VMware PCIE" rev 0x01: msi
pci27 at ppb26 bus 27
ppb27 at pci0 dev 24 function 1 "VMware PCIE" rev 0x01: msi
pci28 at ppb27 bus 28
ppb28 at pci0 dev 24 function 2 "VMware PCIE" rev 0x01: msi
pci29 at ppb28 bus 29
ppb29 at pci0 dev 24 function 3 "VMware PCIE" rev 0x01: msi
pci30 at ppb29 bus 30
ppb30 at pci0 dev 24 function 4 "VMware PCIE" rev 0x01: msi
pci31 at ppb30 bus 31
ppb31 at pci0 dev 24 function 5 "VMware PCIE" rev 0x01: msi
pci32 at ppb31 bus 32
ppb32 at pci0 dev 24 function 6 "VMware PCIE" rev 0x01: msi
pci33 at ppb32 bus 33
ppb33 at pci0 dev 24 function 7 "VMware PCIE" rev 0x01: msi
pci34 at ppb33 bus 34
isa0 at pcib0
isadma0 at isa0
pckbc0 at isa0 port 0x60/5 irq 1 irq 12
pckbd0 at pckbc0 (kbd slot)
wskbd0 at pckbd0 mux 1
pms0 at pckbc0 (aux slot)
wsmouse0 at pms0 mux 0
pcppi0 at isa0 port 0x61
spkr0 at pcppi0
vmm0 at mainbus0: VMX/EPT
efifb0 at mainbus0: 1152x864, 32bpp
wsdisplay0 at efifb0 mux 1
wskbd0: connecting to wsdisplay0
wsdisplay0: screen 0-5 added (std, vt100 emulation)
vscsi0 at root
scsibus3 at vscsi0: 256 targets
softraid0 at root
scsibus4 at softraid0: 256 targets
root on sd0a (a5bd0d220920df21.a) swap on sd0b dump on sd0b

On Mon, 4 Jul 2022 21:06:55 -0500 Scott Cheloha <scottchel...@gmail.com> wrote:
> Hi,
> 
> Once again, I am trying to change our approach to TSC sync testing to
> eliminate false positive results. Instead of trying to repair the TSC
> by measuring skew, we just spin in a lockless loop looking for skew
> and mark the TSC as broken if we detect any.
> 
> This is motivated in part by some multisocket machines that do not use
> the TSC as a timecounter because the current sync test confuses NUMA
> latency for TSC skew.
> 
> The only difference between this version and the prior version (v2) is
> that we check whether we have the IA32_TSC_ADJUST register by hand in
> tsc_reset_adjust(). If someone wants to help me rearrange cpu_hatch()
> so we do CPU identification before TSC sync testing we can remove the
> workaround later.
> 
> If you have the IA32_TSC_ADJUST register and it is non-zero going into
> the test, you will see something on the console like this:
> 
> tsc: cpu5: IA32_TSC_ADJUST: -150 -> 0
> 
> This does *not* mean you failed the test.
It just means you probably > have a bug in your BIOS (or some other firmware) and you should report > it to your vendor. > > If you fail the test you will see something like this: > > tsc: cpu0/cpu2: sync test round 1/2 failed > tsc: cpu0/cpu2: cpu2: 13043 lags 438 cycles > > A printout like this would mean that the sync test for cpu2 failed. > In particular, cpu2's TSC trails cpu0's TSC by at least 438 cycles. > If this happens for *any* CPU we mark the TSC timecounter as > defective. > > -- > > Please test! Send your dmesg, pass or fail. > > I am especially interested in: > > 1. A test from dv@. Your dual-socket machine has the IA32_TSC_ADJUST > register but it failed the test running patch v2. Maybe it will pass > with this version? > > 2. Other multisocket machines. > > 3. There were reports of TSC issues with OpenBSD VMs running on ESXi. > What happens when you run with this patch? > > 4. OpenBSD VMs on other hypervisors. > > 5. Non-Lenovo machines, non-Intel machines. > > -Scott > > Index: amd64/tsc.c > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/amd64/tsc.c,v > retrieving revision 1.24 > diff -u -p -r1.24 tsc.c > --- amd64/tsc.c 31 Aug 2021 15:11:54 -0000 1.24 > +++ amd64/tsc.c 5 Jul 2022 01:52:10 -0000 > @@ -36,13 +36,6 @@ int tsc_recalibrate; > uint64_t tsc_frequency; > int tsc_is_invariant; > > -#define TSC_DRIFT_MAX 250 > -#define TSC_SKEW_MAX 100 > -int64_t tsc_drift_observed; > - > -volatile int64_t tsc_sync_val; > -volatile struct cpu_info *tsc_sync_cpu; > - > u_int tsc_get_timecount(struct timecounter *tc); > void tsc_delay(int usecs); > > @@ -236,22 +229,12 @@ cpu_recalibrate_tsc(struct timecounter * > u_int > tsc_get_timecount(struct timecounter *tc) > { > - return rdtsc_lfence() + curcpu()->ci_tsc_skew; > + return rdtsc_lfence(); > } > > void > tsc_timecounter_init(struct cpu_info *ci, uint64_t cpufreq) > { > -#ifdef TSC_DEBUG > - printf("%s: TSC skew=%lld observed drift=%lld\n", ci->ci_dev->dv_xname, > - (long long)ci->ci_tsc_skew, (long long)tsc_drift_observed); > -#endif > - if (ci->ci_tsc_skew < -TSC_SKEW_MAX || ci->ci_tsc_skew > TSC_SKEW_MAX) { > - printf("%s: disabling user TSC (skew=%lld)\n", > - ci->ci_dev->dv_xname, (long long)ci->ci_tsc_skew); > - tsc_timecounter.tc_user = 0; > - } > - > if (!(ci->ci_flags & CPUF_PRIMARY) || > !(ci->ci_flags & CPUF_CONST_TSC) || > !(ci->ci_flags & CPUF_INVAR_TSC)) > @@ -268,111 +251,267 @@ tsc_timecounter_init(struct cpu_info *ci > calibrate_tsc_freq(); > } > > - if (tsc_drift_observed > TSC_DRIFT_MAX) { > - printf("ERROR: %lld cycle TSC drift observed\n", > - (long long)tsc_drift_observed); > - tsc_timecounter.tc_quality = -1000; > - tsc_timecounter.tc_user = 0; > - tsc_is_invariant = 0; > - } > - > tc_init(&tsc_timecounter); > } > > -/* > - * Record drift (in clock cycles). Called during AP startup. > - */ > void > -tsc_sync_drift(int64_t drift) > +tsc_delay(int usecs) > { > - if (drift < 0) > - drift = -drift; > - if (drift > tsc_drift_observed) > - tsc_drift_observed = drift; > + uint64_t interval, start; > + > + interval = (uint64_t)usecs * tsc_frequency / 1000000; > + start = rdtsc_lfence(); > + while (rdtsc_lfence() - start < interval) > + CPU_BUSY_CYCLE(); > } > > +#ifdef MULTIPROCESSOR > + > +#define TSC_DEBUG 1 > + > /* > - * Called during startup of APs, by the boot processor. Interrupts > - * are disabled on entry. 
> + * Protections for global variables in this code: > + * > + * a Modified atomically > + * b Protected by a barrier > + * p Only modified by the primary CPU > */ > + > +#define TSC_TEST_MS 1 /* Test round duration */ > +#define TSC_TEST_ROUNDS 2 /* Number of test rounds */ > + > +struct tsc_test_status { > + volatile uint64_t val __aligned(64); /* [b] CPU's latest TSC value */ > + uint64_t lag_count; /* [b] # of lags seen by CPU */ > + uint64_t lag_max; /* [b] Biggest lag seen */ > + int64_t adj_val; /* [b] initial TSC_ADJUST val */ > +}; > +struct tsc_test_status tsc_ap_status; /* [b] Test results from AP */ > +struct tsc_test_status tsc_bp_status; /* [p] Test results from BP */ > +uint64_t tsc_test_cycles; /* [p] TSC cycles per test round */ > +const char *tsc_ap_name; /* [b] Name of AP */ > +volatile u_int tsc_egress_barrier; /* [a] Test end barrier */ > +volatile u_int tsc_ingress_barrier; /* [a] Test start barrier */ > +volatile u_int tsc_test_rounds; /* [p] Remaining test rounds */ > +int tsc_is_synchronized = 1; /* [p] TSC sync'd across all CPUs? */ > + > +void tsc_report_test_results(void); > +void tsc_reset_adjust(struct tsc_test_status *); > +void tsc_test_ap(void); > +void tsc_test_bp(void); > + > void > -tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp) > +tsc_test_sync_bp(struct cpu_info *ci) > { > - uint64_t bptsc; > + /* TSC must be constant and invariant to use it as a timecounter. */ > +#ifndef TSC_DEBUG > + if (!tsc_is_invariant) > + return; > +#endif > > - if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) > - panic("tsc_sync_bp: 1"); > + /* Reset IA32_TSC_ADJUST if it exists. */ > + tsc_reset_adjust(&tsc_bp_status); > > - /* Flag it and read our TSC. */ > - atomic_setbits_int(&ci->ci_flags, CPUF_SYNCTSC); > - bptsc = (rdtsc_lfence() >> 1); > + /* Reset the test cycle limit and round count. */ > + tsc_test_cycles = TSC_TEST_MS * tsc_frequency / 1000; > + tsc_test_rounds = TSC_TEST_ROUNDS; > + > + do { > + /* > + * Pass through the ingress barrier, run the test, > + * then wait for the AP to reach the egress barrier. > + */ > + atomic_inc_int(&tsc_ingress_barrier); > + while (tsc_ingress_barrier != 2) > + CPU_BUSY_CYCLE(); > + tsc_test_bp(); > + while (tsc_egress_barrier != 1) > + CPU_BUSY_CYCLE(); > + > + /* > + * Report what happened. Adjust the timecounter quality > + * if we failed the test. > + * > + * XXX We need a tc_detach() function to cleanly > + * disable the timecounter. Lowering the quality > + * like this is a nasty hack. > + */ > + tsc_report_test_results(); > + if (tsc_ap_status.lag_count || tsc_bp_status.lag_count) { > + tsc_timecounter.tc_quality = -1000; > + tsc_is_synchronized = 0; > + tsc_test_rounds = 0; > + } else > + tsc_test_rounds--; > + > + /* > + * Clean up for the next round. > + * > + * It is safe to reset the ingress barrier because > + * at this point we know the AP has reached the egress > + * barrier. > + */ > + memset(&tsc_ap_status, 0, sizeof tsc_ap_status); > + memset(&tsc_bp_status, 0, sizeof tsc_bp_status); > + tsc_ingress_barrier = 0; > + if (tsc_test_rounds == 0) > + tsc_ap_name = NULL; > + > + /* > + * Pass through the egress barrier and release the AP. > + * The AP is responsible for resetting the egress barrier. 
> + */ > + if (atomic_inc_int_nv(&tsc_egress_barrier) != 2) > + panic("%s: unexpected egress count", __func__); > + } while (tsc_test_rounds > 0); > +} > + > +void > +tsc_test_sync_ap(struct cpu_info *ci) > +{ > +#ifndef TSC_DEBUG > + if (!tsc_is_invariant) > + return; > +#endif > > - /* Wait for remote to complete, and read ours again. */ > - while ((ci->ci_flags & CPUF_SYNCTSC) != 0) > - membar_consumer(); > - bptsc += (rdtsc_lfence() >> 1); > + /* The BP needs our name in order to report any problems. */ > + tsc_ap_name = ci->ci_dev->dv_xname; > > - /* Wait for the results to come in. */ > - while (tsc_sync_cpu == ci) > - CPU_BUSY_CYCLE(); > - if (tsc_sync_cpu != NULL) > - panic("tsc_sync_bp: 2"); > + tsc_reset_adjust(&tsc_ap_status); > > - *bptscp = bptsc; > - *aptscp = tsc_sync_val; > + /* > + * The AP is only responsible for running the test and > + * resetting the egress barrier. The BP sets up and tears > + * down everything else. > + */ > + do { > + atomic_inc_int(&tsc_ingress_barrier); > + while (tsc_ingress_barrier != 2) > + CPU_BUSY_CYCLE(); > + tsc_test_ap(); > + atomic_inc_int(&tsc_egress_barrier); > + while (atomic_cas_uint(&tsc_egress_barrier, 2, 0) != 2) > + CPU_BUSY_CYCLE(); > + } while (tsc_test_rounds > 0); > } > > void > -tsc_sync_bp(struct cpu_info *ci) > +tsc_report_test_results(void) > { > - uint64_t bptsc, aptsc; > + u_int round = TSC_TEST_ROUNDS - tsc_test_rounds + 1; > > - tsc_read_bp(ci, &bptsc, &aptsc); /* discarded - cache effects */ > - tsc_read_bp(ci, &bptsc, &aptsc); > - > - /* Compute final value to adjust for skew. */ > - ci->ci_tsc_skew = bptsc - aptsc; > + if (tsc_bp_status.adj_val != 0) { > + printf("tsc: cpu0: IA32_TSC_ADJUST: %lld -> 0\n", > + tsc_bp_status.adj_val); > + } > + if (tsc_ap_status.adj_val != 0) { > + printf("tsc: %s: IA32_TSC_ADJUST: %lld -> 0\n", > + tsc_ap_name, tsc_ap_status.adj_val); > + } > + if (tsc_ap_status.lag_count > 0 || tsc_bp_status.lag_count > 0) { > + printf("tsc: cpu0/%s: sync test round %u/%u failed\n", > + tsc_ap_name, round, TSC_TEST_ROUNDS); > + } > + if (tsc_bp_status.lag_count > 0) { > + printf("tsc: cpu0/%s: cpu0: %llu lags %llu cycles\n", > + tsc_ap_name, tsc_bp_status.lag_count, > + tsc_bp_status.lag_max); > + } > + if (tsc_ap_status.lag_count > 0) { > + printf("tsc: cpu0/%s: %s: %llu lags %llu cycles\n", > + tsc_ap_name, tsc_ap_name, tsc_ap_status.lag_count, > + tsc_ap_status.lag_max); > + } > } > > /* > - * Called during startup of AP, by the AP itself. Interrupts are > - * disabled on entry. > + * Reset IA32_TSC_ADJUST if we have it. > + * > + * XXX We should rearrange cpu_hatch() so that the feature flags > + * are already set before we get here. Do CPUID() and CPUID_LEAF() > + * here as a workaround until then. > */ > void > -tsc_post_ap(struct cpu_info *ci) > +tsc_reset_adjust(struct tsc_test_status *tts) > { > - uint64_t tsc; > - > - /* Wait for go-ahead from primary. */ > - while ((ci->ci_flags & CPUF_SYNCTSC) == 0) > - membar_consumer(); > - tsc = (rdtsc_lfence() >> 1); > + uint32_t eax, ebx, ecx, edx; > > - /* Instruct primary to read its counter. */ > - atomic_clearbits_int(&ci->ci_flags, CPUF_SYNCTSC); > - tsc += (rdtsc_lfence() >> 1); > - > - /* Post result. Ensure the whole value goes out atomically. 
*/ > - (void)atomic_swap_64(&tsc_sync_val, tsc); > - > - if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) > - panic("tsc_sync_ap"); > + CPUID(0, eax, ebx, ecx, edx); > + if (eax >= 7) { > + CPUID_LEAF(7, 0, eax, ebx, ecx, edx); > + if (ISSET(ebx, SEFF0EBX_TSC_ADJUST)) { > + tts->adj_val = rdmsr(MSR_TSC_ADJUST); > + if (tts->adj_val != 0) > + wrmsr(MSR_TSC_ADJUST, 0); > + } > + } > } > > void > -tsc_sync_ap(struct cpu_info *ci) > +tsc_test_ap(void) > { > - tsc_post_ap(ci); > - tsc_post_ap(ci); > + uint64_t ap_val, bp_val, end, lag; > + u_int i = 0; > + > + ap_val = rdtsc_lfence(); > + end = ap_val + tsc_test_cycles; > + while (ap_val < end) { > + /* > + * The LFENCE ensures bp_val predates ap_val. If ap_val > + * is smaller than bp_val then the AP's TSC must lag that > + * of the BP, i.e. the two cannot be synchronized. > + */ > + bp_val = tsc_bp_status.val; > + ap_val = rdtsc_lfence(); > + tsc_ap_status.val = ap_val; > + > + /* > + * Ensure the other CPU has a chance to run. This is > + * crucial if the other CPU is our SMT sibling. SMT > + * starvation can prevent this test from detecting > + * relatively large lags. Eight is an arbitrary value, > + * but it seems to work in practice without impacting > + * the sensitivity of the test for non-sibling threads. > + */ > + if (++i % 8 == 0) > + CPU_BUSY_CYCLE(); > + > + /* > + * Record the magnitude of the problem if our TSC lags > + * the other. > + */ > + if (__predict_false(ap_val < bp_val)) { > + tsc_ap_status.lag_count++; > + lag = bp_val - ap_val; > + if (tsc_ap_status.lag_max < lag) > + tsc_ap_status.lag_max = lag; > + } > + } > } > > void > -tsc_delay(int usecs) > +tsc_test_bp(void) > { > - uint64_t interval, start; > + uint64_t ap_val, bp_val, end, lag; > + u_int i = 0; > > - interval = (uint64_t)usecs * tsc_frequency / 1000000; > - start = rdtsc_lfence(); > - while (rdtsc_lfence() - start < interval) > - CPU_BUSY_CYCLE(); > + bp_val = rdtsc_lfence(); > + end = bp_val + tsc_test_cycles; > + while (bp_val < end) { > + ap_val = tsc_ap_status.val; > + bp_val = rdtsc_lfence(); > + tsc_bp_status.val = bp_val; > + > + if (++i % 8 == 0) > + CPU_BUSY_CYCLE(); > + > + if (__predict_false(bp_val < ap_val)) { > + tsc_bp_status.lag_count++; > + lag = ap_val - bp_val; > + if (tsc_bp_status.lag_max < lag) > + tsc_bp_status.lag_max = lag; > + } > + } > } > + > +#endif /* MULTIPROCESSOR */ > Index: amd64/cpu.c > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v > retrieving revision 1.156 > diff -u -p -r1.156 cpu.c > --- amd64/cpu.c 26 Apr 2022 08:35:30 -0000 1.156 > +++ amd64/cpu.c 5 Jul 2022 01:52:11 -0000 > @@ -772,9 +772,9 @@ cpu_init(struct cpu_info *ci) > lcr4(cr4 & ~CR4_PGE); > lcr4(cr4); > > - /* Synchronize TSC */ > + /* Check if TSC is synchronized. */ > if (cold && !CPU_IS_PRIMARY(ci)) > - tsc_sync_ap(ci); > + tsc_test_sync_ap(ci); > #endif > } > > @@ -854,18 +854,14 @@ cpu_start_secondary(struct cpu_info *ci) > #endif > } else { > /* > - * Synchronize time stamp counters. Invalidate cache and > - * synchronize twice (in tsc_sync_bp) to minimize possible > - * cache effects. Disable interrupts to try and rule out any > - * external interference. > + * Test if TSCs are synchronized. Invalidate cache to > + * minimize possible cache effects. Disable interrupts to > + * rule out external interference. 
> */ > s = intr_disable(); > wbinvd(); > - tsc_sync_bp(ci); > + tsc_test_sync_bp(ci); > intr_restore(s); > -#ifdef TSC_DEBUG > - printf("TSC skew=%lld\n", (long long)ci->ci_tsc_skew); > -#endif > } > > if ((ci->ci_flags & CPUF_IDENTIFIED) == 0) { > @@ -890,7 +886,6 @@ void > cpu_boot_secondary(struct cpu_info *ci) > { > int i; > - int64_t drift; > u_long s; > > atomic_setbits_int(&ci->ci_flags, CPUF_GO); > @@ -905,18 +900,11 @@ cpu_boot_secondary(struct cpu_info *ci) > db_enter(); > #endif > } else if (cold) { > - /* Synchronize TSC again, check for drift. */ > - drift = ci->ci_tsc_skew; > + /* Test if TSCs are synchronized again. */ > s = intr_disable(); > wbinvd(); > - tsc_sync_bp(ci); > + tsc_test_sync_bp(ci); > intr_restore(s); > - drift -= ci->ci_tsc_skew; > -#ifdef TSC_DEBUG > - printf("TSC skew=%lld drift=%lld\n", > - (long long)ci->ci_tsc_skew, (long long)drift); > -#endif > - tsc_sync_drift(drift); > } > } > > @@ -942,13 +930,12 @@ cpu_hatch(void *v) > #endif > > /* > - * Synchronize the TSC for the first time. Note that interrupts are > - * off at this point. > + * Test if our TSC is synchronized for the first time. > + * Note that interrupts are off at this point. > */ > wbinvd(); > ci->ci_flags |= CPUF_PRESENT; > - ci->ci_tsc_skew = 0; /* reset on resume */ > - tsc_sync_ap(ci); > + tsc_test_sync_ap(ci); > > lapic_enable(); > lapic_startclock(); > Index: include/cpu.h > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v > retrieving revision 1.144 > diff -u -p -r1.144 cpu.h > --- include/cpu.h 28 Jun 2022 12:11:41 -0000 1.144 > +++ include/cpu.h 5 Jul 2022 01:52:11 -0000 > @@ -209,8 +209,6 @@ struct cpu_info { > paddr_t ci_vmxon_region_pa; > struct vmxon_region *ci_vmxon_region; > > - int64_t ci_tsc_skew; /* counter skew vs cpu0 */ > - > char ci_panicbuf[512]; > > paddr_t ci_vmcs_pa; > @@ -230,7 +228,6 @@ struct cpu_info { > #define CPUF_INVAR_TSC 0x0100 /* CPU has invariant TSC */ > #define CPUF_USERXSTATE 0x0200 /* CPU has curproc's xsave > state */ > > -#define CPUF_SYNCTSC 0x0800 /* Synchronize TSC */ > #define CPUF_PRESENT 0x1000 /* CPU is present */ > #define CPUF_RUNNING 0x2000 /* CPU is running */ > #define CPUF_PAUSE 0x4000 /* CPU is paused in DDB */ > Index: include/cpuvar.h > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/include/cpuvar.h,v > retrieving revision 1.11 > diff -u -p -r1.11 cpuvar.h > --- include/cpuvar.h 16 May 2021 04:33:05 -0000 1.11 > +++ include/cpuvar.h 5 Jul 2022 01:52:11 -0000 > @@ -97,8 +97,7 @@ void identifycpu(struct cpu_info *); > void cpu_init(struct cpu_info *); > void cpu_init_first(void); > > -void tsc_sync_drift(int64_t); > -void tsc_sync_bp(struct cpu_info *); > -void tsc_sync_ap(struct cpu_info *); > +void tsc_test_sync_bp(struct cpu_info *); > +void tsc_test_sync_ap(struct cpu_info *); > > #endif
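
P.S. For anyone skimming the diff rather than applying it: the heart of
the new test is the pair of lockless loops in tsc_test_ap()/tsc_test_bp().
Below is a rough userland approximation of the same idea, illustrative
only: the names and constants are mine, it uses pthreads and compiler
intrinsics instead of the kernel primitives, assumes the two threads end
up on different CPUs, and omits the SMT yield, the barriers, and the
multiple rounds.

/*
 * Userland approximation of the lockless lag check in the patch:
 * each thread publishes its latest TSC reading and checks whether
 * that reading ever falls behind the value last published by the
 * other thread.  Sketch only: no CPU pinning, no SMT yield, no
 * ingress/egress barriers, single round.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>			/* __rdtsc(), _mm_lfence() */

#define TEST_CYCLES	(1ULL << 27)	/* arbitrary test length */

volatile uint64_t published[2];		/* latest TSC seen by each thread */
uint64_t lag_count[2], lag_max[2];

static uint64_t
rdtsc_lfence(void)
{
	_mm_lfence();
	return __rdtsc();
}

static void *
lag_test(void *arg)
{
	int me = *(int *)arg, other = !me;
	uint64_t mine, theirs, end, lag;

	mine = rdtsc_lfence();
	end = mine + TEST_CYCLES;
	while (mine < end) {
		theirs = published[other];
		mine = rdtsc_lfence();	/* LFENCE: "theirs" predates "mine" */
		published[me] = mine;
		if (mine < theirs) {	/* our TSC lags the other CPU's */
			lag_count[me]++;
			lag = theirs - mine;
			if (lag > lag_max[me])
				lag_max[me] = lag;
		}
	}
	return NULL;
}

int
main(void)
{
	pthread_t th[2];
	int id[2] = { 0, 1 }, i;

	for (i = 0; i < 2; i++)
		pthread_create(&th[i], NULL, lag_test, &id[i]);
	for (i = 0; i < 2; i++)
		pthread_join(&th[i], NULL);
	for (i = 0; i < 2; i++)
		printf("thread %d: %llu lags, max lag %llu cycles\n", i,
		    (unsigned long long)lag_count[i],
		    (unsigned long long)lag_max[i]);
	return 0;
}

Build with something like "cc -O2 -pthread"; a nonzero lag count is the
userland analogue of the "cpuN lags M cycles" report in the patch.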
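
P.P.S. The "monotonic time goes backward" check I mentioned at the top is
along the lines of the sketch below. This is not the exact program from
the marc.info thread, just the general idea: it relies on the scheduler
migrating threads between CPUs whose TSCs are skewed, and the thread
count is arbitrary.

/*
 * Minimal CLOCK_MONOTONIC regression check: several free-running
 * threads compare successive readings and bail out if time ever
 * steps backward.  Runs until it finds a regression or is killed.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NTHREADS	4	/* arbitrary */

static void *
spin(void *arg)
{
	struct timespec prev, now;

	clock_gettime(CLOCK_MONOTONIC, &prev);
	for (;;) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if (now.tv_sec < prev.tv_sec ||
		    (now.tv_sec == prev.tv_sec && now.tv_nsec < prev.tv_nsec)) {
			printf("monotonic time went backward\n");
			exit(1);
		}
		prev = now;
	}
	return NULL;
}

int
main(void)
{
	pthread_t th[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&th[i], NULL, spin, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(&th[i], NULL);
	return 0;
}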