On Thu, Nov 22, 2018 at 04:37:49PM +0100, Reyk Floeter wrote: > On Mon, Nov 19, 2018 at 01:12:46PM +0100, Reyk Floeter wrote: > > the attached diff is another attempt at implementing a pvclock(4) > > guest driver. This improves the clock on KVM and replaces the need > > for using the VM-expensive acpihpet(4). > > > > So far I only got positive reports. Where are the problems? ;) > > Otherwise: OK? > > Reyk >
Reads ok. One question - you mention in pvclock.c that this is supported on i386 and amd64, but I only see GENERIC changes for amd64? ok mlarkin in any case, but I'd either add GENERIC changes for i386 or make this explicitly amd64-only. -ml > > Index: share/man/man4/pvclock.4 > > =================================================================== > > RCS file: share/man/man4/pvclock.4 > > diff -N share/man/man4/pvclock.4 > > --- /dev/null 1 Jan 1970 00:00:00 -0000 > > +++ share/man/man4/pvclock.4 19 Nov 2018 11:48:33 -0000 > > @@ -0,0 +1,45 @@ > > +.\" $OpenBSD$ > > +.\" > > +.\" Copyright (c) 2018 Reyk Floeter <r...@openbsd.org> > > +.\" > > +.\" Permission to use, copy, modify, and distribute this software for any > > +.\" purpose with or without fee is hereby granted, provided that the above > > +.\" copyright notice and this permission notice appear in all copies. > > +.\" > > +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > > +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > > +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > > +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > > +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > > +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > > +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > > +.\" > > +.Dd $Mdocdate$ > > +.Dt PVCLOCK 4 > > +.Os > > +.Sh NAME > > +.Nm pvclock > > +.Nd paravirtual clock driver > > +.Sh SYNOPSIS > > +.Cd "pvclock* at pvbus?" > > +.Sh DESCRIPTION > > +The > > +.Nm > > +driver supports the paravirtual clock that is available in KVM and > > +other hypervisors. > > +.Nm > > +uses a shared page between the guest and the hypervisor to synchronize > > +the TSC clock in an efficient way. > > +.Sh SEE ALSO > > +.Xr pvbus 4 > > +.Sh HISTORY > > +The > > +.Nm > > +driver first appeared in > > +.Ox 6.5 . 
> > +.Sh AUTHORS > > +.An -nosplit > > +The > > +.Nm > > +driver was written by > > +.An Reyk Floeter Aq Mt r...@openbsd.org . > > Index: sys/arch/amd64/conf/GENERIC > > =================================================================== > > RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v > > retrieving revision 1.464 > > diff -u -p -u -p -r1.464 GENERIC > > --- sys/arch/amd64/conf/GENERIC 26 Oct 2018 20:26:19 -0000 1.464 > > +++ sys/arch/amd64/conf/GENERIC 19 Nov 2018 11:48:33 -0000 > > @@ -79,6 +79,8 @@ ipmi0 at mainbus? disable # IPMI > > > > vmt0 at pvbus? # VMware Tools > > > > +pvclock0 at pvbus? # KVM pvclock > > + > > xen0 at pvbus? # Xen HVM domU > > xnf* at xen? # Xen Netfront > > xbf* at xen? # Xen Blkfront > > Index: sys/dev/pv/files.pv > > =================================================================== > > RCS file: /cvs/src/sys/dev/pv/files.pv,v > > retrieving revision 1.14 > > diff -u -p -u -p -r1.14 files.pv > > --- sys/dev/pv/files.pv 24 Aug 2018 16:07:01 -0000 1.14 > > +++ sys/dev/pv/files.pv 19 Nov 2018 11:48:33 -0000 > > @@ -8,6 +8,11 @@ device pvbus > > attach pvbus at mainbus > > file dev/pv/pvbus.c pvbus needs-flag > > > > +# KVM clock > > +device pvclock > > +attach pvclock at pvbus > > +file dev/pv/pvclock.c pvclock needs-flag > > + > > # VMware Tools > > device vmt > > attach vmt at pvbus > > Index: sys/dev/pv/pvclock.c > > =================================================================== > > RCS file: sys/dev/pv/pvclock.c > > diff -N sys/dev/pv/pvclock.c > > --- /dev/null 1 Jan 1970 00:00:00 -0000 > > +++ sys/dev/pv/pvclock.c 19 Nov 2018 11:48:33 -0000 > > @@ -0,0 +1,229 @@ > > +/* $OpenBSD$ */ > > + > > +/* > > + * Copyright (c) 2018 Reyk Floeter <r...@openbsd.org> > > + * > > + * Permission to use, copy, modify, and distribute this software for any > > + * purpose with or without fee is hereby granted, provided that the above > > + * copyright notice and this permission notice appear in all copies. 
> > + * > > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > > + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > > + */ > > + > > +#if !defined(__i386__) && !defined(__amd64__) > > +#error pvclock(4) is only supported on i386 and amd64 > > +#endif > > + > > +#include <sys/param.h> > > +#include <sys/systm.h> > > +#include <sys/kernel.h> > > +#include <sys/timetc.h> > > +#include <sys/timeout.h> > > +#include <sys/malloc.h> > > +#include <sys/atomic.h> > > + > > +#include <machine/cpu.h> > > +#include <uvm/uvm_extern.h> > > + > > +#include <dev/pv/pvvar.h> > > +#include <dev/pv/pvreg.h> > > + > > +struct pvclock_softc { > > + struct device sc_dev; > > + void *sc_time; > > + paddr_t sc_paddr; > > + struct timecounter *sc_tc; > > +}; > > + > > +struct pvclock_wall_clock { > > + uint32_t wc_version; > > + uint32_t wc_sec; > > + uint32_t wc_nsec; > > +} __packed; > > + > > +struct pvclock_time_info { > > + uint32_t ti_version; > > + uint32_t ti_pad0; > > + uint64_t ti_tsc_timestamp; > > + uint64_t ti_system_time; > > + uint32_t ti_tsc_to_system_mul; > > + int8_t ti_tsc_shift; > > + uint8_t ti_flags; > > + uint8_t ti_pad[2]; > > +} __packed; > > + > > +#define PVCLOCK_FLAG_TSC_STABLE 0x01 > > +#define PVCLOCK_SYSTEM_TIME_ENABLE 0x01 > > +#define DEVNAME(_s) ((_s)->sc_dev.dv_xname) > > + > > +int pvclock_match(struct device *, void *, void *); > > +void pvclock_attach(struct device *, struct device *, void *); > > +int pvclock_activate(struct device *, int); > > + > > +uint pvclock_get_timecount(struct timecounter *); > > +void 
pvclock_read_time_info(struct pvclock_softc *, > > + struct pvclock_time_info *); > > + > > +struct cfattach pvclock_ca = { > > + sizeof(struct pvclock_softc), > > + pvclock_match, > > + pvclock_attach, > > + NULL, > > + pvclock_activate > > +}; > > + > > +struct cfdriver pvclock_cd = { > > + NULL, > > + "pvclock", > > + DV_DULL > > +}; > > + > > +struct timecounter pvclock_timecounter = { > > + pvclock_get_timecount, NULL, ~0u, 0, NULL, -2000, NULL > > +}; > > + > > +int > > +pvclock_match(struct device *parent, void *match, void *aux) > > +{ > > + struct pv_attach_args *pva = aux; > > + struct pvbus_hv *hv; > > + > > + /* > > + * pvclock is provided by different hypervisors, we currently > > + * only support the "kvmclock". > > + */ > > + hv = &pva->pva_hv[PVBUS_KVM]; > > + if (hv->hv_base != 0) { > > + /* > > + * We only implement support for the 2nd version of pvclock. > > + * The first version is basically the same but with different > > + * non-standard MSRs and it is deprecated. > > + */ > > + if ((hv->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) == 0) > > + return (0); > > + > > + /* > > + * Only the "stable" clock with a sync'ed TSC is supported. > > + * In this case the host guarantees that the TSC is constant > > + * and invariant, either by the underlying TSC or by passing > > + * on a synchronized value. 
> > + */ > > + if ((hv->hv_features & > > + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) == 0) > > + return (0); > > + } > > + > > + return (1); > > +} > > + > > +void > > +pvclock_attach(struct device *parent, struct device *self, void *aux) > > +{ > > + struct pvclock_softc *sc = (struct pvclock_softc *)self; > > + paddr_t pa; > > + > > + if ((sc->sc_time = km_alloc(PAGE_SIZE, > > + &kv_any, &kp_zero, &kd_nowait)) == NULL) { > > + printf(": time page allocation failed\n"); > > + return; > > + } > > + if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_time, &pa)) { > > + printf(": time page PA extraction failed\n"); > > + km_free(sc->sc_time, PAGE_SIZE, &kv_any, &kp_zero); > > + return; > > + } > > + > > + wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); > > + sc->sc_paddr = pa; > > + > > + sc->sc_tc = &pvclock_timecounter; > > + sc->sc_tc->tc_name = DEVNAME(sc); > > + sc->sc_tc->tc_frequency = 1000000000ULL; > > + sc->sc_tc->tc_priv = sc; > > + > > + /* Better than HPET but below TSC */ > > + sc->sc_tc->tc_quality = 1500; > > + > > + tc_init(sc->sc_tc); > > + > > + printf("\n"); > > +} > > + > > +int > > +pvclock_activate(struct device *self, int act) > > +{ > > + struct pvclock_softc *sc = (struct pvclock_softc *)self; > > + int rv = 0; > > + paddr_t pa = sc->sc_paddr; > > + > > + switch (act) { > > + case DVACT_POWERDOWN: > > + wrmsr(KVM_MSR_SYSTEM_TIME, pa & ~PVCLOCK_SYSTEM_TIME_ENABLE); > > + break; > > + case DVACT_RESUME: > > + wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); > > + break; > > + } > > + > > + return (rv); > > +} > > + > > +static inline uint32_t > > +pvclock_read_begin(const struct pvclock_time_info *ti) > > +{ > > + uint32_t version = ti->ti_version & ~0x1; > > + virtio_membar_sync(); > > + return (version); > > +} > > + > > +static inline int > > +pvclock_read_done(const struct pvclock_time_info *ti, > > + uint32_t version) > > +{ > > + virtio_membar_sync(); > > + return (ti->ti_version == version); > > +} > > + > > 
+uint > > +pvclock_get_timecount(struct timecounter *tc) > > +{ > > + struct pvclock_softc *sc = tc->tc_priv; > > + struct pvclock_time_info *ti; > > + uint64_t tsc_timestamp, system_time, delta, ctr; > > + uint32_t version, mul_frac; > > + int8_t shift; > > + uint8_t flags; > > + > > + ti = sc->sc_time; > > + do { > > + version = pvclock_read_begin(ti); > > + system_time = ti->ti_system_time; > > + tsc_timestamp = ti->ti_tsc_timestamp; > > + mul_frac = ti->ti_tsc_to_system_mul; > > + shift = ti->ti_tsc_shift; > > + flags = ti->ti_flags; > > + } while (!pvclock_read_done(ti, version)); > > + > > + /* This bit must be set as we attached based on the stable flag */ > > + if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) > > + panic("%s: unstable result on stable clock", DEVNAME(sc)); > > + > > + /* > > + * The algorithm is described in > > + * linux/Documentation/virtual/kvm/msr.txt > > + */ > > + delta = rdtsc() - tsc_timestamp; > > + if (shift < 0) > > + delta >>= -shift; > > + else > > + delta <<= shift; > > + ctr = ((delta * mul_frac) >> 32) + system_time; > > + > > + return (ctr); > > +} > > Index: sys/dev/pv/pvreg.h > > =================================================================== > > RCS file: /cvs/src/sys/dev/pv/pvreg.h,v > > retrieving revision 1.4 > > diff -u -p -u -p -r1.4 pvreg.h > > --- sys/dev/pv/pvreg.h 12 Dec 2015 12:33:49 -0000 1.4 > > +++ sys/dev/pv/pvreg.h 19 Nov 2018 11:48:33 -0000 > > @@ -43,6 +43,9 @@ > > #define KVM_MSR_EOI_EN 0x4b564d04 > > #define KVM_PV_EOI_BIT 0 > > > > +#define KVM_MSR_WALL_CLOCK 0x4b564d00 > > +#define KVM_MSR_SYSTEM_TIME 0x4b564d01 > > + > > /* > > * Hyper-V > > */ >