Hi,

By far one of the most popular and frequently used system calls is
clock_gettime(2). As a result the cost of kernel-userland transitions
out weight the actual work, thus I am proposing we make the data
available directly from userland without passing through a system call.

This has been a subject of discussion multiple times across the years
and last I heard from it was at the p2k19 hackthon that I hosted in
Bucharest where espie@ sent me a diff from one of his students(?). Being
busy with organization I have not had the time to look at it and
I am thus getting back to it just now due to robert@ prodding me again
on the subject. The proposed diff is mine, not the student's.


The technical bits. 

Please keep in mind that this is only proof of concept. I am looking for
ways to improve the current diff. As it is, it requires a flag day
because it makes use of ELF aux vectors to export the data from the
kernel.

I have also played with exposing the data via separate ELF sections and
with kbind-mmap alternatives. The frist also involves a flag day and is
more intrusive in my opinion, and the second I could not get to work. I
think that would be the less intrusive way of doing it, possibly without
a flag day, so if anyone knows how, please let me know.

The supported clocks are just those that do not require process specific
data. Those can also be handled later if this diff is decided to be a
good thing.

Clock update inside the kernel is done at the end of tc_windup(). There
might be better places to do it. Let me know where.

The update currently does the work of clock_gettime(), but it can
probably be changed to only update the timehands and move the logic
elsewhere. Note that if we expose only the timehands to userland, most
of the bintime functionality has to also be made available there. Or so
I think.

In userland, I wrapped the clock_gettime(2) syscall in libc. There, I
search for the auxiliary vector and fetch the timespec data from it.
As you can see in the diff, parts from the elf_exec header will have to
be exposed to userland if we do it this way.


Results.

To test this diff you need to do a full release(8). I have tested this
with multiple programs. Test programs, base programs and packages. None
the less, this diff touches many important areas of our tree and is
very fragile. I also probably missed changing some parts that required
change due to libc or elf changes.

If you see regressions, which you probably will, please let me know.

Here is a stress test from robert@:

robert@x202:/home/robert> time ./t && time ./t2
0m00.11s real 0m00.12s user 0m00.00s system
0m09.99s real 0m02.64s user 0m03.36s system
t is clock_gettime() and t2 is SYS_clock_gettime()


Please keep the discussions on the list and let me know what you think
and how we can improve this if we decide this is wanted in the tree.

Paul

diff --git lib/libc/shlib_version lib/libc/shlib_version
index 06f98b01084..5fb0770494f 100644
--- lib/libc/shlib_version
+++ lib/libc/shlib_version
@@ -1,4 +1,4 @@
 major=96
-minor=0
+minor=1
 # note: If changes were made to include/thread_private.h or if system calls
 # were added/changed then librthread/shlib_version must also be updated.
diff --git lib/libc/sys/Makefile.inc lib/libc/sys/Makefile.inc
index 34769576ced..607985e8f20 100644
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -12,7 +12,8 @@ SRCS+=        Ovfork.S brk.S ${CERROR} \
 
 # glue to offer userland wrappers for some syscalls
 SRCS+= posix_madvise.c pthread_sigmask.c \
-       w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c
+       w_fork.c w_sigaction.c w_sigprocmask.c w_sigsuspend.c w_vfork.c \
+       w_clock_gettime.c
 
 # glue for compat with old syscall interfaces.
 SRCS+= ftruncate.c lseek.c mquery.c mmap.c ptrace.c semctl.c truncate.c \
diff --git lib/libc/sys/w_clock_gettime.c lib/libc/sys/w_clock_gettime.c
new file mode 100644
index 00000000000..e955615248f
--- /dev/null
+++ lib/libc/sys/w_clock_gettime.c
@@ -0,0 +1,114 @@
+/*     $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <p...@irofti.net>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+#include <err.h>
+
+#include <sys/timekeep.h>
+
+void *elf_aux_timekeep;
+
+
+/*
+ * Needed exec_elf implementation.
+ * To be exposed by the kernel later if needed.
+ */
+
+#include <sys/exec_elf.h>
+
+typedef struct {
+       uint32_t        au_id;                          /* 32-bit id */
+       uint64_t        au_v;                           /* 64-bit value */
+} AuxInfo;
+
+enum AuxID {
+       AUX_null = 0,
+       AUX_ignore = 1,
+       AUX_execfd = 2,
+       AUX_phdr = 3,                   /* &phdr[0] */
+       AUX_phent = 4,                  /* sizeof(phdr[0]) */
+       AUX_phnum = 5,                  /* # phdr entries */
+       AUX_pagesz = 6,                 /* PAGESIZE */
+       AUX_base = 7,                   /* ld.so base addr */
+       AUX_flags = 8,                  /* processor flags */
+       AUX_entry = 9,                  /* a.out entry */
+       AUX_sun_uid = 2000,             /* euid */
+       AUX_sun_ruid = 2001,            /* ruid */
+       AUX_sun_gid = 2002,             /* egid */
+       AUX_sun_rgid = 2003,            /* rgid */
+       AUX_openbsd_timekeep = 2004,    /* userland clock_gettime */
+};
+
+
+/*
+ * Helper functions.
+ */
+
+static int
+find_timekeep(void)
+{
+       Elf_Addr *stackp;
+       AuxInfo *auxv;
+       int found = 0;
+
+       stackp = (Elf_Addr *)environ;
+       while (*stackp++) ;             /* pass environment */
+
+       /* look-up timekeep auxv */
+       for (auxv = (AuxInfo *)stackp; auxv->au_id != AUX_null; auxv++)
+               if (auxv->au_id == AUX_openbsd_timekeep) {
+                       found = 1;
+                       break;
+               }
+       if (found == 0) {
+               warnx("%s", "Could not find auxv!");
+               return -1;
+       }
+
+       elf_aux_timekeep = (void *)auxv->au_v;
+       return 0;
+}
+
+int
+WRAP(clock_gettime)(clockid_t clock_id, struct timespec *tp)
+{
+       struct timekeep *timekeep;
+
+       if (elf_aux_timekeep == NULL && find_timekeep())
+               return clock_gettime(clock_id, tp);
+       timekeep = elf_aux_timekeep;
+
+       switch (clock_id) {
+       case CLOCK_REALTIME:
+               *tp = timekeep->tp_realtime;
+               break;
+       case CLOCK_UPTIME:
+               *tp = timekeep->tp_uptime;
+               break;
+       case CLOCK_MONOTONIC:
+               *tp = timekeep->tp_monotonic;
+               break;
+       case CLOCK_BOOTTIME:
+               *tp = timekeep->tp_boottime;
+               break;
+       default:
+               return clock_gettime(clock_id, tp);
+       }
+       return 0;
+}
+DEF_WRAP(clock_gettime);
diff --git sys/kern/exec_elf.c sys/kern/exec_elf.c
index 9b5b8eb3acf..59bc923a6fb 100644
--- sys/kern/exec_elf.c
+++ sys/kern/exec_elf.c
@@ -124,7 +124,7 @@ extern char *syscallnames[];
 /*
  * How many entries are in the AuxInfo array we pass to the process?
  */
-#define ELF_AUX_ENTRIES        8
+#define ELF_AUX_ENTRIES        9
 
 /*
  * This is the OpenBSD ELF emul
@@ -860,6 +860,10 @@ exec_elf_fixup(struct proc *p, struct exec_package *epp)
                a->au_v = ap->arg_entry;
                a++;
 
+               a->au_id = AUX_openbsd_timekeep;
+               a->au_v = p->p_p->ps_timekeep;
+               a++;
+
                a->au_id = AUX_null;
                a->au_v = 0;
                a++;
diff --git sys/kern/kern_exec.c sys/kern/kern_exec.c
index 20480c2fc28..2496458fde1 100644
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -64,6 +64,11 @@
 #include <uvm/uvm_extern.h>
 #include <machine/tcb.h>
 
+#include <sys/timekeep.h>
+
+struct uvm_object *timekeep_object;
+struct timekeep* timekeep;
+
 void   unveil_destroy(struct process *ps);
 
 const struct kmem_va_mode kv_exec = {
@@ -76,6 +81,11 @@ const struct kmem_va_mode kv_exec = {
  */
 int exec_sigcode_map(struct process *, struct emul *);
 
+/*
+ * Map the shared timekeep page.
+ */
+int exec_timekeep_map(struct process *);
+
 /*
  * If non-zero, stackgap_random specifies the upper limit of the random gap 
size
  * added to the fixed stack position. Must be n^2.
@@ -684,6 +694,9 @@ sys_execve(struct proc *p, void *v, register_t *retval)
        /* map the process's signal trampoline code */
        if (exec_sigcode_map(pr, pack.ep_emul))
                goto free_pack_abort;
+       /* map the process's timekeep page */
+       if (exec_timekeep_map(pr))
+               goto free_pack_abort;
 
 #ifdef __HAVE_EXEC_MD_MAP
        /* perform md specific mappings that process might need */
@@ -863,3 +876,38 @@ exec_sigcode_map(struct process *pr, struct emul *e)
 
        return (0);
 }
+
+int exec_timekeep_map(struct process *pr)
+{
+       size_t timekeep_sz = sizeof(struct timekeep);
+
+       /*
+        * Similar to the sigcode object, except that there is a single timekeep
+        * object, and not one per emulation.
+        */
+       if (timekeep_object == NULL) {
+               vaddr_t va;
+
+               timekeep_object = uao_create(timekeep_sz, 0);
+               uao_reference(timekeep_object);
+
+               if (uvm_map(kernel_map, &va, round_page(timekeep_sz), 
timekeep_object,
+                   0, 0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | 
PROT_WRITE,
+                   MAP_INHERIT_SHARE, MADV_RANDOM, 0))) {
+                       uao_detach(timekeep_object);
+                       return (ENOMEM);
+               }
+
+               timekeep = (struct timekeep *)va;
+       }
+
+       uao_reference(timekeep_object);
+       if (uvm_map(&pr->ps_vmspace->vm_map, &pr->ps_timekeep, 
round_page(timekeep_sz),
+           timekeep_object, 0, 0, UVM_MAPFLAG(PROT_READ, PROT_READ,
+           MAP_INHERIT_COPY, MADV_RANDOM, 0))) {
+               uao_detach(timekeep_object);
+               return (ENOMEM);
+       }
+
+       return (0);
+}
diff --git sys/kern/kern_tc.c sys/kern/kern_tc.c
index bcf8f689625..007f1116c4f 100644
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -35,6 +35,7 @@
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <dev/rndvar.h>
+#include <sys/timekeep.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -209,6 +210,31 @@ microuptime(struct timeval *tvp)
        BINTIME_TO_TIMEVAL(&bt, tvp);
 }
 
+void
+tc_clock_gettime(void)
+{
+       struct bintime bt;
+
+       if (timekeep == NULL)
+               return;
+
+       /* CLOCK_REALTIME */
+       nanotime(&timekeep->tp_realtime);
+
+       /* CLOCK_UPTIME */
+       binuptime(&bt);
+       bintimesub(&bt, &naptime, &bt);
+       BINTIME_TO_TIMESPEC(&bt, &timekeep->tp_uptime);
+
+       /* CLOCK_MONOTONIC */
+       nanouptime(&timekeep->tp_monotonic);
+
+       /* CLOCK_BOOTTIME */
+       timekeep->tp_boottime = timekeep->tp_monotonic;
+
+       return;
+}
+
 void
 bintime(struct bintime *bt)
 {
@@ -613,6 +639,8 @@ tc_windup(struct bintime *new_boottime, struct bintime 
*new_offset,
        time_uptime = th->th_offset.sec;
        membar_producer();
        timehands = th;
+
+       tc_clock_gettime();
 }
 
 /* Report or change the active timecounter hardware. */
diff --git sys/sys/exec_elf.h sys/sys/exec_elf.h
index a40e0510273..f55b75f1e84 100644
--- sys/sys/exec_elf.h
+++ sys/sys/exec_elf.h
@@ -691,7 +691,8 @@ enum AuxID {
        AUX_sun_uid = 2000,             /* euid */
        AUX_sun_ruid = 2001,            /* ruid */
        AUX_sun_gid = 2002,             /* egid */
-       AUX_sun_rgid = 2003             /* rgid */
+       AUX_sun_rgid = 2003,            /* rgid */
+       AUX_openbsd_timekeep = 2004,    /* userland clock_gettime */
 };
 
 struct elf_args {
diff --git sys/sys/proc.h sys/sys/proc.h
index 357c0c0d52c..93a79a220db 100644
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -248,6 +248,8 @@ struct process {
        u_int   ps_rtableid;            /* Process routing table/domain. */
        char    ps_nice;                /* Process "nice" value. */
 
+       vaddr_t ps_timekeep;            /* User pointer to timekeep */
+
        struct uprof {                  /* profile arguments */
                caddr_t pr_base;        /* buffer base */
                size_t  pr_size;        /* buffer size */
diff --git sys/sys/timekeep.h sys/sys/timekeep.h
new file mode 100644
index 00000000000..bad25185bc4
--- /dev/null
+++ sys/sys/timekeep.h
@@ -0,0 +1,37 @@
+/*     $OpenBSD$ */
+/*
+ * Copyright (c) 2020 Paul Irofti <p...@irofti.net>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_TIMEKEEP_H_
+#define _SYS_TIMEKEEP_H_
+
+#include <sys/time.h>
+
+struct timekeep {
+       struct timespec tp_realtime;
+       struct timespec tp_uptime;
+       struct timespec tp_monotonic;
+       struct timespec tp_boottime;
+};
+
+#if defined(_KERNEL)
+#include <uvm/uvm_extern.h>
+
+extern struct uvm_object *timekeep_object;
+extern struct timekeep *timekeep;
+#endif
+
+#endif /* _SYS_TIMEKEEP_H_ */

Reply via email to