Le sam. 9 mars 2024 à 15:03, Luca Dariz <[email protected]> a écrit :
> This extends the previous debug utility to trace system calls with
> more events and the ability to control the tracing from userspace,
> collecting a trace of the whole systems.
>
> This tool was quite useful in porting the rpc format to 64 bit and
> handle the 32to64 translation, but also to debug user-space programs.
>
> It's still a draft with some limitations and open points:
> - DEBUG_TRACE for 64 bit is missing
> - check if we can have meaningful and reliable data removing the
> syscall number on syscall return; this would avoid saving it in the
> thread structure.
> - add more info to each trace entry, e.g. current CPU or timestamps
> - trace more events
> - add more filtering capabilities
> - trace a single subhurd
> - trace format is not exported
> ---
> Makefrag.am | 7 +
> configfrag.ac | 12 ++
> i386/i386/debug.h | 16 +-
> i386/i386/debug_i386.c | 45 ++--
> i386/i386/locore.S | 33 ++-
> i386/i386/pcb.c | 4 +
> i386/i386/trap.c | 5 +
> include/mach/gnumach.defs | 33 +++
> include/mach/trace.h | 61 ++++++
> ipc/ipc_kmsg.c | 6 +-
> ipc/mach_msg.c | 2 +
> kern/exception.c | 1 +
> kern/startup.c | 2 +
> kern/thread.h | 4 +
> kern/trace.c | 432 ++++++++++++++++++++++++++++++++++++++
> kern/trace.h | 42 ++++
> scripts/gnumach-gdb.py | 63 ++++++
> tests/test-trace.c | 58 +++++
> tests/user-qemu.mk | 3 +-
> x86_64/debug_trace.S | 9 +-
> x86_64/interrupt.S | 19 ++
> x86_64/locore.S | 84 +++++++-
> 22 files changed, 887 insertions(+), 54 deletions(-)
> create mode 100644 include/mach/trace.h
> create mode 100644 kern/trace.c
> create mode 100644 kern/trace.h
> create mode 100644 scripts/gnumach-gdb.py
> create mode 100644 tests/test-trace.c
>
> diff --git a/Makefrag.am b/Makefrag.am
> index 82fce628..25d73ec3 100644
> --- a/Makefrag.am
> +++ b/Makefrag.am
> @@ -221,6 +221,12 @@ libkernel_a_SOURCES += \
> kern/xpr.h \
> kern/elf-load.c \
> kern/boot_script.c
> +
> +if KERNEL_TRACE
> +libkernel_a_SOURCES += \
> + kern/trace.c
> +endif
> +
> EXTRA_DIST += \
> kern/exc.defs \
> kern/mach.srv \
> @@ -407,6 +413,7 @@ include_mach_HEADERS = \
> include/mach/profilparam.h \
> include/mach/std_types.h \
> include/mach/syscall_sw.h \
> + include/mach/trace.h \
> include/mach/task_info.h \
> include/mach/task_special_ports.h \
> include/mach/thread_info.h \
> diff --git a/configfrag.ac b/configfrag.ac
> index b8b41261..f4dbe63d 100644
> --- a/configfrag.ac
> +++ b/configfrag.ac
> @@ -139,6 +139,18 @@ AC_DEFINE([SLAB_VERIFY], [0], [SLAB_VERIFY])
>
> # Enable the CPU pool layer in the slab allocator.
> AC_DEFINE([SLAB_USE_CPU_POOLS], [0], [SLAB_USE_CPU_POOLS])
> +
> +# enable support for tracing various kernel events
> +AC_ARG_ENABLE([kernel-trace],
> + AS_HELP_STRING([--enable-kernel-trace], [Enable kernel event tracing]))
> +[if [ x"$enable_kernel_trace" = xyes ]; then]
> + AC_DEFINE([KERNEL_TRACE], [1], [Enable kernel event tracing])
> + AM_CONDITIONAL([KERNEL_TRACE], [true])
> +[else]
> + AC_DEFINE([KERNEL_TRACE], [0], [Enable kernel event tracing])
> + AM_CONDITIONAL([KERNEL_TRACE], [false])
> +[fi]
> +
>
> #
> # Options.
> diff --git a/i386/i386/debug.h b/i386/i386/debug.h
> index 84397ba8..eff330c6 100644
> --- a/i386/i386/debug.h
> +++ b/i386/i386/debug.h
> @@ -54,6 +54,7 @@ void debug_trace_dump(void);
>
> #else /* __ASSEMBLER__ */
>
> +#ifndef __x86_64__
> #define DEBUG_TRACE \
> pushl $__LINE__ ;\
> pushl $9f ;\
> @@ -62,10 +63,21 @@ void debug_trace_dump(void);
> .data ;\
> 9: .ascii __FILE__"\0" ;\
> .text
> -
> +#else /* __x86_64__ */
> +#define DEBUG_TRACE \
> + pushq %rdi ;\
> + pushq %rs1 ;\
>
Is it really %rs1 here and not %rsi ? you are pushing rs1 but popping rsi
> + movq $__LINE__,%rdi ;\
> + movq $9f,%rsi ;\
> + call __debug_trace ;\
> + popq %rsi ;\
> + popq %rdi ;\
> + .data ;\
> +9: .ascii __FILE__"\0" ;\
> + .text
> +#endif
> #endif /* __ASSEMBLER__ */
>
> -
> #endif /* DEBUG */
>
> /* XXX #include_next "debug.h" */
> diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c
> index 41d032e3..61d73dff 100644
> --- a/i386/i386/debug_i386.c
> +++ b/i386/i386/debug_i386.c
> @@ -22,6 +22,7 @@
> */
>
> #include <kern/printf.h>
> +#include <kern/trace.h>
>
> #include "thread.h"
> #include "trap.h"
> @@ -140,39 +141,23 @@ debug_trace_dump(void)
>
> splx(s);
> }
> +#endif /* DEBUG */
>
> -#include <kern/syscall_sw.h>
> -
> -int syscall_trace = 0;
> -task_t syscall_trace_task;
> +#if KERNEL_TRACE
>
> -int
> -syscall_trace_print(int syscallvec, ...)
> +#ifndef __x86_64__
> +#define SYSCALL_NUM_SHIFT 4
> +#else
> +#define SYSCALL_NUM_SHIFT 5
> +#endif
> +long
> +syscall_trace_enter(long *sysv)
> {
> - int syscallnum = syscallvec >> 4;
> - int i;
> - const mach_trap_t *trap = &mach_trap_table[syscallnum];
> -
> - if (syscall_trace_task && syscall_trace_task != current_task())
> - goto out;
> -
> - printf("0x%08x:0x%08x:%s(",
> - current_task(), current_thread(), trap->mach_trap_name);
> - for (i = 0; i < trap->mach_trap_arg_count; i++) {
> - unsigned long value = (&syscallvec)[1+i];
> - /* Use a crude heuristic to format pointers. */
> - if (value > 1024)
> - printf("0x%08x", value);
> - else
> - printf("%d", value);
> -
> - if (i + 1 < trap->mach_trap_arg_count)
> - printf(", ");
> - }
> - printf(")\n");
> + long sysnum = sysv[0];
>
> - out:
> - return syscallvec;
> + syscall_trace_common(sysnum >> SYSCALL_NUM_SHIFT, sysv + 1);
> +
> + return sysnum;
> }
>
> -#endif /* DEBUG */
> +#endif /* KERNEL_TRACE */
> diff --git a/i386/i386/locore.S b/i386/i386/locore.S
> index 9d0513a1..26c5843c 100644
> --- a/i386/i386/locore.S
> +++ b/i386/i386/locore.S
> @@ -634,6 +634,15 @@ ENTRY(thread_bootstrap_return)
> */
>
> ENTRY(thread_syscall_return)
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 1f
> + movl S_ARG0,%eax /* get return value */
> + pushl %eax
> + call syscall_trace_return
> + movl %eax,S_ARG0 /* restore return value */
> +1:
> +#endif
> movl S_ARG0,%eax /* get return value */
> movl %esp,%ecx /* get kernel stack */
> or $(KERNEL_STACK_SIZE-1),%ecx
> @@ -1174,18 +1183,26 @@ syscall_native:
>
> mach_call_call:
>
> -#ifdef DEBUG
> +#if KERNEL_TRACE
> testb $0xff,EXT(syscall_trace)
> jz 0f
> - pushl %eax
> - call EXT(syscall_trace_print)
> - /* will return with syscallofs still (or again) in eax */
> - addl $4,%esp
> + pushl %eax /* add syscall num to args array
> */
> + pushl %esp /* args array is the first
> argument*/
> + call EXT(syscall_trace_enter)
> + popl %eax
> + popl %eax
>
you are popping eax twice here, shouldn’t the second one be esp?
> 0:
> -#endif /* DEBUG */
> +#endif /* KERNEL_TRACE */
> + call *EXT(mach_trap_table)+4(%eax) /* call procedure */
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 1f
> + pushl %eax
> + call syscall_trace_return
> + /* will return with return value still (or again) in eax */
> +1:
> +#endif /* KERNEL_TRACE */
>
> - call *EXT(mach_trap_table)+4(%eax)
> - /* call procedure */
> movl %esp,%ecx /* get kernel stack */
> or $(KERNEL_STACK_SIZE-1),%ecx
> movl -3-IKS_SIZE(%ecx),%esp /* switch back to PCB stack */
> diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
> index e8901550..a3ed7d67 100644
> --- a/i386/i386/pcb.c
> +++ b/i386/i386/pcb.c
> @@ -37,6 +37,7 @@
> #include <kern/counters.h>
> #include <kern/debug.h>
> #include <kern/thread.h>
> +#include <kern/trace.h>
> #include <kern/sched_prim.h>
> #include <kern/slab.h>
> #include <vm/vm_kern.h>
> @@ -308,6 +309,7 @@ void stack_handoff(
> /*
> * Load the rest of the user state for the new thread
> */
> + context_switch_trace(old, NULL, new);
> switch_ktss(new->pcb);
>
> /*
> @@ -332,6 +334,7 @@ void stack_handoff(
> */
> void load_context(thread_t new)
> {
> + context_switch_trace(NULL, NULL, new);
> switch_ktss(new->pcb);
> Load_context(new);
> }
> @@ -382,6 +385,7 @@ thread_t switch_context(
> /*
> * Load the rest of the user state for the new thread
> */
> + context_switch_trace(old, continuation, new);
> switch_ktss(new->pcb);
> return Switch_context(old, continuation, new);
> }
> diff --git a/i386/i386/trap.c b/i386/i386/trap.c
> index 555d7498..17f7eb3c 100644
> --- a/i386/i386/trap.c
> +++ b/i386/i386/trap.c
> @@ -53,6 +53,7 @@
> #include <kern/printf.h>
> #include <kern/thread.h>
> #include <kern/task.h>
> +#include <kern/trace.h>
> #include <kern/sched.h>
> #include <kern/sched_prim.h>
> #include <kern/exception.h>
> @@ -162,6 +163,8 @@ void kernel_trap(struct i386_saved_state *regs)
> thread_t thread;
> extern char _start[], etext[];
>
> + trap_trace(TRACE_TRAP_KERNEL, regs);
> +
> type = regs->trapno;
> code = regs->err;
> thread = current_thread();
> @@ -366,6 +369,7 @@ int user_trap(struct i386_saved_state *regs)
> unsigned long type;
> thread_t thread = current_thread();
>
> + trap_trace(TRACE_TRAP_USER, regs);
> #ifdef __x86_64__
> assert(regs == &thread->pcb->iss);
> #endif
> @@ -592,6 +596,7 @@ int user_trap(struct i386_saved_state *regs)
> void
> i386_astintr(void)
> {
> + trap_trace(TRACE_TRAP_ASTINTR, &(current_thread()->pcb->iss));
> (void) splsched(); /* block interrupts to check reasons */
> #ifndef MACH_RING1
> int mycpu = cpu_number();
> diff --git a/include/mach/gnumach.defs b/include/mach/gnumach.defs
> index 7ecf74d3..583bdc36 100644
> --- a/include/mach/gnumach.defs
> +++ b/include/mach/gnumach.defs
> @@ -215,3 +215,36 @@ routine vm_pages_phys(
> simpleroutine thread_set_name(
> thread : thread_t;
> name : kernel_debug_name_t);
> +
> +/*
> + * Start tracing kernel events, using a buffer of BUFSIZE bytes and
> + * tracing only events set in KINDS.
> + */
> +simpleroutine ktrace_start(
> + host_priv : host_priv_t;
> + bufsize : rpc_vm_size_t;
> + kinds : rpc_long_integer_t);
> +
> +/*
> + * Stop tracing kernel events.
> + */
> +simpleroutine ktrace_stop(
> + host_priv : host_priv_t);
> +
> +/*
> + * Free the tracing buffer allocated in ktrace_start().
> + */
> +simpleroutine ktrace_free(
> + host_priv : host_priv_t);
> +
> +/*
> + * Read the content of the tracing buffer in BUF; additionally we get
> + * a pointer to the first free byte after the tracing data in LAST_PTR
> + * and some trace parameters in FLAGS.
> + */
> +routine ktrace_collect(
> + host_priv : host_priv_t;
> + out buf : pointer_t;
> + out last_ptr : vm_offset_t;
> + out flags : natural_t);
> +
> diff --git a/include/mach/trace.h b/include/mach/trace.h
> new file mode 100644
> index 00000000..3fccbbc7
> --- /dev/null
> +++ b/include/mach/trace.h
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright (C) 2023 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#ifndef _MACH_TRACE_H_
> +#define _MACH_TRACE_H_
> +
> +#include <stdint.h>
> +#include <mach/std_types.h>
> +
> +/* trace types as encoded in the ring buffer */
> +#define DEBUG_TRACE_INVALID 0
> +#define DEBUG_TRACE_SYSCALL 1
> +#define DEBUG_TRACE_SAMPLE 2
> +#define DEBUG_TRACE_TRAP 3
> +#define DEBUG_TRACE_CSW 4
> +#define DEBUG_TRACE_IRQ 5
> +#define DEBUG_TRACE_RPC 6
> +#define DEBUG_TRACE_DUMMY 15
> +
> +/* trap events */
> +#define TRACE_TRAP_USER 0
> +#define TRACE_TRAP_KERNEL 1
> +#define TRACE_TRAP_ASTINTR 2
> +
> +/* trace types bitmask, to be used in trace_start()*/
> +#define TRACE_TYPE_SYSCALL 0x01
> +#define TRACE_TYPE_TRAP 0x02
> +#define TRACE_TYPE_CSW 0x04
> +#define TRACE_TYPE_IRQ 0x08
> +#define TRACE_TYPE_RPC 0x10
> +#define TRACE_TYPE_ALL 0x1F
> +
> +/* Trace file header definitions */
> +#define TH_MAGIC "GMTR"
> +#define TH_OVERRUN_MASK 0x0f
> +#define TH_KERNEL64 0x10
> +#define TH_USER64 0x20
> +
> +typedef struct {
> + char magic[4];
> + uint32_t flags;
> + uint32_t last;
> + uint32_t ov_first;
> +} trace_header_t;
> +
> +#endif /* _MACH_TRACE_H_ */
> diff --git a/ipc/ipc_kmsg.c b/ipc/ipc_kmsg.c
> index 8bd645ff..8a153386 100644
> --- a/ipc/ipc_kmsg.c
> +++ b/ipc/ipc_kmsg.c
> @@ -509,7 +509,7 @@ ipc_kmsg_get(
> ikm_free(kmsg);
> return MACH_SEND_INVALID_DATA;
> }
> -
> + rpc_trace(0, msg, size);
> *kmsgp = kmsg;
> return MACH_MSG_SUCCESS;
> }
> @@ -576,8 +576,10 @@ ipc_kmsg_put(
>
> if (copyoutmsg(&kmsg->ikm_header, msg, size))
> mr = MACH_RCV_INVALID_DATA;
> - else
> + else {
> mr = MACH_MSG_SUCCESS;
> + rpc_trace(1, msg, msg->msgh_size);
> + }
>
> ikm_cache_free(kmsg);
>
> diff --git a/ipc/mach_msg.c b/ipc/mach_msg.c
> index ff5e5b09..3a4298ff 100644
> --- a/ipc/mach_msg.c
> +++ b/ipc/mach_msg.c
> @@ -464,6 +464,7 @@ mach_msg_trap(
> ikm_free(kmsg);
> goto slow_get;
> }
> + rpc_trace(0, msg, send_size);
>
> fast_copyin:
> /*
> @@ -1194,6 +1195,7 @@ mach_msg_trap(
> if (!ikm_cache_free_try(kmsg))
> goto slow_put;
>
> + rpc_trace(1, msg, msg->msgh_size);
> thread_syscall_return(MACH_MSG_SUCCESS);
> /*NOTREACHED*/
> return MACH_MSG_SUCCESS; /* help for the compiler */
> diff --git a/kern/exception.c b/kern/exception.c
> index 15f29705..0e0260f0 100644
> --- a/kern/exception.c
> +++ b/kern/exception.c
> @@ -702,6 +702,7 @@ exception_raise(
> /*NOTREACHED*/
> }
>
> + rpc_trace(1, receiver->ith_msg, receiver->ith_msg->msgh_size);
> thread_syscall_return(MACH_MSG_SUCCESS);
> /*NOTREACHED*/
> #ifndef __GNUC__
> diff --git a/kern/startup.c b/kern/startup.c
> index 61751d35..ce8f0123 100644
> --- a/kern/startup.c
> +++ b/kern/startup.c
> @@ -47,6 +47,7 @@
> #include <kern/thread.h>
> #include <kern/thread_swap.h>
> #include <kern/timer.h>
> +#include <kern/trace.h>
> #include <kern/xpr.h>
> #include <kern/bootstrap.h>
> #include <kern/startup.h>
> @@ -163,6 +164,7 @@ void setup_main(void)
> compute_mach_factor();
>
> gsync_setup ();
> + trace_init();
>
> /*
> * Create a kernel thread to start the other kernel
> diff --git a/kern/thread.h b/kern/thread.h
> index 81d32924..5b5d2235 100644
> --- a/kern/thread.h
> +++ b/kern/thread.h
> @@ -240,6 +240,10 @@ struct thread {
> #endif
>
> char name[THREAD_NAME_SIZE];
> +
> +#if KERNEL_TRACE
> + unsigned int syscall_num; /* for syscall tracing */
> +#endif
> };
>
> #include <kern/cpu_number.h>
> diff --git a/kern/trace.c b/kern/trace.c
> new file mode 100644
> index 00000000..69e6eaf0
> --- /dev/null
> +++ b/kern/trace.c
> @@ -0,0 +1,432 @@
> +/*
> + * Copyright (C) 2024 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include "kern/thread.h"
> +#include <kern/gnumach.server.h>
> +#include <kern/kalloc.h>
> +#include <kern/printf.h>
> +#include <kern/syscall_sw.h>
> +#include <kern/trace.h>
> +#include <mach/message.h>
> +#include <mach/trace.h>
> +#include <machine/locore.h>
> +#include <string.h>
> +#include "vm/vm_kern.h"
> +
> +/* Common information for each trace entry */
> +struct msg_trace_header {
> + char task_name[TASK_NAME_SIZE];
> + thread_t thread;
> + //int time; // to address thread/task reuse? we should trace also task
> creation/destruction...
> + unsigned int type : 8;
> + unsigned int size : 24; // in bytes, can probably be reduced
> + uint8_t data[];
> +} __attribute__((packed));
> +
> +struct syscall_trace_data {
> + unsigned int num : 7; // syscall id
> + unsigned int direction : 1; // entry/exit
> + unsigned int reserved : 24;
> + uint8_t data[]; // args/retcode, depending on the syscall number and
> direction
> +} __attribute__((packed));
> +
> +struct rpc_trace_data {
> + unsigned int size : 16; // in bytes
> + unsigned int direction : 1; // send-copyin/recv-copyout
> + unsigned int reserved : 15;
> + uint8_t data[];
> +} __attribute__((packed));
> +
> +struct trap_trace_data {
> + unsigned int origin : 4; // user/kernel
> + unsigned int reserved : 28;
> + struct i386_saved_state regs;
> +} __attribute__((packed));
> +
> +struct csw_trace_data {
> + char old_task_name[TASK_NAME_SIZE];
> + thread_t old_thread;
> + continuation_t continuation;
> + char new_task_name[TASK_NAME_SIZE];
> + thread_t new_thread;
> +} __attribute__((packed));
> +
> +struct irq_trace_data {
> + int irqn;
> +} __attribute__((packed));
> +
> +struct sample_trace_data {
> + unsigned int linenum;
> + char filename[];
> +} __attribute__((packed));
> +
> +static uint8_t *msg_trace_buf = NULL;
> +static vm_size_t msg_trace_size = 0;
> +static uint8_t *msg_trace_ptr = NULL;
> +static int msg_trace_overrun = 0;
> +
> +int syscall_trace;
> +static int trace_rpc_enable;
> +static int trace_trap_enable;
> +static int trace_csw_enable;
> +int interrupt_trace_enable;
> +
> +task_t syscall_trace_task = 0;
> +unsigned int syscall_seqn = 0;
> +
> +/* Called to initialize kernel tracing */
> +void trace_init(void)
> +{
> + msg_trace_buf = NULL;
> + msg_trace_ptr = NULL;
> + msg_trace_size = 0;
> + msg_trace_overrun = 0;
> +
> + syscall_trace = 0;
> + trace_rpc_enable = 0;
> + trace_trap_enable = 0;
> + trace_csw_enable = 0;
> + interrupt_trace_enable = 0;
> +
> + /* Useful for debugging small tests and the bootstrap modules */
> + extern char* kernel_cmdline; // TODO: find a nice header
> + if (strstr(kernel_cmdline, "tracestart") != NULL)
> + {
> + kern_return_t ret = ktrace_start(&realhost, 20*1024*1024,
> TRACE_TYPE_ALL);
> + if (ret == KERN_SUCCESS)
> + printf("trace: started from command line\n");
> + else
> + panic("Can't initialize trace buffer: error %d\n", ret);
> + }
> +}
> +
> +kern_return_t ktrace_start(const host_t host_priv, rpc_vm_size_t bufsize,
> + rpc_long_integer_t kinds)
> +{
> + if (host_priv == HOST_NULL)
> + return KERN_INVALID_ARGUMENT;
> +
> + if (msg_trace_buf != NULL)
> + return KERN_FAILURE;
> +
> + int kr = kmem_alloc(kernel_map, (vm_offset_t*) &msg_trace_buf, bufsize);
> + if (kr != KERN_SUCCESS)
> + return KERN_RESOURCE_SHORTAGE;
> +
> + if (msg_trace_buf == 0)
> + return KERN_RESOURCE_SHORTAGE;
> +
> + msg_trace_ptr = msg_trace_buf;
> + msg_trace_size = bufsize;
> + msg_trace_overrun = 0;
> +
> + if (kinds & TRACE_TYPE_SYSCALL)
> + syscall_trace = 1;
> + if (kinds & TRACE_TYPE_RPC)
> + trace_rpc_enable = 1;
> + if (kinds & TRACE_TYPE_TRAP)
> + trace_trap_enable = 1;
> + if (kinds & TRACE_TYPE_CSW)
> + trace_csw_enable = 1;
> + if (kinds & TRACE_TYPE_IRQ)
> + interrupt_trace_enable = 1;
> +
> + if (current_thread())
> + current_thread()->syscall_num = 25; // fake mach_msg() entry for
> better trace
> +
> + return KERN_SUCCESS;
> +}
> +
> +kern_return_t ktrace_stop(const host_t host_priv)
> +{
> + if (host_priv == HOST_NULL)
> + return KERN_INVALID_ARGUMENT;
> +
> + if (msg_trace_buf == NULL)
> + return KERN_FAILURE;
> +
> + syscall_trace = 0;
> + trace_rpc_enable = 0;
> + trace_trap_enable = 0;
> + trace_csw_enable = 0;
> + interrupt_trace_enable = 0;
> + return KERN_SUCCESS;
> +}
> +
> +kern_return_t ktrace_free(const host_t host_priv)
> +{
> + if (host_priv == HOST_NULL)
> + return KERN_INVALID_ARGUMENT;
> +
> + if (msg_trace_buf == NULL)
> + return KERN_FAILURE;
> +
> + kmem_free(kernel_map, (vm_offset_t) msg_trace_buf, msg_trace_size);
> + msg_trace_buf = NULL;
> + msg_trace_ptr = NULL;
> + return KERN_SUCCESS;
> +}
> +
> +kern_return_t ktrace_collect(const host_t host_priv, pointer_t *buf,
> + mach_msg_type_number_t *bufsize,
> + vm_size_t *last_ptr, natural_t *flags)
> +{
> + if (host_priv == HOST_NULL)
> + return KERN_INVALID_ARGUMENT;
> +
> + if (msg_trace_buf == NULL)
> + return KERN_FAILURE;
> +
> + kern_return_t err;
> + vm_map_copy_t copy;
> + err = vm_map_copyin(kernel_map, (vm_offset_t) msg_trace_buf,
> msg_trace_size,
> + FALSE, ©);
> + if (err == KERN_SUCCESS)
> + {
> + *buf = (pointer_t) copy;
> + *bufsize = msg_trace_size;
> + *last_ptr = msg_trace_ptr - msg_trace_buf;
> + *flags = 0;
> + if (msg_trace_overrun > 0)
> + {
> + if (msg_trace_overrun < TH_OVERRUN_MASK)
> + *flags |= msg_trace_overrun & TH_OVERRUN_MASK;
> + else
> + *flags |= TH_OVERRUN_MASK;
> + }
> +#ifdef __LP64__
> + *flags |= TH_KERNEL64;
> +#ifndef USER32
> + *flags |= TH_USER64;
> +#endif /* USER32 */
> +#endif /* __x86_64__ */
> + }
> + return err;
> +}
> +
> +/*
> + * Starts a new trace entry. It pre-allocates a given size and returns a
> pointer
> + * to a region that can be filled with the same amount of data.
> + * In this way we can copy the data only once, e.g. when tracing
> mach_msg()
> + */
> +static void* trace_msg(unsigned int type, size_t size)
> +{
> + const size_t msg_size = size + sizeof(struct msg_trace_header);
> + int overrun = 0;
> + uint8_t *tptr, *tptr_orig;
> + struct msg_trace_header *cur;
> +
> + /*
> + * Handle interrupitons from trap/irq in reserving the trace data
> + * entry.
> + *
> + * TODO: ensure this is SMP-safe and maybe add cpuid to the trace
> + * header.
> + */
> + do {
> + tptr = __atomic_load_n(&msg_trace_ptr, __ATOMIC_RELAXED);
> + tptr_orig = tptr;
> +
> + if ((tptr - msg_trace_buf) > (msg_trace_size - msg_size))
> + {
> + // circular buffer, reset to the beginning
> + tptr = msg_trace_buf;
> + overrun = 1;
> + }
> +
> + tptr += msg_size;
> +
> + } while (! __atomic_compare_exchange_n(&msg_trace_ptr, &tptr_orig,
> tptr, 0,
> + __ATOMIC_RELAXED,
> __ATOMIC_RELAXED));
> +
> + if (overrun)
> + msg_trace_overrun += 1;
> +
> + cur = (struct msg_trace_header*)(tptr - msg_size);
> + memcpy(cur->task_name, current_task()->name, TASK_NAME_SIZE);
> + cur->thread = current_thread();
> + cur->type = type;
> + cur->size = msg_size;
> + return cur->data;
> +}
> +
> +#ifdef SYSCALL_TRACE_RECORD
> +static void trace_syscall_entry(int num, long *args)
> +{
> + const mach_trap_t *trap = &mach_trap_table[num];
> + size_t argsize = sizeof(long)*trap->mach_trap_arg_count;
> + size_t size = sizeof(struct syscall_trace_data) + argsize;
> + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size);
> + data->direction = 0;
> + data->num = num;
> + memcpy(data->data, args, argsize);
> +}
> +
> +static void trace_syscall_exit(int num, long ret)
> +{
> + size_t argsize = sizeof(long);
> + size_t size = sizeof(struct syscall_trace_data) + argsize;
> + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size);
> + data->direction = 1;
> + data->num = num;
> + memcpy(data->data, &ret, argsize);
> +}
> +#endif /* SYSCALL_TRACE_RECORD */
> +
> +/* This needs to be called from the machine-specific entry point */
> +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS])
> +{
> +
> + if (syscall_trace == 0)
> + goto out;
> +
> + current_thread()->syscall_num = sysnum;
> + if (syscall_trace_task && syscall_trace_task != current_task())
> + goto out;
> +
> +#ifdef SYSCALL_TRACE_PRINT
> + if (sysnum == 30) // don't track mach_print()
> + goto out;
> +
> + syscall_seqn++;
> + const mach_trap_t *trap = &mach_trap_table[sysnum];
> + printf("%08d:%s:0x%p:%s(",
> + syscall_seqn, current_task()->name, current_thread(),
> trap->mach_trap_name);
> + for (int i=0; i<MAX_SYSCALL_ARGS; i++)
> + {
> + int value = argv[i];
> + if (trap->mach_trap_arg_count <= i)
> + continue;
> + if ((value > 1024) || (value < -1024))
> + printf("0x%08x", value);
> + else
> + printf("%d", value);
> + if (i + 1 < trap->mach_trap_arg_count)
> + printf(", ");
> + }
> + printf(")\n");
> +#else /* SYSCALL_TRACE_PRINT */
> + trace_syscall_entry(sysnum, argv);
> +#endif
> + out:
> + return;
> +}
> +
> +/* This needs to be called from the machine-specific entry point */
> +int syscall_trace_return(int val)
> +{
> + if (syscall_trace == 0)
> + return val;
> + if (syscall_trace_task && syscall_trace_task != current_task())
> + return val;
> +#ifdef SYSCALL_TRACE_PRINT
> + if (current_thread()->syscall_num != 30) // don't track mach_print()
> + {
> + const mach_trap_t *trap =
> &mach_trap_table[current_thread()->syscall_num];
> + printf("%08d:%s:0x%p: %s return(%08x)\n",
> + syscall_seqn, current_task()->name, current_thread(),
> + trap->mach_trap_name, val);
> + }
> +#else /* SYSCALL_TRACE_PRINT */
> + trace_syscall_exit(current_thread()->syscall_num, val);
> +#endif
> +
> + return val;
> +}
> +
> +void rpc_trace(int direction, void *msg, long msize)
> +{
> + if (!trace_rpc_enable)
> + return;
> +
> + size_t size = sizeof(struct rpc_trace_data);
> + struct rpc_trace_data *data = trace_msg(DEBUG_TRACE_RPC, size + msize);
> + data->direction = direction;
> + data->size = msize;
> + if (copyin(msg, data->data, msize))
> + memset(data->data, 0, msize);
> +}
> +
> +/* This needs to be called from the machine-specific entry point */
> +void trap_trace(int origin, struct i386_saved_state *regs)
> +{
> + if (!trace_trap_enable)
> + return;
> +
> + size_t size = sizeof(struct trap_trace_data);
> + struct trap_trace_data *data = trace_msg(DEBUG_TRACE_TRAP, size);
> + data->origin = origin;
> + memcpy(&data->regs, regs, sizeof(*regs));
> +}
> +
> +void context_switch_trace(thread_t old, continuation_t continuation,
> thread_t new)
> +{
> + if (!trace_csw_enable)
> + return;
> +
> + size_t size = sizeof(struct csw_trace_data);
> + struct csw_trace_data *data = trace_msg(DEBUG_TRACE_CSW, size);
> + if (old)
> + strncpy(data->old_task_name, old->task->name, TASK_NAME_SIZE);
> + else
> + data->old_task_name[0] = '\0';
> + data->old_thread = old;
> + data->continuation = continuation;
> + strncpy(data->new_task_name, new->task->name, TASK_NAME_SIZE);
> + data->new_thread = new;
> +}
> +
> +void interrupt_trace(int iunit, /* 'unit' number */
> + int old_ipl, /* old interrupt level */
> + const char *ret_addr, /* return address in interrupt
> handler */
> + struct i386_interrupt_state *regs, /* saved
> registers */
> + int irqn)
> +{
> + size_t size = sizeof(struct irq_trace_data);
> + struct irq_trace_data *data = trace_msg(DEBUG_TRACE_IRQ, size);
> + data->irqn = irqn;
> +}
> +
> +/* This trace type is useful during debug to create on-the-fly custom
> + traces, e.g. to follow the evolution of a specific value or
> + expression along other trace points */
> +void dummy_trace(void)
> +{
> + struct dummy_data {
> + int nthreads;
> + };
> + size_t size = sizeof(struct dummy_data);
> + struct dummy_data *data = trace_msg(DEBUG_TRACE_DUMMY, size);
> +
> + // as an example, we count the threads
> + task_t task;
> + thread_t thread;
> + processor_set_t pset;
> + data->nthreads = 0;
> + if (queue_first(&all_psets) == 0)
> + return;
> + queue_iterate(&all_psets, pset, processor_set_t, all_psets) {
> + if (queue_first(&pset->tasks) == 0)
> + continue;
> + queue_iterate(&pset->tasks, task, task_t, pset_tasks) {
> + queue_iterate(&task->thread_list, thread, thread_t, thread_list) {
> + data->nthreads++;
> + }
> + }
> + }
> +}
> diff --git a/kern/trace.h b/kern/trace.h
> new file mode 100644
> index 00000000..06d77e93
> --- /dev/null
> +++ b/kern/trace.h
> @@ -0,0 +1,42 @@
> +/*
> + * Copyright (C) 2023 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#if KERNEL_TRACE
> +
> +#include <mach/trace.h>
> +#include <kern/thread.h>
> +
> +/* syscall events */
> +#define MAX_SYSCALL_ARGS 12
> +//#define SYSCALL_TRACE_PRINT // useful for small tests
> +#define SYSCALL_TRACE_RECORD
> +
> +void trace_init(void);
> +void rpc_trace(int direction, void *msg, long size);
> +void trap_trace(int origin, struct i386_saved_state *regs);
> +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS]);
> +int syscall_trace_return(int val);
> +void context_switch_trace(thread_t old, continuation_t continuation,
> thread_t new);
> +
> +#else /* ! KERNEL_TRACE */
> +
> +#define trace_init()
> +#define trap_trace(origin, regs)
> +#define context_switch_trace(old, cont, new)
> +
> +#endif /* KERNEL_TRACE */
> diff --git a/scripts/gnumach-gdb.py b/scripts/gnumach-gdb.py
> new file mode 100644
> index 00000000..16728c06
> --- /dev/null
> +++ b/scripts/gnumach-gdb.py
> @@ -0,0 +1,63 @@
> +
> +import logging
> +
> +import gdb
> +
> +
> +log = logging.getLogger("gnumach-gdb")
> +on64bit = gdb.selected_inferior().architecture().name() == "i386:x86-64"
> +on64bituser = gdb.lookup_symbol("syscall64")[0] is not None
> +
> +
> +class TraceCmd(gdb.Command):
> + """gnumach-trace [FILE]
> +
> +Save the content of the kernel trace buffer to a file (default
> ./trace.bin).
> +"""
> +
> + def __init__(self):
> + super().__init__("gnumach-trace", gdb.COMMAND_USER)
> +
> + def invoke(self, arg_string, from_tty):
> + try:
> + self._invoke(arg_string, from_tty)
> + except Exception:
> + log.exception(arg_string)
> +
> + def _invoke(self, arg_string, from_tty):
> + args = gdb.string_to_argv(arg_string)
> + if args in ['-h', '--help']:
> + print(TraceCmd.__doc__)
> + return
> + outfile = "trace.bin"
> + if len(args) >= 1:
> + outfile = args[0]
> +
> + tbuf = gdb.lookup_static_symbol("msg_trace_buf")
> + if tbuf is None:
> + log.error("trace buffer not available.")
> + return
> + tbuf_base = gdb.lookup_static_symbol("msg_trace_buf").value()
> + tbuf_top = gdb.lookup_static_symbol("msg_trace_ptr").value()
> + tbuf_overrun =
> gdb.lookup_static_symbol("msg_trace_overrun").value()
> + tbuf_len = tbuf_top - tbuf_base
> +
> + print(f"Writing {tbuf_len} bytes to {outfile} (overrun
> {tbuf_overrun})")
> + proc = gdb.selected_inferior()
> + with open(outfile, 'wb') as tf:
> + # trace header (see mach/trace.h)
> + tf.write(b"GMTR")
> + flags = int(tbuf_overrun & 0xF)
> + if on64bit:
> + flags |= 0x10
> + if on64bituser:
> + flags |= 0x20
> + tf.write(flags.to_bytes(4, byteorder='little'))
> + tf.write(int(tbuf_len).to_bytes(4, byteorder='little'))
> + tf.write(int(0).to_bytes(4, byteorder='little'))
> + # trace payload
> + tf.write(proc.read_memory(tbuf_base, tbuf_len))
> +
> +
> +logging.basicConfig(level=logging.INFO)
> +TraceCmd()
> diff --git a/tests/test-trace.c b/tests/test-trace.c
> new file mode 100644
> index 00000000..d6bc8051
> --- /dev/null
> +++ b/tests/test-trace.c
> @@ -0,0 +1,58 @@
> +/*
> + * Copyright (C) 2024 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include <syscalls.h>
> +#include <testlib.h>
> +
> +#include <mach/trace.h>
> +#include <gnumach.user.h>
> +#include <mach.user.h>
> +
> +void test_trace(void)
> +{
> + int err;
> + err = ktrace_start(host_priv(), 40 * 1024 * 1024, TRACE_TYPE_ALL);
> + ASSERT_RET(err, "ktrace_start");
> +
> + err = mach_msg(NULL, MACH_MSG_OPTION_NONE, 0, 0,
> + MACH_PORT_NULL, 0, MACH_PORT_NULL);
> + ASSERT_RET(err, "syscall path with thread_syscall_return()");
> +
> + err = ktrace_stop(host_priv());
> + ASSERT_RET(err, "ktrace_stop");
> +
> + vm_offset_t buf, last_ptr;
> + mach_msg_type_number_t bufsize;
> + int flags;
> + err = ktrace_collect(host_priv(), &buf, &bufsize, &last_ptr, &flags);
> + ASSERT_RET(err, "ktrace_collect");
> + printf("size %lu last %lu flags %x\n", bufsize, last_ptr, flags);
> + ASSERT(bufsize >= last_ptr, "ERROR overrun?");
> +
> + err = vm_deallocate(mach_task_self(), buf, bufsize);
> + ASSERT_RET(err, "vm_deallocate");
> +
> + err = ktrace_free(host_priv());
> + ASSERT_RET(err, "ktrace_free");
> +}
> +
> +int main(int argc, char *argv[], int envc, char *envp[])
> +{
> + test_trace();
> + return 0;
> +}
> diff --git a/tests/user-qemu.mk b/tests/user-qemu.mk
> index fd5ae1ab..c4df73f9 100644
> --- a/tests/user-qemu.mk
> +++ b/tests/user-qemu.mk
> @@ -204,7 +204,8 @@ USER_TESTS := \
> tests/test-syscalls \
> tests/test-machmsg \
> tests/test-task \
> - tests/test-threads
> + tests/test-threads \
> + tests/test-trace
>
> USER_TESTS_CLEAN = $(subst tests/,clean-,$(USER_TESTS))
>
> diff --git a/x86_64/debug_trace.S b/x86_64/debug_trace.S
> index 7bed5ccc..97e48063 100644
> --- a/x86_64/debug_trace.S
> +++ b/x86_64/debug_trace.S
> @@ -30,19 +30,18 @@
>
> .text
> ENTRY(_debug_trace)
> + ud2 // to test, do we need to keep it in asm?
> pushf
> cli
> pushq %rax
> pushq %rbx
> - .byte 0x36 /* SS: bug in gas? */
> movl %ss:EXT(debug_trace_pos),%eax
> movq S_ARG0,%rbx
> - movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,16)
> - movl S_ARG1,%ebx
> - movl %ebx,%ss:EXT(debug_trace_buf)+8(,%eax,16)
> + movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,8)
> + movq S_ARG1,%rbx
> + movq %rbx,%ss:EXT(debug_trace_buf)+8(,%eax,8)
> incl %eax
> andl $DEBUG_TRACE_LEN-1,%eax
> - .byte 0x36 /* SS: bug in gas? */
> movl %eax,%ss:EXT(debug_trace_pos)
> popq %rbx
> popq %rax
> diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S
> index 6fb77727..72a2fd0c 100644
> --- a/x86_64/interrupt.S
> +++ b/x86_64/interrupt.S
> @@ -115,6 +115,25 @@ ENTRY(interrupt)
> shll $2,%eax /* irq * 4 */
> movl EXT(iunit)(%rax),%edi /* get device unit number as 1st
> arg */
>
> +#if KERNEL_TRACE
> + testb $0xff,EXT(interrupt_trace_enable)
> + jz 0f
> + movq S_IRQ,S_ARG4
> + pushq %rax
> + // save args for the real handler
> + pushq S_ARG0
> + pushq S_ARG1
> + pushq S_ARG2
> + pushq S_ARG3
> + callq EXT(interrupt_trace)
> + popq S_ARG3
> + popq S_ARG2
> + popq S_ARG1
> + popq S_ARG0
> + popq %rax
> +0:
> +#endif /* KERNEL_TRACE */
> +
> shll $1,%eax /* irq * 8 */
> call *EXT(ivect)(%rax) /* call interrupt handler */
>
> diff --git a/x86_64/locore.S b/x86_64/locore.S
> index 8f39a677..9e9e5833 100644
> --- a/x86_64/locore.S
> +++ b/x86_64/locore.S
> @@ -747,6 +747,15 @@ ENTRY(thread_bootstrap_return)
> */
>
> ENTRY(thread_syscall_return)
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 1f
> + /* first arg is already set in rdi, save it */
> + pushq %rdi
> + callq syscall_trace_return
> + popq %rdi
> +1:
> +#endif
> movq S_ARG0,%rax /* get return value */
> movq %rsp,%rcx /* get kernel stack */
> or $(KERNEL_STACK_SIZE-1),%rcx
> @@ -1275,16 +1284,41 @@ syscall_native:
> jnz 0b /* loop for all arguments */
>
> mach_call_call:
> -
> -#ifdef DEBUG
> +#if KERNEL_TRACE
> testb $0xff,EXT(syscall_trace)
> jz 0f
> - movq %rax,%rdi
> - call EXT(syscall_trace_print)
> - /* will return with syscallofs still (or again) in eax */
> + /*
> + * We need to save register arguments, for the actual tracing
> + * and to restore them before calling the real syscall.
> + */
> + pushq %r9
> + pushq %r8
> + pushq %rcx
> + pushq %rdx
> + pushq %rsi
> + pushq %rdi
> + pushq %rax
> + movq %rsp,%rdi
> + callq EXT(syscall_trace_enter)
> + popq %rax
> + popq %rdi
> + popq %rsi
> + popq %rdx
> + popq %rcx
> + popq %r8
> + popq %r9
> + movq %rax,%r12 /* save sysnum for tracing return value */
> 0:
> -#endif /* DEBUG */
> +#endif /* KERNEL_TRACE */
> call *EXT(mach_trap_table)+8(%rax) /* call procedure */
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 1f
> + movq %rax,%rdi
> + callq syscall_trace_return
> + /* will return with return value still (or again) in eax */
> +1:
> +#endif /* KERNEL_TRACE */
> movq %rsp,%rcx /* get kernel stack */
> or $(KERNEL_STACK_SIZE-1),%rcx
> movq -7-IKS_SIZE(%rcx),%rsp /* switch back to PCB stack */
> @@ -1460,7 +1494,45 @@ _syscall64_args_stack:
> jnz 0b /* loop for all remaining
> arguments */
>
> _syscall64_call:
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 0f
> + /*
> + * We need to save register arguments, for the actual tracing
> + * and to restore them before calling the real syscall.
> + * We store all arguments and the syscall number in an array
> + * on the stack; fortunately args > 6 are already there, so
> + * push the rest and restore them before actually calling the
> + * handler.
> + */
> + pushq %r9
> + pushq %r8
> + pushq %rcx
> + pushq %rdx
> + pushq %rsi
> + pushq %rdi
> + pushq %rax
> + movq %rsp,%rdi
> + callq EXT(syscall_trace_enter)
> + popq %rax
> + popq %rdi
> + popq %rsi
> + popq %rdx
> + popq %rcx
> + popq %r8
> + popq %r9
> + movq %rax,%r12 /* save sysnum for tracing return value */
> +0:
> +#endif /* KERNEL_TRACE */
> call *EXT(mach_trap_table)+8(%rax) /* call procedure */
> +#if KERNEL_TRACE
> + testb $0xff,EXT(syscall_trace)
> + jz 1f
> + movq %rax,%rdi
> + callq syscall_trace_return
> + /* will return with return value still (or again) in eax */
> +1:
> +#endif /* KERNEL_TRACE */
>
> _syscall64_check_for_ast:
> /* Check for ast. */
> --
> 2.39.2
>
>
>