This speeds up SMP again by storing the struct processor in a per-CPU area, which avoids an expensive cpu_number() lookup on every call of current_processor(), and by reading the CPU number from an offset into the per-CPU area instead. Still needs work for 64-bit and for replacing the other per-CPU arrays. --- i386/Makefrag.am | 2 ++ i386/i386/cpu_number.c | 8 ++++- i386/i386/cpu_number.h | 1 + i386/i386/gdt.c | 9 +++++- i386/i386/gdt.h | 11 ++++++- i386/i386/i386asm.sym | 7 ---- i386/i386/locore.S | 13 +++++--- i386/i386/mp_desc.c | 4 ++- i386/i386/percpu.c | 30 ++++++++++++++++++ i386/i386/percpu.h | 72 ++++++++++++++++++++++++++++++++++++++++++ kern/processor.c | 7 ++-- kern/processor.h | 18 ++++------- 12 files changed, 150 insertions(+), 32 deletions(-) create mode 100644 i386/i386/percpu.c create mode 100644 i386/i386/percpu.h
diff --git a/i386/Makefrag.am b/i386/Makefrag.am index 274e8695..c1724cea 100644 --- a/i386/Makefrag.am +++ b/i386/Makefrag.am @@ -108,6 +108,8 @@ libkernel_a_SOURCES += \ i386/i386/irq.c \ i386/i386/irq.h \ i386/i386/msr.h \ + i386/i386/percpu.c \ + i386/i386/percpu.h \ i386/i386/pit.c \ i386/i386/pit.h diff --git a/i386/i386/cpu_number.c b/i386/i386/cpu_number.c index ef19e11f..241015b5 100644 --- a/i386/i386/cpu_number.c +++ b/i386/i386/cpu_number.c @@ -20,11 +20,17 @@ #include <i386/smp.h> #include <i386/cpu.h> #include <i386/mp_desc.h> +#include <i386/percpu.h> #include <kern/printf.h> #if NCPUS > 1 -int cpu_number(void) +int cpu_number_slow(void) { return cpu_id_lut[apic_get_current_cpu()]; } + +int cpu_number(void) +{ + return *((int *)percpu_ptr(int, cpu_id)); +} #endif diff --git a/i386/i386/cpu_number.h b/i386/i386/cpu_number.h index 479a847a..098696a3 100644 --- a/i386/i386/cpu_number.h +++ b/i386/i386/cpu_number.h @@ -65,6 +65,7 @@ #ifndef __ASSEMBLER__ #include "kern/cpu_number.h" +int cpu_number_slow(void); int cpu_number(void); #endif diff --git a/i386/i386/gdt.c b/i386/i386/gdt.c index ddda603b..e335de50 100644 --- a/i386/i386/gdt.c +++ b/i386/i386/gdt.c @@ -35,6 +35,7 @@ #include <kern/assert.h> #include <intel/pmap.h> +#include <machine/percpu.h> #include "vm_param.h" #include "seg.h" @@ -73,6 +74,11 @@ gdt_fill(struct real_descriptor *mygdt) 0xffffffff, ACC_PL_K|ACC_DATA_W, SZ_32); #endif /* MACH_PV_DESCRIPTORS */ + vm_offset_t thiscpu = kvtolin(&percpu_array[cpu_number_slow()]); + _fill_gdt_descriptor(mygdt, PERCPU_DS, + thiscpu, + thiscpu + sizeof(struct percpu) - 1, + ACC_PL_K|ACC_DATA_W, SZ_32); #endif #ifdef MACH_PV_DESCRIPTORS @@ -119,8 +125,9 @@ reload_segs(void) "movw %w1,%%ds\n" "movw %w1,%%es\n" + "movw %w3,%%gs\n" "movw %w1,%%ss\n" - : : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0)); + : : "i" (KERNEL_CS), "r" (KERNEL_DS), "r" (0), "r" (PERCPU_DS)); #endif } diff --git a/i386/i386/gdt.h b/i386/i386/gdt.h index 5def73cb..d5d78d43 100644 
--- a/i386/i386/gdt.h +++ b/i386/i386/gdt.h @@ -77,12 +77,20 @@ /* 0x58 used by user TSS in 64bit mode */ +#ifdef __x86_64__ +/* XXX */ +#else +#define PERCPU_DS 0x68 /* per-cpu data mapping */ +#endif + #ifdef __x86_64__ #define GDTSZ sel_idx(0x60) #else -#define GDTSZ sel_idx(0x58) +#define GDTSZ sel_idx(0x70) #endif +#ifndef __ASSEMBLER__ + extern struct real_descriptor gdt[GDTSZ]; /* Fill a segment descriptor in the GDT. */ @@ -117,4 +125,5 @@ extern struct real_descriptor gdt[GDTSZ]; extern void gdt_init(void); extern void ap_gdt_init(int cpu); +#endif /* __ASSEMBLER__ */ #endif /* _I386_GDT_ */ diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym index 436e296a..832c7041 100644 --- a/i386/i386/i386asm.sym +++ b/i386/i386/i386asm.sym @@ -154,17 +154,10 @@ expr NPTES PTES_PER_PAGE expr INTEL_PTE_VALID|INTEL_PTE_WRITE INTEL_PTE_KERNEL expr IDTSZ -expr GDTSZ -expr LDTSZ expr KERNEL_RING - expr KERNEL_CS expr KERNEL_DS -expr KERNEL_TSS -#ifndef MACH_PV_DESCRIPTORS -expr KERNEL_LDT -#endif /* MACH_PV_DESCRIPTORS */ expr (VM_MIN_KERNEL_ADDRESS>>PDESHIFT)*sizeof(pt_entry_t) KERNELBASEPDE diff --git a/i386/i386/locore.S b/i386/i386/locore.S index 55aa9d60..463cce55 100644 --- a/i386/i386/locore.S +++ b/i386/i386/locore.S @@ -33,6 +33,7 @@ #include <i386/proc_reg.h> #include <i386/trap.h> #include <i386/seg.h> +#include <i386/gdt.h> #include <i386/ldt.h> #include <i386/i386asm.h> #include <i386/cpu_number.h> @@ -468,7 +469,8 @@ trap_push_segs: mov %ax,%ds /* (same as kernel stack segment) */ mov %ax,%es mov %ax,%fs - mov %ax,%gs + mov $(PERCPU_DS),%ax + movw %ax,%gs trap_set_segs: cld /* clear direction flag */ @@ -686,7 +688,8 @@ ENTRY(all_intrs) mov %dx,%ds mov %dx,%es mov %dx,%fs - mov %dx,%gs + mov $(PERCPU_DS),%dx + movw %dx,%gs CPU_NUMBER(%edx) @@ -792,7 +795,8 @@ ast_from_interrupt: mov %dx,%ds mov %dx,%es mov %dx,%fs - mov %dx,%gs + mov $(PERCPU_DS),%dx + movw %dx,%gs CPU_NUMBER(%edx) TIME_TRAP_UENTRY @@ -1051,7 +1055,8 @@ syscall_entry_2: mov %dx,%ds mov 
%dx,%es mov %dx,%fs - mov %dx,%gs + mov $(PERCPU_DS),%dx + movw %dx,%gs /* * Shuffle eflags,eip,cs into proper places diff --git a/i386/i386/mp_desc.c b/i386/i386/mp_desc.c index f1a1f989..465ffacc 100644 --- a/i386/i386/mp_desc.c +++ b/i386/i386/mp_desc.c @@ -143,6 +143,8 @@ mp_desc_init(int mycpu) struct mp_desc_table *mpt; vm_offset_t mem; + init_percpu(mycpu); + if (mycpu == 0) { /* * Master CPU uses the tables built at boot time. @@ -275,7 +277,7 @@ cpu_setup(int cpu) void cpu_ap_main() { - int cpu = cpu_number(); + int cpu = cpu_number_slow(); do { cpu_pause(); diff --git a/i386/i386/percpu.c b/i386/i386/percpu.c new file mode 100644 index 00000000..0bc8b234 --- /dev/null +++ b/i386/i386/percpu.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023 Free Software Foundation, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#include <i386/smp.h> +#include <i386/apic.h> +#include <i386/percpu.h> + +struct percpu percpu_array[NCPUS] __aligned(0x8000); + +void init_percpu(int cpu) +{ + int apic_id = apic_get_current_cpu(); + + percpu_array[cpu].self = &percpu_array[cpu]; + percpu_array[cpu].apic_id = apic_id; + percpu_array[cpu].cpu_id = cpu_id_lut[apic_id]; +} diff --git a/i386/i386/percpu.h b/i386/i386/percpu.h new file mode 100644 index 00000000..b22d512c --- /dev/null +++ b/i386/i386/percpu.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2023 Free Software Foundation, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _PERCPU_H_ +#define _PERCPU_H_ + +#include <kern/ast.h> +#include <kern/processor.h> +#include <kern/thread.h> +#include <kern/timer.h> +#include <i386/mp_desc.h> +#include <i386/spl.h> +#include <intel/pmap.h> +#include <ipc/ipc_kmsg.h> + +#define percpu_assign(stm, val) \ + asm("mov %0, %%gs:%1" \ + : : "r" (val), "m" (__builtin_offsetof(struct percpu, stm))); + +#define percpu_ptr(typ, stm) \ +MACRO_BEGIN \ + typ *ptr_ = (typ *)__builtin_offsetof(struct percpu, stm); \ + \ + asm("add %%gs:0, %0" \ + : "+r" (ptr_) \ + : ); \ + \ + ptr_; \ +MACRO_END + +struct percpu { + struct percpu *self; + struct processor processor; +/* + struct machine_slot machine_slot; + struct mp_desc_table mp_desc_table; + thread_t active_thread; + vm_offset_t active_stack; + vm_offset_t int_stack_top; + vm_offset_t int_stack_base; + ast_t need_ast; + ipc_kmsg_t ipc_kmsg_cache; + pmap_update_list cpu_update_list; + spl_t saved_ipl; + spl_t curr_ipl; + timer_data_t kernel_timer; + timer_t current_timer; + unsigned long in_interrupt; +*/ + int apic_id; + int cpu_id; +}; + +extern struct percpu percpu_array[NCPUS]; + +void init_percpu(int cpu); + +#endif /* _PERCPU_H_ */ diff --git a/kern/processor.c b/kern/processor.c index 2cd6d46c..76735381 100644 --- a/kern/processor.c +++ b/kern/processor.c @@ -60,14 +60,12 @@ struct kmem_cache pset_cache; int master_cpu; struct processor_set default_pset; -struct processor processor_array[NCPUS]; queue_head_t all_psets; int all_psets_count; def_simple_lock_data(, all_psets_lock); processor_t master_processor; -processor_t processor_ptr[NCPUS]; /* * Bootstrap the processor/pset system so the scheduler can run. @@ -81,10 +79,9 @@ void pset_sys_bootstrap(void) for (i = 0; i < NCPUS; i++) { /* * Initialize processor data structures. - * Note that cpu_to_processor(i) is processor_ptr[i]. + * Note that cpu_to_processor is processor_ptr. 
*/ - processor_ptr[i] = &processor_array[i]; - processor_init(processor_ptr[i], i); + processor_init(processor_ptr(i), i); } master_processor = cpu_to_processor(master_cpu); queue_init(&all_psets); diff --git a/kern/processor.h b/kern/processor.h index 17b784a3..d83cdf3c 100644 --- a/kern/processor.h +++ b/kern/processor.h @@ -112,6 +112,8 @@ struct processor { typedef struct processor Processor; extern struct processor processor_array[NCPUS]; +#include <machine/percpu.h> + /* * Chain of all processor sets. */ @@ -195,23 +197,15 @@ extern processor_t master_processor; #define PROCESSOR_ASSIGN 4 /* Assignment is changing */ #define PROCESSOR_SHUTDOWN 5 /* Being shutdown */ -/* - * Use processor ptr array to find current processor's data structure. - * This replaces a multiplication (index into processor_array) with - * an array lookup and a memory reference. It also allows us to save - * space if processor numbering gets too sparse. - */ - -extern processor_t processor_ptr[NCPUS]; - -#define cpu_to_processor(i) (processor_ptr[i]) +#define processor_ptr(i) (&percpu_array[i].processor) +#define cpu_to_processor processor_ptr -#define current_processor() (processor_ptr[cpu_number()]) +#define current_processor() (percpu_ptr(struct processor, processor)) #define current_processor_set() (current_processor()->processor_set) /* Compatibility -- will go away */ -#define cpu_state(slot_num) (processor_ptr[slot_num]->state) +#define cpu_state(slot_num) (processor_ptr(slot_num)->state) #define cpu_idle(slot_num) (cpu_state(slot_num) == PROCESSOR_IDLE) /* Useful lock macros */ -- 2.40.1