commit:     9291c0c0c06fefce13e8b3de296acf45d239d0e8
Author:     Arisu Tachibana <alicef <AT> gentoo <DOT> org>
AuthorDate: Mon Nov 3 01:31:15 2025 +0000
Commit:     Arisu Tachibana <alicef <AT> gentoo <DOT> org>
CommitDate: Mon Nov 3 01:31:15 2025 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=9291c0c0
Linux patch 6.17.7 Signed-off-by: Arisu Tachibana <alicef <AT> gentoo.org> 0000_README | 4 + 1006_linux-6.17.7.patch | 3343 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 3347 insertions(+) diff --git a/0000_README b/0000_README index 24a12c6c..6ab66ab4 100644 --- a/0000_README +++ b/0000_README @@ -67,6 +67,10 @@ Patch: 1005_linux-6.17.6.patch From: https://www.kernel.org Desc: Linux 6.17.6 +Patch: 1006_linux-6.17.7.patch +From: https://www.kernel.org +Desc: Linux 6.17.7 + Patch: 1510_fs-enable-link-security-restrictions-by-default.patch From: http://sources.debian.net/src/linux/3.16.7-ckt4-3/debian/patches/debian/fs-enable-link-security-restrictions-by-default.patch/ Desc: Enable link security restrictions by default. diff --git a/1006_linux-6.17.7.patch b/1006_linux-6.17.7.patch new file mode 100644 index 00000000..2130d13c --- /dev/null +++ b/1006_linux-6.17.7.patch @@ -0,0 +1,3343 @@ +diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +index 5964901d66e317..d0bdbd81dcf9f2 100644 +--- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst ++++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +@@ -218,6 +218,7 @@ SRSO X X X X + SSB X + TAA X X X X * (Note 2) + TSA X X X X ++VMSCAPE X + =============== ============== ============ ============= ============== ============ ======== + + Notes: +diff --git a/Makefile b/Makefile +index d090c7c253e8d3..570042d208fd3e 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,7 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 + VERSION = 6 + PATCHLEVEL = 17 +-SUBLEVEL = 6 ++SUBLEVEL = 7 + EXTRAVERSION = + NAME = Baby Opossum Posse + +diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c +index e9dad60b147f33..1ebb058904992b 100644 +--- a/arch/alpha/kernel/asm-offsets.c ++++ b/arch/alpha/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include <linux/types.h> + #include <linux/stddef.h> +diff --git a/arch/arc/kernel/asm-offsets.c b/arch/arc/kernel/asm-offsets.c +index f77deb7991757e..2978da85fcb65b 100644 +--- a/arch/arc/kernel/asm-offsets.c ++++ b/arch/arc/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) + */ ++#define COMPILE_OFFSETS + + #include <linux/sched.h> + #include <linux/mm.h> +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 123f4a8ef44660..2101938d27fcbc 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -7,6 +7,8 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS ++ + #include <linux/compiler.h> + #include <linux/sched.h> + #include <linux/mm.h> +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 30d4bbe68661f4..b6367ff3a49ca1 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -6,6 +6,7 @@ + * 2001-2002 Keith Owens + * Copyright (C) 2012 ARM Ltd. 
+ */ ++#define COMPILE_OFFSETS + + #include <linux/arm_sdei.h> + #include <linux/sched.h> +diff --git a/arch/csky/kernel/asm-offsets.c b/arch/csky/kernel/asm-offsets.c +index d1e9035794733d..5525c8e7e1d9ea 100644 +--- a/arch/csky/kernel/asm-offsets.c ++++ b/arch/csky/kernel/asm-offsets.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. ++#define COMPILE_OFFSETS + + #include <linux/sched.h> + #include <linux/kernel_stat.h> +diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c +index 03a7063f945614..50eea9fa6f1375 100644 +--- a/arch/hexagon/kernel/asm-offsets.c ++++ b/arch/hexagon/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * + * Copyright (c) 2010-2012, The Linux Foundation. All rights reserved. + */ ++#define COMPILE_OFFSETS + + #include <linux/compat.h> + #include <linux/types.h> +diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c +index db1e4bb26b6a01..3017c715760099 100644 +--- a/arch/loongarch/kernel/asm-offsets.c ++++ b/arch/loongarch/kernel/asm-offsets.c +@@ -4,6 +4,8 @@ + * + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ ++#define COMPILE_OFFSETS ++ + #include <linux/types.h> + #include <linux/sched.h> + #include <linux/mm.h> +diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c +index 906d7323053744..67a1990f9d748f 100644 +--- a/arch/m68k/kernel/asm-offsets.c ++++ b/arch/m68k/kernel/asm-offsets.c +@@ -9,6 +9,7 @@ + * #defines from the assembly-language output. + */ + ++#define COMPILE_OFFSETS + #define ASM_OFFSETS_C + + #include <linux/stddef.h> +diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c +index 104c3ac5f30c88..b4b67d58e7f6ae 100644 +--- a/arch/microblaze/kernel/asm-offsets.c ++++ b/arch/microblaze/kernel/asm-offsets.c +@@ -7,6 +7,7 @@ + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ ++#define COMPILE_OFFSETS + + #include <linux/init.h> + #include <linux/stddef.h> +diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c +index 1e29efcba46e57..5debd9a3854a9e 100644 +--- a/arch/mips/kernel/asm-offsets.c ++++ b/arch/mips/kernel/asm-offsets.c +@@ -9,6 +9,8 @@ + * Kevin Kissell, [email protected] and Carsten Langgaard, [email protected] + * Copyright (C) 2000 MIPS Technologies, Inc. + */ ++#define COMPILE_OFFSETS ++ + #include <linux/compat.h> + #include <linux/types.h> + #include <linux/sched.h> +diff --git a/arch/nios2/kernel/asm-offsets.c b/arch/nios2/kernel/asm-offsets.c +index e3d9b7b6fb48aa..88190b503ce5de 100644 +--- a/arch/nios2/kernel/asm-offsets.c ++++ b/arch/nios2/kernel/asm-offsets.c +@@ -2,6 +2,7 @@ + /* + * Copyright (C) 2011 Tobias Klauser <[email protected]> + */ ++#define COMPILE_OFFSETS + + #include <linux/stddef.h> + #include <linux/sched.h> +diff --git a/arch/openrisc/kernel/asm-offsets.c b/arch/openrisc/kernel/asm-offsets.c +index 710651d5aaae10..3cc826f2216b10 100644 +--- a/arch/openrisc/kernel/asm-offsets.c ++++ b/arch/openrisc/kernel/asm-offsets.c +@@ -18,6 +18,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. 
+ */ ++#define COMPILE_OFFSETS + + #include <linux/signal.h> + #include <linux/sched.h> +diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c +index 757816a7bd4b28..9abfe65492c65e 100644 +--- a/arch/parisc/kernel/asm-offsets.c ++++ b/arch/parisc/kernel/asm-offsets.c +@@ -13,6 +13,7 @@ + * Copyright (C) 2002 Randolph Chung <tausq with parisc-linux.org> + * Copyright (C) 2003 James Bottomley <jejb at parisc-linux.org> + */ ++#define COMPILE_OFFSETS + + #include <linux/types.h> + #include <linux/sched.h> +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index b3048f6d3822c0..a4bc80b30410ae 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include <linux/compat.h> + #include <linux/signal.h> +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index 6e8c0d6feae9e9..7d42d3b8a32a75 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -3,6 +3,7 @@ + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + */ ++#define COMPILE_OFFSETS + + #include <linux/kbuild.h> + #include <linux/mm.h> +diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c +index 95ecad9c7d7d27..a8915663e917fa 100644 +--- a/arch/s390/kernel/asm-offsets.c ++++ b/arch/s390/kernel/asm-offsets.c +@@ -4,6 +4,7 @@ + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ ++#define COMPILE_OFFSETS + + #include <linux/kbuild.h> + #include <linux/sched.h> +diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c +index a0322e8328456e..429b6a76314684 100644 +--- a/arch/sh/kernel/asm-offsets.c ++++ b/arch/sh/kernel/asm-offsets.c +@@ -8,6 +8,7 @@ + * compile this file to assembler, and then extract the + * #defines from the assembly-language output. + */ ++#define COMPILE_OFFSETS + + #include <linux/stddef.h> + #include <linux/types.h> +diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c +index 3d9b9855dce917..6e660bde48dd89 100644 +--- a/arch/sparc/kernel/asm-offsets.c ++++ b/arch/sparc/kernel/asm-offsets.c +@@ -10,6 +10,7 @@ + * + * On sparc, thread_info data is static and TI_XXX offsets are computed by hand. 
+ */ ++#define COMPILE_OFFSETS + + #include <linux/sched.h> + #include <linux/mm_types.h> +diff --git a/arch/um/kernel/asm-offsets.c b/arch/um/kernel/asm-offsets.c +index 1fb12235ab9c84..a69873aa697f4f 100644 +--- a/arch/um/kernel/asm-offsets.c ++++ b/arch/um/kernel/asm-offsets.c +@@ -1 +1,3 @@ ++#define COMPILE_OFFSETS ++ + #include <sysdep/kernel-offsets.h> +diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c +index 15da60cf69f20c..046d12281fd94e 100644 +--- a/arch/x86/events/intel/core.c ++++ b/arch/x86/events/intel/core.c +@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; +- u64 mask, bits = 0; + int idx = hwc->idx; ++ u64 bits = 0; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); +@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) + + idx -= INTEL_PMC_IDX_FIXED; + bits = intel_fixed_bits_by_idx(idx, bits); +- mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); +- +- if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { ++ if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) + bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); +- } + +- cpuc->fixed_ctrl_val &= ~mask; ++ cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); + cpuc->fixed_ctrl_val |= bits; + } + +diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h +index 70d1d94aca7e63..ee943bd1595af5 100644 +--- a/arch/x86/include/asm/perf_event.h ++++ b/arch/x86/include/asm/perf_event.h +@@ -35,7 +35,6 @@ + #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) + #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) + +-#define INTEL_FIXED_BITS_MASK 0xFULL + #define INTEL_FIXED_BITS_STRIDE 4 + #define INTEL_FIXED_0_KERNEL (1ULL << 0) + #define INTEL_FIXED_0_USER (1ULL << 1) +@@ -48,6 +47,11 @@ + #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) + #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) + ++#define INTEL_FIXED_BITS_MASK \ ++ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ ++ INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ ++ ICL_FIXED_0_ADAPTIVE) ++ + #define intel_fixed_bits_by_idx(_idx, _bits) \ + ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) + +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 36dcfc5105be9a..26ece97011fd7e 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -434,6 +434,9 @@ static bool __init should_mitigate_vuln(unsigned int bug) + case X86_BUG_SPEC_STORE_BYPASS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + ++ case X86_BUG_VMSCAPE: ++ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); ++ + default: + WARN(1, "Unknown bug %x\n", bug); + return false; +@@ -1460,8 +1463,12 @@ static void __init retbleed_update_mitigation(void) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: +- if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) +- pr_err(RETBLEED_INTEL_MSG); ++ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { ++ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE) ++ pr_err(RETBLEED_INTEL_MSG); ++ ++ retbleed_mitigation = RETBLEED_MITIGATION_NONE; ++ } + } + } + +@@ -2045,7 +2052,7 @@ static void __init spectre_v2_user_apply_mitigation(void) + static const char * const spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = 
"Vulnerable", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", +- [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", ++ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", +@@ -3304,15 +3311,18 @@ early_param("vmscape", vmscape_parse_cmdline); + + static void __init vmscape_select_mitigation(void) + { +- if (cpu_mitigations_off() || +- !boot_cpu_has_bug(X86_BUG_VMSCAPE) || ++ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + +- if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) +- vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; ++ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { ++ if (should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; ++ else ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ } + } + + static void __init vmscape_update_mitigation(void) +@@ -3626,9 +3636,6 @@ static const char *spectre_bhi_state(void) + + static ssize_t spectre_v2_show_state(char *buf) + { +- if (spectre_v2_enabled == SPECTRE_V2_LFENCE) +- return sysfs_emit(buf, "Vulnerable: LFENCE\n"); +- + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + +diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h +index ad89d0bd600581..103604c4b33b58 100644 +--- a/arch/x86/kvm/pmu.h ++++ b/arch/x86/kvm/pmu.h +@@ -13,7 +13,7 @@ + #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + +-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ ++/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ + #define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) + +diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c +index da38de20ae598b..cfbced95e944a4 100644 +--- a/arch/xtensa/kernel/asm-offsets.c ++++ b/arch/xtensa/kernel/asm-offsets.c +@@ -11,6 +11,7 @@ + * + * Chris Zankel <[email protected]> + */ ++#define COMPILE_OFFSETS + + #include <asm/processor.h> + #include <asm/coprocessor.h> +diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c +old mode 100755 +new mode 100644 +diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c +index 0f338adf7d9376..8689631f190536 100644 +--- a/drivers/edac/edac_mc_sysfs.c ++++ b/drivers/edac/edac_mc_sysfs.c +@@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); + DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); ++DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 12); ++DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 13); ++DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 14); ++DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, ++ channel_dimm_label_show, channel_dimm_label_store, 15); + + /* Total possible dynamic DIMM Label attribute file table */ + static struct attribute *dynamic_csrow_dimm_attr[] = { +@@ -320,6 +328,10 @@ static struct 
attribute *dynamic_csrow_dimm_attr[] = { + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, ++ &dev_attr_legacy_ch12_dimm_label.attr.attr, ++ &dev_attr_legacy_ch13_dimm_label.attr.attr, ++ &dev_attr_legacy_ch14_dimm_label.attr.attr, ++ &dev_attr_legacy_ch15_dimm_label.attr.attr, + NULL + }; + +@@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); + DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); ++DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 12); ++DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 13); ++DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 14); ++DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, ++ channel_ce_count_show, NULL, 15); + + /* Total possible dynamic ce_count attribute file table */ + static struct attribute *dynamic_csrow_ce_count_attr[] = { +@@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, ++ &dev_attr_legacy_ch12_ce_count.attr.attr, ++ &dev_attr_legacy_ch13_ce_count.attr.attr, ++ &dev_attr_legacy_ch14_ce_count.attr.attr, ++ &dev_attr_legacy_ch15_ce_count.attr.attr, + NULL + }; + +diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c +index 5c1fa1c0d12e3c..5a080ab65476da 100644 +--- a/drivers/edac/ie31200_edac.c ++++ b/drivers/edac/ie31200_edac.c +@@ -99,6 +99,8 @@ + + /* Alder Lake-S */ + #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 ++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ ++#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. 
i5-12600K */ + + /* Bartlett Lake-S */ + #define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 +@@ -761,6 +763,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = { + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, ++ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, ++ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, +diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c +old mode 100755 +new mode 100644 +diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c +old mode 100755 +new mode 100644 +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 70fc4e7cc5a0e6..0b02e36b30558e 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2087,10 +2087,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); ++ btrfs_put_root(log_tree_root); + if (ret) { + btrfs_handle_fs_error(fs_info, ret, + "Failed to recover log tree"); +- btrfs_put_root(log_tree_root); + return ret; + } + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 97d517cdf2df75..682d21a73a67a4 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4297,7 +4297,8 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + } + + static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++ struct find_free_extent_ctl *ffe_ctl, ++ struct btrfs_space_info *space_info) + { + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); +@@ -4321,6 +4322,7 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, + u64 avail = block_group->zone_capacity - block_group->alloc_offset; + + if (block_group_bits(block_group, ffe_ctl->flags) && ++ block_group->space_info == space_info && + avail >= ffe_ctl->num_bytes) { + ffe_ctl->hint_byte = block_group->start; + break; +@@ -4342,7 +4344,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, + return prepare_allocation_clustered(fs_info, ffe_ctl, + space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: +- return prepare_allocation_zoned(fs_info, ffe_ctl); ++ return prepare_allocation_zoned(fs_info, ffe_ctl, space_info); + default: + BUG(); + } +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 4031cbdea07400..41da405181b4f0 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3107,9 +3107,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) + goto out; + } + +- if (btrfs_is_zoned(fs_info)) +- btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, +- ordered_extent->disk_num_bytes); ++ ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ++ ordered_extent->disk_num_bytes); ++ if (ret) ++ goto out; + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 6776e6ab8d1080..fd4c1ca34b5e47 100644 +--- a/fs/btrfs/scrub.c ++++ 
b/fs/btrfs/scrub.c +@@ -1369,8 +1369,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. + */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); ++ div = clamp(bwlimit / (16 * 1024 * 1024), 1, 64); + + /* Start new epoch, set deadline */ + now = ktime_get(); +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index c5c0d9cf1a8088..a4e486a600bed0 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1806,7 +1806,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + } + /* see comments in should_cow_block() */ + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); +- smp_wmb(); ++ smp_mb__after_atomic(); + + btrfs_set_root_node(new_root_item, tmp); + /* record when the snapshot was created in key.offset */ +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index a997c7cc35a26f..30bc8eb28005cd 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -183,6 +183,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, + /* Only these key->types needs to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || ++ key->type == BTRFS_INODE_EXTREF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); +@@ -1782,6 +1783,39 @@ static int check_inode_ref(struct extent_buffer *leaf, + return 0; + } + ++static int check_inode_extref(struct extent_buffer *leaf, ++ struct btrfs_key *key, struct btrfs_key *prev_key, ++ int slot) ++{ ++ unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); ++ unsigned long end = ptr + btrfs_item_size(leaf, slot); ++ ++ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) ++ return -EUCLEAN; ++ ++ while (ptr < end) { ++ struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; ++ u16 namelen; ++ ++ if (unlikely(ptr + sizeof(*extref) > end)) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ++ ptr, end, sizeof(*extref)); ++ return -EUCLEAN; ++ } ++ ++ namelen = btrfs_inode_extref_name_len(leaf, extref); ++ if (unlikely(ptr + sizeof(*extref) + namelen > end)) { ++ inode_ref_err(leaf, slot, ++ "inode extref overflow, ptr %lu end %lu namelen %u", ++ ptr, end, namelen); ++ return -EUCLEAN; ++ } ++ ptr += sizeof(*extref) + namelen; ++ } ++ return 0; ++} ++ + static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) + { +@@ -1893,6 +1927,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); + break; ++ case BTRFS_INODE_EXTREF_KEY: ++ ret = check_inode_extref(leaf, key, prev_key, slot); ++ break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + ret = check_block_group_item(leaf, key, slot); + break; +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 7a63afedd01e6e..165d2ee500ca3b 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -347,6 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen, int level) + { ++ struct btrfs_trans_handle *trans = wc->trans; + struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + +@@ -361,18 +362,29 @@ static int process_one_buffer(struct btrfs_root *log, + }; + 
+ ret = btrfs_read_extent_buffer(eb, &check); +- if (ret) ++ if (ret) { ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; ++ } + } + + if (wc->pin) { +- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); +- if (ret) ++ ASSERT(trans != NULL); ++ ret = btrfs_pin_extent_for_log_replay(trans, eb); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); + return ret; ++ } + + if (btrfs_buffer_uptodate(eb, gen, 0) && +- btrfs_header_level(eb) == 0) ++ btrfs_header_level(eb) == 0) { + ret = btrfs_exclude_logged_extents(eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + } + return ret; + } +@@ -1784,6 +1796,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + else + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); + } else if (ret == -EEXIST) { + ret = 0; + } +@@ -2449,15 +2463,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + int i; + int ret; + ++ if (level != 0) ++ return 0; ++ + ret = btrfs_read_extent_buffer(eb, &check); + if (ret) + return ret; + +- level = btrfs_header_level(eb); +- +- if (level != 0) +- return 0; +- + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +@@ -2630,15 +2642,24 @@ static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) + { ++ int ret; ++ + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + +- if (trans) +- return btrfs_pin_reserved_extent(trans, eb); ++ if (trans) { ++ ret = btrfs_pin_reserved_extent(trans, eb); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ return ret; ++ } + +- return unaccount_log_buffer(eb->fs_info, eb->start); ++ ret = unaccount_log_buffer(eb->fs_info, eb->start); ++ if (ret) ++ btrfs_handle_fs_error(eb->fs_info, ret, NULL); ++ return ret; + } + + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, +@@ -2674,8 +2695,14 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); +- if (IS_ERR(next)) +- return PTR_ERR(next); ++ if (IS_ERR(next)) { ++ ret = PTR_ERR(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); ++ return ret; ++ } + + if (*level == 1) { + ret = wc->process_func(root, next, wc, ptr_gen, +@@ -2690,6 +2717,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +@@ -2705,6 +2736,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + ret = btrfs_read_extent_buffer(next, &check); + if (ret) { + free_extent_buffer(next); ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(fs_info, ret, NULL); + return ret; + } + +@@ -7434,7 +7469,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) + + log_root_tree->log_root = NULL; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); +- btrfs_put_root(log_root_tree); + + return 0; + error: +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 87c5dd3ad016e4..fcdf7b058a584c 
100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2464,16 +2464,17 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + return ret; + } + +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) + { + struct btrfs_block_group *block_group; + u64 min_alloc_bytes; + + if (!btrfs_is_zoned(fs_info)) +- return; ++ return 0; + + block_group = btrfs_lookup_block_group(fs_info, logical); +- ASSERT(block_group); ++ if (WARN_ON_ONCE(!block_group)) ++ return -ENOENT; + + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) +@@ -2490,6 +2491,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len + + out: + btrfs_put_block_group(block_group); ++ return 0; + } + + static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 6e11533b8e14c2..17c5656580dd97 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -83,7 +83,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); +-void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, ++int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); +@@ -234,8 +234,11 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + return true; + } + +-static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length) { } ++static inline int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, ++ u64 logical, u64 length) ++{ ++ return 0; ++} + + static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } +diff --git a/include/linux/audit.h b/include/linux/audit.h +index a394614ccd0b81..e3f06eba9c6e6e 100644 +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) + + static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) + { +- if (!audit_dummy_context()) ++ if (audit_enabled) + __audit_fanotify(response, friar); + } + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index fef93032fe7e4d..fd890b34a84038 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -1728,11 +1728,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + +- /* +- * A parent can be left with no CPU as long as there is no +- * task directly associated with the parent partition. 
+- */ +- if (nocpu) ++ if (tasks_nocpu_error(parent, cs, xcpus)) + return PERR_NOCPUS; + + /* +diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c +index 6c83ad674d0104..2609998ca07f1f 100644 +--- a/kernel/events/callchain.c ++++ b/kernel/events/callchain.c +@@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + struct perf_callchain_entry_ctx ctx; + int rctx, start_entry_idx; + ++ /* crosstask is not supported for user stacks */ ++ if (crosstask && user && !kernel) ++ return NULL; ++ + entry = get_callchain_entry(&rctx); + if (!entry) + return NULL; +@@ -240,18 +244,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + perf_callchain_kernel(&ctx, regs); + } + +- if (user) { ++ if (user && !crosstask) { + if (!user_mode(regs)) { +- if (current->mm) +- regs = task_pt_regs(current); +- else ++ if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + regs = NULL; ++ else ++ regs = task_pt_regs(current); + } + + if (regs) { +- if (crosstask) +- goto exit_put; +- + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + +@@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + } + } + +-exit_put: + put_callchain_entry(rctx); + + return entry; +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 6e9427c4aaff70..c0e938d28758f5 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -7440,7 +7440,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, + if (user_mode(regs)) { + regs_user->abi = perf_reg_abi(current); + regs_user->regs = regs; +- } else if (!(current->flags & PF_KTHREAD)) { ++ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + perf_get_regs_user(regs_user, regs); + } else { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; +@@ -8080,7 +8080,7 @@ static u64 perf_virt_to_phys(u64 virt) + * Try IRQ-safe get_user_page_fast_only first. + * If failed, leave phys_addr as 0. + */ +- if (current->mm != NULL) { ++ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct page *p; + + pagefault_disable(); +@@ -8192,7 +8192,8 @@ struct perf_callchain_entry * + perf_callchain(struct perf_event *event, struct pt_regs *regs) + { + bool kernel = !event->attr.exclude_callchain_kernel; +- bool user = !event->attr.exclude_callchain_user; ++ bool user = !event->attr.exclude_callchain_user && ++ !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + /* Disallow cross-task user callchains. 
*/ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; +diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c +index 3ffa0d80ddd19c..d1917b28761a33 100644 +--- a/kernel/irq/chip.c ++++ b/kernel/irq/chip.c +@@ -1030,7 +1030,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, + void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) + { +- scoped_irqdesc_get_and_lock(irq, 0) ++ scoped_irqdesc_get_and_buslock(irq, 0) + __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); + } + EXPORT_SYMBOL_GPL(__irq_set_handler); +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index c94837382037e4..400856abf67219 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc) + + static int __disable_irq_nosync(unsigned int irq) + { +- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { ++ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + __disable_irq(scoped_irqdesc); + return 0; + } +@@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc) + */ + void enable_irq(unsigned int irq) + { +- scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { ++ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; + + if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) +diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c +index c4a488e67aa7d8..755883faf75186 100644 +--- a/kernel/sched/build_policy.c ++++ b/kernel/sched/build_policy.c +@@ -58,6 +58,7 @@ + #include "deadline.c" + + #ifdef CONFIG_SCHED_CLASS_EXT ++# include "ext_internal.h" + # include "ext.c" + # include "ext_idle.c" + #endif +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 088ceff38c8a47..14724dae0b7951 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -9,1040 +9,6 @@ + #include <linux/btf_ids.h> + #include "ext_idle.h" + +-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) +- +-enum scx_consts { +- SCX_DSP_DFL_MAX_BATCH = 32, +- SCX_DSP_MAX_LOOPS = 32, +- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, +- +- SCX_EXIT_BT_LEN = 64, +- SCX_EXIT_MSG_LEN = 1024, +- SCX_EXIT_DUMP_DFL_LEN = 32768, +- +- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, +- +- /* +- * Iterating all tasks may take a while. Periodically drop +- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. +- */ +- SCX_TASK_ITER_BATCH = 32, +-}; +- +-enum scx_exit_kind { +- SCX_EXIT_NONE, +- SCX_EXIT_DONE, +- +- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ +- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ +- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ +- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ +- +- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ +- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ +- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +-}; +- +-/* +- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), +- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes +- * are 64bit of the format: +- * +- * Bits: [63 .. 48 47 .. 32 31 .. 
0] +- * [ SYS ACT ] [ SYS RSN ] [ USR ] +- * +- * SYS ACT: System-defined exit actions +- * SYS RSN: System-defined exit reasons +- * USR : User-defined exit codes and reasons +- * +- * Using the above, users may communicate intention and context by ORing system +- * actions and/or system reasons with a user-defined exit code. +- */ +-enum scx_exit_code { +- /* Reasons */ +- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, +- +- /* Actions */ +- SCX_ECODE_ACT_RESTART = 1LLU << 48, +-}; +- +-/* +- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is +- * being disabled. +- */ +-struct scx_exit_info { +- /* %SCX_EXIT_* - broad category of the exit reason */ +- enum scx_exit_kind kind; +- +- /* exit code if gracefully exiting */ +- s64 exit_code; +- +- /* textual representation of the above */ +- const char *reason; +- +- /* backtrace if exiting due to an error */ +- unsigned long *bt; +- u32 bt_len; +- +- /* informational message */ +- char *msg; +- +- /* debug dump */ +- char *dump; +-}; +- +-/* sched_ext_ops.flags */ +-enum scx_ops_flags { +- /* +- * Keep built-in idle tracking even if ops.update_idle() is implemented. +- */ +- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, +- +- /* +- * By default, if there are no other task to run on the CPU, ext core +- * keeps running the current task even after its slice expires. If this +- * flag is specified, such tasks are passed to ops.enqueue() with +- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. +- */ +- SCX_OPS_ENQ_LAST = 1LLU << 1, +- +- /* +- * An exiting task may schedule after PF_EXITING is set. In such cases, +- * bpf_task_from_pid() may not be able to find the task and if the BPF +- * scheduler depends on pid lookup for dispatching, the task will be +- * lost leading to various issues including RCU grace period stalls. +- * +- * To mask this problem, by default, unhashed tasks are automatically +- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't +- * depend on pid lookups and wants to handle these tasks directly, the +- * following flag can be used. +- */ +- SCX_OPS_ENQ_EXITING = 1LLU << 2, +- +- /* +- * If set, only tasks with policy set to SCHED_EXT are attached to +- * sched_ext. If clear, SCHED_NORMAL tasks are also included. +- */ +- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, +- +- /* +- * A migration disabled task can only execute on its current CPU. By +- * default, such tasks are automatically put on the CPU's local DSQ with +- * the default slice on enqueue. If this ops flag is set, they also go +- * through ops.enqueue(). +- * +- * A migration disabled task never invokes ops.select_cpu() as it can +- * only select the current CPU. Also, p->cpus_ptr will only contain its +- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr +- * and thus may disagree with cpumask_weight(p->cpus_ptr). +- */ +- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, +- +- /* +- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes +- * ops.enqueue() on the ops.select_cpu() selected or the wakee's +- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline +- * transfers. When this optimization is enabled, ops.select_cpu() is +- * skipped in some cases (when racing against the wakee switching out). +- * As the BPF scheduler may depend on ops.select_cpu() being invoked +- * during wakeups, queued wakeup is disabled by default. 
+- * +- * If this ops flag is set, queued wakeup optimization is enabled and +- * the BPF scheduler must be able to handle ops.enqueue() invoked on the +- * wakee's CPU without preceding ops.select_cpu() even for tasks which +- * may be executed on multiple CPUs. +- */ +- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, +- +- /* +- * If set, enable per-node idle cpumasks. If clear, use a single global +- * flat idle cpumask. +- */ +- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, +- +- /* +- * CPU cgroup support flags +- */ +- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ +- +- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | +- SCX_OPS_ENQ_LAST | +- SCX_OPS_ENQ_EXITING | +- SCX_OPS_ENQ_MIGRATION_DISABLED | +- SCX_OPS_ALLOW_QUEUED_WAKEUP | +- SCX_OPS_SWITCH_PARTIAL | +- SCX_OPS_BUILTIN_IDLE_PER_NODE | +- SCX_OPS_HAS_CGROUP_WEIGHT, +- +- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ +- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, +- +- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +-}; +- +-/* argument container for ops.init_task() */ +-struct scx_init_task_args { +- /* +- * Set if ops.init_task() is being invoked on the fork path, as opposed +- * to the scheduler transition path. +- */ +- bool fork; +-#ifdef CONFIG_EXT_GROUP_SCHED +- /* the cgroup the task is joining */ +- struct cgroup *cgroup; +-#endif +-}; +- +-/* argument container for ops.exit_task() */ +-struct scx_exit_task_args { +- /* Whether the task exited before running on sched_ext. */ +- bool cancelled; +-}; +- +-/* argument container for ops->cgroup_init() */ +-struct scx_cgroup_init_args { +- /* the weight of the cgroup [1..10000] */ +- u32 weight; +- +- /* bandwidth control parameters from cpu.max and cpu.max.burst */ +- u64 bw_period_us; +- u64 bw_quota_us; +- u64 bw_burst_us; +-}; +- +-enum scx_cpu_preempt_reason { +- /* next task is being scheduled by &sched_class_rt */ +- SCX_CPU_PREEMPT_RT, +- /* next task is being scheduled by &sched_class_dl */ +- SCX_CPU_PREEMPT_DL, +- /* next task is being scheduled by &sched_class_stop */ +- SCX_CPU_PREEMPT_STOP, +- /* unknown reason for SCX being preempted */ +- SCX_CPU_PREEMPT_UNKNOWN, +-}; +- +-/* +- * Argument container for ops->cpu_acquire(). Currently empty, but may be +- * expanded in the future. +- */ +-struct scx_cpu_acquire_args {}; +- +-/* argument container for ops->cpu_release() */ +-struct scx_cpu_release_args { +- /* the reason the CPU was preempted */ +- enum scx_cpu_preempt_reason reason; +- +- /* the task that's going to be scheduled on the CPU */ +- struct task_struct *task; +-}; +- +-/* +- * Informational context provided to dump operations. +- */ +-struct scx_dump_ctx { +- enum scx_exit_kind kind; +- s64 exit_code; +- const char *reason; +- u64 at_ns; +- u64 at_jiffies; +-}; +- +-/** +- * struct sched_ext_ops - Operation table for BPF scheduler implementation +- * +- * A BPF scheduler can implement an arbitrary scheduling policy by +- * implementing and loading operations in this table. Note that a userland +- * scheduling policy can also be implemented using the BPF scheduler +- * as a shim layer. +- */ +-struct sched_ext_ops { +- /** +- * @select_cpu: Pick the target CPU for a task which is being woken up +- * @p: task being woken up +- * @prev_cpu: the cpu @p was on before sleeping +- * @wake_flags: SCX_WAKE_* +- * +- * Decision made here isn't final. @p may be moved to any CPU while it +- * is getting dispatched for execution later. 
However, as @p is not on +- * the rq at this point, getting the eventual execution CPU right here +- * saves a small bit of overhead down the line. +- * +- * If an idle CPU is returned, the CPU is kicked and will try to +- * dispatch. While an explicit custom mechanism can be added, +- * select_cpu() serves as the default way to wake up idle CPUs. +- * +- * @p may be inserted into a DSQ directly by calling +- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. +- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ +- * of the CPU returned by this operation. +- * +- * Note that select_cpu() is never called for tasks that can only run +- * on a single CPU or tasks with migration disabled, as they don't have +- * the option to select a different CPU. See select_task_rq() for +- * details. +- */ +- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); +- +- /** +- * @enqueue: Enqueue a task on the BPF scheduler +- * @p: task being enqueued +- * @enq_flags: %SCX_ENQ_* +- * +- * @p is ready to run. Insert directly into a DSQ by calling +- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly +- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, +- * the task will stall. +- * +- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is +- * skipped. +- */ +- void (*enqueue)(struct task_struct *p, u64 enq_flags); +- +- /** +- * @dequeue: Remove a task from the BPF scheduler +- * @p: task being dequeued +- * @deq_flags: %SCX_DEQ_* +- * +- * Remove @p from the BPF scheduler. This is usually called to isolate +- * the task while updating its scheduling properties (e.g. priority). +- * +- * The ext core keeps track of whether the BPF side owns a given task or +- * not and can gracefully ignore spurious dispatches from BPF side, +- * which makes it safe to not implement this method. However, depending +- * on the scheduling logic, this can lead to confusing behaviors - e.g. +- * scheduling position not being updated across a priority change. +- */ +- void (*dequeue)(struct task_struct *p, u64 deq_flags); +- +- /** +- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs +- * @cpu: CPU to dispatch tasks for +- * @prev: previous task being switched out +- * +- * Called when a CPU's local dsq is empty. The operation should dispatch +- * one or more tasks from the BPF scheduler into the DSQs using +- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ +- * using scx_bpf_dsq_move_to_local(). +- * +- * The maximum number of times scx_bpf_dsq_insert() can be called +- * without an intervening scx_bpf_dsq_move_to_local() is specified by +- * ops.dispatch_max_batch. See the comments on top of the two functions +- * for more details. +- * +- * When not %NULL, @prev is an SCX task with its slice depleted. If +- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in +- * @prev->scx.flags, it is not enqueued yet and will be enqueued after +- * ops.dispatch() returns. To keep executing @prev, return without +- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. +- */ +- void (*dispatch)(s32 cpu, struct task_struct *prev); +- +- /** +- * @tick: Periodic tick +- * @p: task running currently +- * +- * This operation is called every 1/HZ seconds on CPUs which are +- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an +- * immediate dispatch cycle on the CPU. 
+- */ +- void (*tick)(struct task_struct *p); +- +- /** +- * @runnable: A task is becoming runnable on its associated CPU +- * @p: task becoming runnable +- * @enq_flags: %SCX_ENQ_* +- * +- * This and the following three functions can be used to track a task's +- * execution state transitions. A task becomes ->runnable() on a CPU, +- * and then goes through one or more ->running() and ->stopping() pairs +- * as it runs on the CPU, and eventually becomes ->quiescent() when it's +- * done running on the CPU. +- * +- * @p is becoming runnable on the CPU because it's +- * +- * - waking up (%SCX_ENQ_WAKEUP) +- * - being moved from another CPU +- * - being restored after temporarily taken off the queue for an +- * attribute change. +- * +- * This and ->enqueue() are related but not coupled. This operation +- * notifies @p's state transition and may not be followed by ->enqueue() +- * e.g. when @p is being dispatched to a remote CPU, or when @p is +- * being enqueued on a CPU experiencing a hotplug event. Likewise, a +- * task may be ->enqueue()'d without being preceded by this operation +- * e.g. after exhausting its slice. +- */ +- void (*runnable)(struct task_struct *p, u64 enq_flags); +- +- /** +- * @running: A task is starting to run on its associated CPU +- * @p: task starting to run +- * +- * Note that this callback may be called from a CPU other than the +- * one the task is going to run on. This can happen when a task +- * property is changed (i.e., affinity), since scx_next_task_scx(), +- * which triggers this callback, may run on a CPU different from +- * the task's assigned CPU. +- * +- * Therefore, always use scx_bpf_task_cpu(@p) to determine the +- * target CPU the task is going to use. +- * +- * See ->runnable() for explanation on the task state notifiers. +- */ +- void (*running)(struct task_struct *p); +- +- /** +- * @stopping: A task is stopping execution +- * @p: task stopping to run +- * @runnable: is task @p still runnable? +- * +- * Note that this callback may be called from a CPU other than the +- * one the task was running on. This can happen when a task +- * property is changed (i.e., affinity), since dequeue_task_scx(), +- * which triggers this callback, may run on a CPU different from +- * the task's assigned CPU. +- * +- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU +- * the task was running on. +- * +- * See ->runnable() for explanation on the task state notifiers. If +- * !@runnable, ->quiescent() will be invoked after this operation +- * returns. +- */ +- void (*stopping)(struct task_struct *p, bool runnable); +- +- /** +- * @quiescent: A task is becoming not runnable on its associated CPU +- * @p: task becoming not runnable +- * @deq_flags: %SCX_DEQ_* +- * +- * See ->runnable() for explanation on the task state notifiers. +- * +- * @p is becoming quiescent on the CPU because it's +- * +- * - sleeping (%SCX_DEQ_SLEEP) +- * - being moved to another CPU +- * - being temporarily taken off the queue for an attribute change +- * (%SCX_DEQ_SAVE) +- * +- * This and ->dequeue() are related but not coupled. This operation +- * notifies @p's state transition and may not be preceded by ->dequeue() +- * e.g. when @p is being dispatched to a remote CPU. +- */ +- void (*quiescent)(struct task_struct *p, u64 deq_flags); +- +- /** +- * @yield: Yield CPU +- * @from: yielding task +- * @to: optional yield target task +- * +- * If @to is NULL, @from is yielding the CPU to other runnable tasks. 
+- * The BPF scheduler should ensure that other available tasks are +- * dispatched before the yielding task. Return value is ignored in this +- * case. +- * +- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf +- * scheduler can implement the request, return %true; otherwise, %false. +- */ +- bool (*yield)(struct task_struct *from, struct task_struct *to); +- +- /** +- * @core_sched_before: Task ordering for core-sched +- * @a: task A +- * @b: task B +- * +- * Used by core-sched to determine the ordering between two tasks. See +- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on +- * core-sched. +- * +- * Both @a and @b are runnable and may or may not currently be queued on +- * the BPF scheduler. Should return %true if @a should run before @b. +- * %false if there's no required ordering or @b should run before @a. +- * +- * If not specified, the default is ordering them according to when they +- * became runnable. +- */ +- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); +- +- /** +- * @set_weight: Set task weight +- * @p: task to set weight for +- * @weight: new weight [1..10000] +- * +- * Update @p's weight to @weight. +- */ +- void (*set_weight)(struct task_struct *p, u32 weight); +- +- /** +- * @set_cpumask: Set CPU affinity +- * @p: task to set CPU affinity for +- * @cpumask: cpumask of cpus that @p can run on +- * +- * Update @p's CPU affinity to @cpumask. +- */ +- void (*set_cpumask)(struct task_struct *p, +- const struct cpumask *cpumask); +- +- /** +- * @update_idle: Update the idle state of a CPU +- * @cpu: CPU to update the idle state for +- * @idle: whether entering or exiting the idle state +- * +- * This operation is called when @rq's CPU goes or leaves the idle +- * state. By default, implementing this operation disables the built-in +- * idle CPU tracking and the following helpers become unavailable: +- * +- * - scx_bpf_select_cpu_dfl() +- * - scx_bpf_select_cpu_and() +- * - scx_bpf_test_and_clear_cpu_idle() +- * - scx_bpf_pick_idle_cpu() +- * +- * The user also must implement ops.select_cpu() as the default +- * implementation relies on scx_bpf_select_cpu_dfl(). +- * +- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle +- * tracking. +- */ +- void (*update_idle)(s32 cpu, bool idle); +- +- /** +- * @cpu_acquire: A CPU is becoming available to the BPF scheduler +- * @cpu: The CPU being acquired by the BPF scheduler. +- * @args: Acquire arguments, see the struct definition. +- * +- * A CPU that was previously released from the BPF scheduler is now once +- * again under its control. +- */ +- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); +- +- /** +- * @cpu_release: A CPU is taken away from the BPF scheduler +- * @cpu: The CPU being released by the BPF scheduler. +- * @args: Release arguments, see the struct definition. +- * +- * The specified CPU is no longer under the control of the BPF +- * scheduler. This could be because it was preempted by a higher +- * priority sched_class, though there may be other reasons as well. The +- * caller should consult @args->reason to determine the cause. +- */ +- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); +- +- /** +- * @init_task: Initialize a task to run in a BPF scheduler +- * @p: task to initialize for BPF scheduling +- * @args: init arguments, see the struct definition +- * +- * Either we're loading a BPF scheduler or a new task is being forked. +- * Initialize @p for BPF scheduling. 
This operation may block and can +- * be used for allocations, and is called exactly once for a task. +- * +- * Return 0 for success, -errno for failure. An error return while +- * loading will abort loading of the BPF scheduler. During a fork, it +- * will abort that specific fork. +- */ +- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); +- +- /** +- * @exit_task: Exit a previously-running task from the system +- * @p: task to exit +- * @args: exit arguments, see the struct definition +- * +- * @p is exiting or the BPF scheduler is being unloaded. Perform any +- * necessary cleanup for @p. +- */ +- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); +- +- /** +- * @enable: Enable BPF scheduling for a task +- * @p: task to enable BPF scheduling for +- * +- * Enable @p for BPF scheduling. enable() is called on @p any time it +- * enters SCX, and is always paired with a matching disable(). +- */ +- void (*enable)(struct task_struct *p); +- +- /** +- * @disable: Disable BPF scheduling for a task +- * @p: task to disable BPF scheduling for +- * +- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. +- * Disable BPF scheduling for @p. A disable() call is always matched +- * with a prior enable() call. +- */ +- void (*disable)(struct task_struct *p); +- +- /** +- * @dump: Dump BPF scheduler state on error +- * @ctx: debug dump context +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. +- */ +- void (*dump)(struct scx_dump_ctx *ctx); +- +- /** +- * @dump_cpu: Dump BPF scheduler state for a CPU on error +- * @ctx: debug dump context +- * @cpu: CPU to generate debug dump for +- * @idle: @cpu is currently idle without any runnable tasks +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for +- * @cpu. If @idle is %true and this operation doesn't produce any +- * output, @cpu is skipped for dump. +- */ +- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); +- +- /** +- * @dump_task: Dump BPF scheduler state for a runnable task on error +- * @ctx: debug dump context +- * @p: runnable task to generate debug dump for +- * +- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for +- * @p. +- */ +- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +- +-#ifdef CONFIG_EXT_GROUP_SCHED +- /** +- * @cgroup_init: Initialize a cgroup +- * @cgrp: cgroup being initialized +- * @args: init arguments, see the struct definition +- * +- * Either the BPF scheduler is being loaded or @cgrp created, initialize +- * @cgrp for sched_ext. This operation may block. +- * +- * Return 0 for success, -errno for failure. An error return while +- * loading will abort loading of the BPF scheduler. During cgroup +- * creation, it will abort the specific cgroup creation. +- */ +- s32 (*cgroup_init)(struct cgroup *cgrp, +- struct scx_cgroup_init_args *args); +- +- /** +- * @cgroup_exit: Exit a cgroup +- * @cgrp: cgroup being exited +- * +- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit +- * @cgrp for sched_ext. This operation my block. +- */ +- void (*cgroup_exit)(struct cgroup *cgrp); +- +- /** +- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup +- * @p: task being moved +- * @from: cgroup @p is being moved from +- * @to: cgroup @p is being moved to +- * +- * Prepare @p for move from cgroup @from to @to. This operation may +- * block and can be used for allocations. +- * +- * Return 0 for success, -errno for failure. 
An error return aborts the +- * migration. +- */ +- s32 (*cgroup_prep_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_move: Commit cgroup move +- * @p: task being moved +- * @from: cgroup @p is being moved from +- * @to: cgroup @p is being moved to +- * +- * Commit the move. @p is dequeued during this operation. +- */ +- void (*cgroup_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_cancel_move: Cancel cgroup move +- * @p: task whose cgroup move is being canceled +- * @from: cgroup @p was being moved from +- * @to: cgroup @p was being moved to +- * +- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). +- * Undo the preparation. +- */ +- void (*cgroup_cancel_move)(struct task_struct *p, +- struct cgroup *from, struct cgroup *to); +- +- /** +- * @cgroup_set_weight: A cgroup's weight is being changed +- * @cgrp: cgroup whose weight is being updated +- * @weight: new weight [1..10000] +- * +- * Update @cgrp's weight to @weight. +- */ +- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +- +- /** +- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed +- * @cgrp: cgroup whose bandwidth is being updated +- * @period_us: bandwidth control period +- * @quota_us: bandwidth control quota +- * @burst_us: bandwidth control burst +- * +- * Update @cgrp's bandwidth control parameters. This is from the cpu.max +- * cgroup interface. +- * +- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled +- * to. For example, if @period_us is 1_000_000 and @quota_us is +- * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be +- * interpreted in the same fashion and specifies how much @cgrp can +- * burst temporarily. The specific control mechanism and thus the +- * interpretation of @period_us and burstiness is upto to the BPF +- * scheduler. +- */ +- void (*cgroup_set_bandwidth)(struct cgroup *cgrp, +- u64 period_us, u64 quota_us, u64 burst_us); +- +-#endif /* CONFIG_EXT_GROUP_SCHED */ +- +- /* +- * All online ops must come before ops.cpu_online(). +- */ +- +- /** +- * @cpu_online: A CPU became online +- * @cpu: CPU which just came up +- * +- * @cpu just came online. @cpu will not call ops.enqueue() or +- * ops.dispatch(), nor run tasks associated with other CPUs beforehand. +- */ +- void (*cpu_online)(s32 cpu); +- +- /** +- * @cpu_offline: A CPU is going offline +- * @cpu: CPU which is going offline +- * +- * @cpu is going offline. @cpu will not call ops.enqueue() or +- * ops.dispatch(), nor run tasks associated with other CPUs afterwards. +- */ +- void (*cpu_offline)(s32 cpu); +- +- /* +- * All CPU hotplug ops must come before ops.init(). +- */ +- +- /** +- * @init: Initialize the BPF scheduler +- */ +- s32 (*init)(void); +- +- /** +- * @exit: Clean up after the BPF scheduler +- * @info: Exit info +- * +- * ops.exit() is also called on ops.init() failure, which is a bit +- * unusual. This is to allow rich reporting through @info on how +- * ops.init() failed. +- */ +- void (*exit)(struct scx_exit_info *info); +- +- /** +- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch +- */ +- u32 dispatch_max_batch; +- +- /** +- * @flags: %SCX_OPS_* flags +- */ +- u64 flags; +- +- /** +- * @timeout_ms: The maximum amount of time, in milliseconds, that a +- * runnable task should be able to wait before being scheduled. The +- * maximum timeout may not exceed the default timeout of 30 seconds. 
+- * +- * Defaults to the maximum allowed timeout value of 30 seconds. +- */ +- u32 timeout_ms; +- +- /** +- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default +- * value of 32768 is used. +- */ +- u32 exit_dump_len; +- +- /** +- * @hotplug_seq: A sequence number that may be set by the scheduler to +- * detect when a hotplug event has occurred during the loading process. +- * If 0, no detection occurs. Otherwise, the scheduler will fail to +- * load if the sequence number does not match @scx_hotplug_seq on the +- * enable path. +- */ +- u64 hotplug_seq; +- +- /** +- * @name: BPF scheduler's name +- * +- * Must be a non-zero valid BPF object name including only isalnum(), +- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the +- * BPF scheduler is enabled. +- */ +- char name[SCX_OPS_NAME_LEN]; +- +- /* internal use only, must be NULL */ +- void *priv; +-}; +- +-enum scx_opi { +- SCX_OPI_BEGIN = 0, +- SCX_OPI_NORMAL_BEGIN = 0, +- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), +- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), +- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), +- SCX_OPI_END = SCX_OP_IDX(init), +-}; +- +-/* +- * Collection of event counters. Event types are placed in descending order. +- */ +-struct scx_event_stats { +- /* +- * If ops.select_cpu() returns a CPU which can't be used by the task, +- * the core scheduler code silently picks a fallback CPU. +- */ +- s64 SCX_EV_SELECT_CPU_FALLBACK; +- +- /* +- * When dispatching to a local DSQ, the CPU may have gone offline in +- * the meantime. In this case, the task is bounced to the global DSQ. +- */ +- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; +- +- /* +- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task +- * continued to run because there were no other tasks on the CPU. +- */ +- s64 SCX_EV_DISPATCH_KEEP_LAST; +- +- /* +- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task +- * is dispatched to a local DSQ when exiting. +- */ +- s64 SCX_EV_ENQ_SKIP_EXITING; +- +- /* +- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a +- * migration disabled task skips ops.enqueue() and is dispatched to its +- * local DSQ. +- */ +- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; +- +- /* +- * Total number of times a task's time slice was refilled with the +- * default value (SCX_SLICE_DFL). +- */ +- s64 SCX_EV_REFILL_SLICE_DFL; +- +- /* +- * The total duration of bypass modes in nanoseconds. +- */ +- s64 SCX_EV_BYPASS_DURATION; +- +- /* +- * The number of tasks dispatched in the bypassing mode. +- */ +- s64 SCX_EV_BYPASS_DISPATCH; +- +- /* +- * The number of times the bypassing mode has been activated. +- */ +- s64 SCX_EV_BYPASS_ACTIVATE; +-}; +- +-struct scx_sched { +- struct sched_ext_ops ops; +- DECLARE_BITMAP(has_op, SCX_OPI_END); +- +- /* +- * Dispatch queues. +- * +- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. +- * This is to avoid live-locking in bypass mode where all tasks are +- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If +- * per-node split isn't sufficient, it can be further split. +- */ +- struct rhashtable dsq_hash; +- struct scx_dispatch_q **global_dsqs; +- +- /* +- * The event counters are in a per-CPU variable to minimize the +- * accounting overhead. A system-wide view on the event counter is +- * constructed when requested by scx_bpf_events(). 
+- */ +- struct scx_event_stats __percpu *event_stats_cpu; +- +- bool warned_zero_slice; +- +- atomic_t exit_kind; +- struct scx_exit_info *exit_info; +- +- struct kobject kobj; +- +- struct kthread_worker *helper; +- struct irq_work error_irq_work; +- struct kthread_work disable_work; +- struct rcu_work rcu_work; +-}; +- +-enum scx_wake_flags { +- /* expose select WF_* flags as enums */ +- SCX_WAKE_FORK = WF_FORK, +- SCX_WAKE_TTWU = WF_TTWU, +- SCX_WAKE_SYNC = WF_SYNC, +-}; +- +-enum scx_enq_flags { +- /* expose select ENQUEUE_* flags as enums */ +- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, +- SCX_ENQ_HEAD = ENQUEUE_HEAD, +- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, +- +- /* high 32bits are SCX specific */ +- +- /* +- * Set the following to trigger preemption when calling +- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the +- * current task is cleared to zero and the CPU is kicked into the +- * scheduling path. Implies %SCX_ENQ_HEAD. +- */ +- SCX_ENQ_PREEMPT = 1LLU << 32, +- +- /* +- * The task being enqueued was previously enqueued on the current CPU's +- * %SCX_DSQ_LOCAL, but was removed from it in a call to the +- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was +- * invoked in a ->cpu_release() callback, and the task is again +- * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the +- * task will not be scheduled on the CPU until at least the next invocation +- * of the ->cpu_acquire() callback. +- */ +- SCX_ENQ_REENQ = 1LLU << 40, +- +- /* +- * The task being enqueued is the only task available for the cpu. By +- * default, ext core keeps executing such tasks but when +- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the +- * %SCX_ENQ_LAST flag set. +- * +- * The BPF scheduler is responsible for triggering a follow-up +- * scheduling event. Otherwise, Execution may stall. +- */ +- SCX_ENQ_LAST = 1LLU << 41, +- +- /* high 8 bits are internal */ +- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, +- +- SCX_ENQ_CLEAR_OPSS = 1LLU << 56, +- SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +-}; +- +-enum scx_deq_flags { +- /* expose select DEQUEUE_* flags as enums */ +- SCX_DEQ_SLEEP = DEQUEUE_SLEEP, +- +- /* high 32bits are SCX specific */ +- +- /* +- * The generic core-sched layer decided to execute the task even though +- * it hasn't been dispatched yet. Dequeue from the BPF side. +- */ +- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +-}; +- +-enum scx_pick_idle_cpu_flags { +- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +-}; +- +-enum scx_kick_flags { +- /* +- * Kick the target CPU if idle. Guarantees that the target CPU goes +- * through at least one full scheduling cycle before going idle. If the +- * target CPU can be determined to be currently not idle and going to go +- * through a scheduling cycle before going idle, noop. +- */ +- SCX_KICK_IDLE = 1LLU << 0, +- +- /* +- * Preempt the current task and execute the dispatch path. If the +- * current task of the target CPU is an SCX task, its ->scx.slice is +- * cleared to zero before the scheduling path is invoked so that the +- * task expires and the dispatch path is invoked. +- */ +- SCX_KICK_PREEMPT = 1LLU << 1, +- +- /* +- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will +- * return after the target CPU finishes picking the next task. 
+- */ +- SCX_KICK_WAIT = 1LLU << 2, +-}; +- +-enum scx_tg_flags { +- SCX_TG_ONLINE = 1U << 0, +- SCX_TG_INITED = 1U << 1, +-}; +- +-enum scx_enable_state { +- SCX_ENABLING, +- SCX_ENABLED, +- SCX_DISABLING, +- SCX_DISABLED, +-}; +- +-static const char *scx_enable_state_str[] = { +- [SCX_ENABLING] = "enabling", +- [SCX_ENABLED] = "enabled", +- [SCX_DISABLING] = "disabling", +- [SCX_DISABLED] = "disabled", +-}; +- +-/* +- * sched_ext_entity->ops_state +- * +- * Used to track the task ownership between the SCX core and the BPF scheduler. +- * State transitions look as follows: +- * +- * NONE -> QUEUEING -> QUEUED -> DISPATCHING +- * ^ | | +- * | v v +- * \-------------------------------/ +- * +- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call +- * sites for explanations on the conditions being waited upon and why they are +- * safe. Transitions out of them into NONE or QUEUED must store_release and the +- * waiters should load_acquire. +- * +- * Tracking scx_ops_state enables sched_ext core to reliably determine whether +- * any given task can be dispatched by the BPF scheduler at all times and thus +- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler +- * to try to dispatch any task anytime regardless of its state as the SCX core +- * can safely reject invalid dispatches. +- */ +-enum scx_ops_state { +- SCX_OPSS_NONE, /* owned by the SCX core */ +- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ +- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ +- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ +- +- /* +- * QSEQ brands each QUEUED instance so that, when dispatch races +- * dequeue/requeue, the dispatcher can tell whether it still has a claim +- * on the task being dispatched. +- * +- * As some 32bit archs can't do 64bit store_release/load_acquire, +- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on +- * 32bit machines. The dispatch race window QSEQ protects is very narrow +- * and runs with IRQ disabled. 30 bits should be sufficient. +- */ +- SCX_OPSS_QSEQ_SHIFT = 2, +-}; +- +-/* Use macros to ensure that the type is unsigned long for the masks */ +-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +- + /* + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the +@@ -1664,7 +630,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) + * This can be used when preemption is not disabled. + */ + #define scx_add_event(sch, name, cnt) do { \ +- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ ++ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ + } while(0) + +@@ -1677,7 +643,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) + * This should be used only when preemption is disabled. 
+ */ + #define __scx_add_event(sch, name, cnt) do { \ +- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ ++ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ + trace_sched_ext_event(#name, cnt); \ + } while(0) + +@@ -4571,8 +3537,10 @@ static void scx_sched_free_rcu_work(struct work_struct *work) + struct scx_dispatch_q *dsq; + int node; + ++ irq_work_sync(&sch->error_irq_work); + kthread_stop(sch->helper->task); +- free_percpu(sch->event_stats_cpu); ++ ++ free_percpu(sch->pcpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); +@@ -5473,13 +4441,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + sch->global_dsqs[node] = dsq; + } + +- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); +- if (!sch->event_stats_cpu) ++ sch->pcpu = alloc_percpu(struct scx_sched_pcpu); ++ if (!sch->pcpu) + goto err_free_gdsqs; + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (!sch->helper) +- goto err_free_event_stats; ++ goto err_free_pcpu; + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); +@@ -5497,8 +4465,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + + err_stop_helper: + kthread_stop(sch->helper->task); +-err_free_event_stats: +- free_percpu(sch->event_stats_cpu); ++err_free_pcpu: ++ free_percpu(sch->pcpu); + err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); +@@ -5795,7 +4763,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) + err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +- scx_bypass(false); ++ /* we'll soon enter disable path, keep bypass on */ + err_disable: + mutex_unlock(&scx_enable_mutex); + /* +@@ -7524,7 +6492,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { +- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); ++ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); +diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h +index 292bb41a242ec1..33858607bc97f5 100644 +--- a/kernel/sched/ext.h ++++ b/kernel/sched/ext.h +@@ -8,29 +8,6 @@ + */ + #ifdef CONFIG_SCHED_CLASS_EXT + +-static inline bool scx_kf_allowed_if_unlocked(void) +-{ +- return !current->scx.kf_mask; +-} +- +-static inline bool scx_rq_bypassing(struct rq *rq) +-{ +- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +-} +- +-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); +- +-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +- +-/* +- * Return the rq currently locked from an scx callback, or NULL if no rq is +- * locked. +- */ +-static inline struct rq *scx_locked_rq(void) +-{ +- return __this_cpu_read(scx_locked_rq_state); +-} +- + void scx_tick(struct rq *rq); + void init_scx_entity(struct sched_ext_entity *scx); + void scx_pre_fork(struct task_struct *p); +diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h +new file mode 100644 +index 00000000000000..af4c054fb6f852 +--- /dev/null ++++ b/kernel/sched/ext_internal.h +@@ -0,0 +1,1064 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. 
and affiliates. ++ * Copyright (c) 2025 Tejun Heo <[email protected]> ++ */ ++#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) ++ ++enum scx_consts { ++ SCX_DSP_DFL_MAX_BATCH = 32, ++ SCX_DSP_MAX_LOOPS = 32, ++ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, ++ ++ SCX_EXIT_BT_LEN = 64, ++ SCX_EXIT_MSG_LEN = 1024, ++ SCX_EXIT_DUMP_DFL_LEN = 32768, ++ ++ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, ++ ++ /* ++ * Iterating all tasks may take a while. Periodically drop ++ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. ++ */ ++ SCX_TASK_ITER_BATCH = 32, ++}; ++ ++enum scx_exit_kind { ++ SCX_EXIT_NONE, ++ SCX_EXIT_DONE, ++ ++ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ ++ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ ++ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ ++ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ ++ ++ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ ++ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ ++ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ ++}; ++ ++/* ++ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), ++ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes ++ * are 64bit of the format: ++ * ++ * Bits: [63 .. 48 47 .. 32 31 .. 0] ++ * [ SYS ACT ] [ SYS RSN ] [ USR ] ++ * ++ * SYS ACT: System-defined exit actions ++ * SYS RSN: System-defined exit reasons ++ * USR : User-defined exit codes and reasons ++ * ++ * Using the above, users may communicate intention and context by ORing system ++ * actions and/or system reasons with a user-defined exit code. ++ */ ++enum scx_exit_code { ++ /* Reasons */ ++ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, ++ ++ /* Actions */ ++ SCX_ECODE_ACT_RESTART = 1LLU << 48, ++}; ++ ++/* ++ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is ++ * being disabled. ++ */ ++struct scx_exit_info { ++ /* %SCX_EXIT_* - broad category of the exit reason */ ++ enum scx_exit_kind kind; ++ ++ /* exit code if gracefully exiting */ ++ s64 exit_code; ++ ++ /* textual representation of the above */ ++ const char *reason; ++ ++ /* backtrace if exiting due to an error */ ++ unsigned long *bt; ++ u32 bt_len; ++ ++ /* informational message */ ++ char *msg; ++ ++ /* debug dump */ ++ char *dump; ++}; ++ ++/* sched_ext_ops.flags */ ++enum scx_ops_flags { ++ /* ++ * Keep built-in idle tracking even if ops.update_idle() is implemented. ++ */ ++ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, ++ ++ /* ++ * By default, if there are no other task to run on the CPU, ext core ++ * keeps running the current task even after its slice expires. If this ++ * flag is specified, such tasks are passed to ops.enqueue() with ++ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. ++ */ ++ SCX_OPS_ENQ_LAST = 1LLU << 1, ++ ++ /* ++ * An exiting task may schedule after PF_EXITING is set. In such cases, ++ * bpf_task_from_pid() may not be able to find the task and if the BPF ++ * scheduler depends on pid lookup for dispatching, the task will be ++ * lost leading to various issues including RCU grace period stalls. ++ * ++ * To mask this problem, by default, unhashed tasks are automatically ++ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't ++ * depend on pid lookups and wants to handle these tasks directly, the ++ * following flag can be used. 
++ */ ++ SCX_OPS_ENQ_EXITING = 1LLU << 2, ++ ++ /* ++ * If set, only tasks with policy set to SCHED_EXT are attached to ++ * sched_ext. If clear, SCHED_NORMAL tasks are also included. ++ */ ++ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, ++ ++ /* ++ * A migration disabled task can only execute on its current CPU. By ++ * default, such tasks are automatically put on the CPU's local DSQ with ++ * the default slice on enqueue. If this ops flag is set, they also go ++ * through ops.enqueue(). ++ * ++ * A migration disabled task never invokes ops.select_cpu() as it can ++ * only select the current CPU. Also, p->cpus_ptr will only contain its ++ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr ++ * and thus may disagree with cpumask_weight(p->cpus_ptr). ++ */ ++ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, ++ ++ /* ++ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes ++ * ops.enqueue() on the ops.select_cpu() selected or the wakee's ++ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline ++ * transfers. When this optimization is enabled, ops.select_cpu() is ++ * skipped in some cases (when racing against the wakee switching out). ++ * As the BPF scheduler may depend on ops.select_cpu() being invoked ++ * during wakeups, queued wakeup is disabled by default. ++ * ++ * If this ops flag is set, queued wakeup optimization is enabled and ++ * the BPF scheduler must be able to handle ops.enqueue() invoked on the ++ * wakee's CPU without preceding ops.select_cpu() even for tasks which ++ * may be executed on multiple CPUs. ++ */ ++ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, ++ ++ /* ++ * If set, enable per-node idle cpumasks. If clear, use a single global ++ * flat idle cpumask. ++ */ ++ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, ++ ++ /* ++ * CPU cgroup support flags ++ */ ++ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ ++ ++ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | ++ SCX_OPS_ENQ_LAST | ++ SCX_OPS_ENQ_EXITING | ++ SCX_OPS_ENQ_MIGRATION_DISABLED | ++ SCX_OPS_ALLOW_QUEUED_WAKEUP | ++ SCX_OPS_SWITCH_PARTIAL | ++ SCX_OPS_BUILTIN_IDLE_PER_NODE | ++ SCX_OPS_HAS_CGROUP_WEIGHT, ++ ++ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ ++ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, ++ ++ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, ++}; ++ ++/* argument container for ops.init_task() */ ++struct scx_init_task_args { ++ /* ++ * Set if ops.init_task() is being invoked on the fork path, as opposed ++ * to the scheduler transition path. ++ */ ++ bool fork; ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /* the cgroup the task is joining */ ++ struct cgroup *cgroup; ++#endif ++}; ++ ++/* argument container for ops.exit_task() */ ++struct scx_exit_task_args { ++ /* Whether the task exited before running on sched_ext. */ ++ bool cancelled; ++}; ++ ++/* argument container for ops->cgroup_init() */ ++struct scx_cgroup_init_args { ++ /* the weight of the cgroup [1..10000] */ ++ u32 weight; ++ ++ /* bandwidth control parameters from cpu.max and cpu.max.burst */ ++ u64 bw_period_us; ++ u64 bw_quota_us; ++ u64 bw_burst_us; ++}; ++ ++enum scx_cpu_preempt_reason { ++ /* next task is being scheduled by &sched_class_rt */ ++ SCX_CPU_PREEMPT_RT, ++ /* next task is being scheduled by &sched_class_dl */ ++ SCX_CPU_PREEMPT_DL, ++ /* next task is being scheduled by &sched_class_stop */ ++ SCX_CPU_PREEMPT_STOP, ++ /* unknown reason for SCX being preempted */ ++ SCX_CPU_PREEMPT_UNKNOWN, ++}; ++ ++/* ++ * Argument container for ops->cpu_acquire(). 
Currently empty, but may be ++ * expanded in the future. ++ */ ++struct scx_cpu_acquire_args {}; ++ ++/* argument container for ops->cpu_release() */ ++struct scx_cpu_release_args { ++ /* the reason the CPU was preempted */ ++ enum scx_cpu_preempt_reason reason; ++ ++ /* the task that's going to be scheduled on the CPU */ ++ struct task_struct *task; ++}; ++ ++/* ++ * Informational context provided to dump operations. ++ */ ++struct scx_dump_ctx { ++ enum scx_exit_kind kind; ++ s64 exit_code; ++ const char *reason; ++ u64 at_ns; ++ u64 at_jiffies; ++}; ++ ++/** ++ * struct sched_ext_ops - Operation table for BPF scheduler implementation ++ * ++ * A BPF scheduler can implement an arbitrary scheduling policy by ++ * implementing and loading operations in this table. Note that a userland ++ * scheduling policy can also be implemented using the BPF scheduler ++ * as a shim layer. ++ */ ++struct sched_ext_ops { ++ /** ++ * @select_cpu: Pick the target CPU for a task which is being woken up ++ * @p: task being woken up ++ * @prev_cpu: the cpu @p was on before sleeping ++ * @wake_flags: SCX_WAKE_* ++ * ++ * Decision made here isn't final. @p may be moved to any CPU while it ++ * is getting dispatched for execution later. However, as @p is not on ++ * the rq at this point, getting the eventual execution CPU right here ++ * saves a small bit of overhead down the line. ++ * ++ * If an idle CPU is returned, the CPU is kicked and will try to ++ * dispatch. While an explicit custom mechanism can be added, ++ * select_cpu() serves as the default way to wake up idle CPUs. ++ * ++ * @p may be inserted into a DSQ directly by calling ++ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. ++ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ ++ * of the CPU returned by this operation. ++ * ++ * Note that select_cpu() is never called for tasks that can only run ++ * on a single CPU or tasks with migration disabled, as they don't have ++ * the option to select a different CPU. See select_task_rq() for ++ * details. ++ */ ++ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); ++ ++ /** ++ * @enqueue: Enqueue a task on the BPF scheduler ++ * @p: task being enqueued ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * @p is ready to run. Insert directly into a DSQ by calling ++ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly ++ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, ++ * the task will stall. ++ * ++ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is ++ * skipped. ++ */ ++ void (*enqueue)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * @dequeue: Remove a task from the BPF scheduler ++ * @p: task being dequeued ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * Remove @p from the BPF scheduler. This is usually called to isolate ++ * the task while updating its scheduling properties (e.g. priority). ++ * ++ * The ext core keeps track of whether the BPF side owns a given task or ++ * not and can gracefully ignore spurious dispatches from BPF side, ++ * which makes it safe to not implement this method. However, depending ++ * on the scheduling logic, this can lead to confusing behaviors - e.g. ++ * scheduling position not being updated across a priority change. 
++ */ ++ void (*dequeue)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs ++ * @cpu: CPU to dispatch tasks for ++ * @prev: previous task being switched out ++ * ++ * Called when a CPU's local dsq is empty. The operation should dispatch ++ * one or more tasks from the BPF scheduler into the DSQs using ++ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ ++ * using scx_bpf_dsq_move_to_local(). ++ * ++ * The maximum number of times scx_bpf_dsq_insert() can be called ++ * without an intervening scx_bpf_dsq_move_to_local() is specified by ++ * ops.dispatch_max_batch. See the comments on top of the two functions ++ * for more details. ++ * ++ * When not %NULL, @prev is an SCX task with its slice depleted. If ++ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in ++ * @prev->scx.flags, it is not enqueued yet and will be enqueued after ++ * ops.dispatch() returns. To keep executing @prev, return without ++ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. ++ */ ++ void (*dispatch)(s32 cpu, struct task_struct *prev); ++ ++ /** ++ * @tick: Periodic tick ++ * @p: task running currently ++ * ++ * This operation is called every 1/HZ seconds on CPUs which are ++ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an ++ * immediate dispatch cycle on the CPU. ++ */ ++ void (*tick)(struct task_struct *p); ++ ++ /** ++ * @runnable: A task is becoming runnable on its associated CPU ++ * @p: task becoming runnable ++ * @enq_flags: %SCX_ENQ_* ++ * ++ * This and the following three functions can be used to track a task's ++ * execution state transitions. A task becomes ->runnable() on a CPU, ++ * and then goes through one or more ->running() and ->stopping() pairs ++ * as it runs on the CPU, and eventually becomes ->quiescent() when it's ++ * done running on the CPU. ++ * ++ * @p is becoming runnable on the CPU because it's ++ * ++ * - waking up (%SCX_ENQ_WAKEUP) ++ * - being moved from another CPU ++ * - being restored after temporarily taken off the queue for an ++ * attribute change. ++ * ++ * This and ->enqueue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be followed by ->enqueue() ++ * e.g. when @p is being dispatched to a remote CPU, or when @p is ++ * being enqueued on a CPU experiencing a hotplug event. Likewise, a ++ * task may be ->enqueue()'d without being preceded by this operation ++ * e.g. after exhausting its slice. ++ */ ++ void (*runnable)(struct task_struct *p, u64 enq_flags); ++ ++ /** ++ * @running: A task is starting to run on its associated CPU ++ * @p: task starting to run ++ * ++ * Note that this callback may be called from a CPU other than the ++ * one the task is going to run on. This can happen when a task ++ * property is changed (i.e., affinity), since scx_next_task_scx(), ++ * which triggers this callback, may run on a CPU different from ++ * the task's assigned CPU. ++ * ++ * Therefore, always use scx_bpf_task_cpu(@p) to determine the ++ * target CPU the task is going to use. ++ * ++ * See ->runnable() for explanation on the task state notifiers. ++ */ ++ void (*running)(struct task_struct *p); ++ ++ /** ++ * @stopping: A task is stopping execution ++ * @p: task stopping to run ++ * @runnable: is task @p still runnable? ++ * ++ * Note that this callback may be called from a CPU other than the ++ * one the task was running on. 
This can happen when a task ++ * property is changed (i.e., affinity), since dequeue_task_scx(), ++ * which triggers this callback, may run on a CPU different from ++ * the task's assigned CPU. ++ * ++ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU ++ * the task was running on. ++ * ++ * See ->runnable() for explanation on the task state notifiers. If ++ * !@runnable, ->quiescent() will be invoked after this operation ++ * returns. ++ */ ++ void (*stopping)(struct task_struct *p, bool runnable); ++ ++ /** ++ * @quiescent: A task is becoming not runnable on its associated CPU ++ * @p: task becoming not runnable ++ * @deq_flags: %SCX_DEQ_* ++ * ++ * See ->runnable() for explanation on the task state notifiers. ++ * ++ * @p is becoming quiescent on the CPU because it's ++ * ++ * - sleeping (%SCX_DEQ_SLEEP) ++ * - being moved to another CPU ++ * - being temporarily taken off the queue for an attribute change ++ * (%SCX_DEQ_SAVE) ++ * ++ * This and ->dequeue() are related but not coupled. This operation ++ * notifies @p's state transition and may not be preceded by ->dequeue() ++ * e.g. when @p is being dispatched to a remote CPU. ++ */ ++ void (*quiescent)(struct task_struct *p, u64 deq_flags); ++ ++ /** ++ * @yield: Yield CPU ++ * @from: yielding task ++ * @to: optional yield target task ++ * ++ * If @to is NULL, @from is yielding the CPU to other runnable tasks. ++ * The BPF scheduler should ensure that other available tasks are ++ * dispatched before the yielding task. Return value is ignored in this ++ * case. ++ * ++ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf ++ * scheduler can implement the request, return %true; otherwise, %false. ++ */ ++ bool (*yield)(struct task_struct *from, struct task_struct *to); ++ ++ /** ++ * @core_sched_before: Task ordering for core-sched ++ * @a: task A ++ * @b: task B ++ * ++ * Used by core-sched to determine the ordering between two tasks. See ++ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on ++ * core-sched. ++ * ++ * Both @a and @b are runnable and may or may not currently be queued on ++ * the BPF scheduler. Should return %true if @a should run before @b. ++ * %false if there's no required ordering or @b should run before @a. ++ * ++ * If not specified, the default is ordering them according to when they ++ * became runnable. ++ */ ++ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); ++ ++ /** ++ * @set_weight: Set task weight ++ * @p: task to set weight for ++ * @weight: new weight [1..10000] ++ * ++ * Update @p's weight to @weight. ++ */ ++ void (*set_weight)(struct task_struct *p, u32 weight); ++ ++ /** ++ * @set_cpumask: Set CPU affinity ++ * @p: task to set CPU affinity for ++ * @cpumask: cpumask of cpus that @p can run on ++ * ++ * Update @p's CPU affinity to @cpumask. ++ */ ++ void (*set_cpumask)(struct task_struct *p, ++ const struct cpumask *cpumask); ++ ++ /** ++ * @update_idle: Update the idle state of a CPU ++ * @cpu: CPU to update the idle state for ++ * @idle: whether entering or exiting the idle state ++ * ++ * This operation is called when @rq's CPU goes or leaves the idle ++ * state. 
By default, implementing this operation disables the built-in ++ * idle CPU tracking and the following helpers become unavailable: ++ * ++ * - scx_bpf_select_cpu_dfl() ++ * - scx_bpf_select_cpu_and() ++ * - scx_bpf_test_and_clear_cpu_idle() ++ * - scx_bpf_pick_idle_cpu() ++ * ++ * The user also must implement ops.select_cpu() as the default ++ * implementation relies on scx_bpf_select_cpu_dfl(). ++ * ++ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle ++ * tracking. ++ */ ++ void (*update_idle)(s32 cpu, bool idle); ++ ++ /** ++ * @cpu_acquire: A CPU is becoming available to the BPF scheduler ++ * @cpu: The CPU being acquired by the BPF scheduler. ++ * @args: Acquire arguments, see the struct definition. ++ * ++ * A CPU that was previously released from the BPF scheduler is now once ++ * again under its control. ++ */ ++ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); ++ ++ /** ++ * @cpu_release: A CPU is taken away from the BPF scheduler ++ * @cpu: The CPU being released by the BPF scheduler. ++ * @args: Release arguments, see the struct definition. ++ * ++ * The specified CPU is no longer under the control of the BPF ++ * scheduler. This could be because it was preempted by a higher ++ * priority sched_class, though there may be other reasons as well. The ++ * caller should consult @args->reason to determine the cause. ++ */ ++ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); ++ ++ /** ++ * @init_task: Initialize a task to run in a BPF scheduler ++ * @p: task to initialize for BPF scheduling ++ * @args: init arguments, see the struct definition ++ * ++ * Either we're loading a BPF scheduler or a new task is being forked. ++ * Initialize @p for BPF scheduling. This operation may block and can ++ * be used for allocations, and is called exactly once for a task. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During a fork, it ++ * will abort that specific fork. ++ */ ++ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); ++ ++ /** ++ * @exit_task: Exit a previously-running task from the system ++ * @p: task to exit ++ * @args: exit arguments, see the struct definition ++ * ++ * @p is exiting or the BPF scheduler is being unloaded. Perform any ++ * necessary cleanup for @p. ++ */ ++ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); ++ ++ /** ++ * @enable: Enable BPF scheduling for a task ++ * @p: task to enable BPF scheduling for ++ * ++ * Enable @p for BPF scheduling. enable() is called on @p any time it ++ * enters SCX, and is always paired with a matching disable(). ++ */ ++ void (*enable)(struct task_struct *p); ++ ++ /** ++ * @disable: Disable BPF scheduling for a task ++ * @p: task to disable BPF scheduling for ++ * ++ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. ++ * Disable BPF scheduling for @p. A disable() call is always matched ++ * with a prior enable() call. ++ */ ++ void (*disable)(struct task_struct *p); ++ ++ /** ++ * @dump: Dump BPF scheduler state on error ++ * @ctx: debug dump context ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 
++ */ ++ void (*dump)(struct scx_dump_ctx *ctx); ++ ++ /** ++ * @dump_cpu: Dump BPF scheduler state for a CPU on error ++ * @ctx: debug dump context ++ * @cpu: CPU to generate debug dump for ++ * @idle: @cpu is currently idle without any runnable tasks ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @cpu. If @idle is %true and this operation doesn't produce any ++ * output, @cpu is skipped for dump. ++ */ ++ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); ++ ++ /** ++ * @dump_task: Dump BPF scheduler state for a runnable task on error ++ * @ctx: debug dump context ++ * @p: runnable task to generate debug dump for ++ * ++ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for ++ * @p. ++ */ ++ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); ++ ++#ifdef CONFIG_EXT_GROUP_SCHED ++ /** ++ * @cgroup_init: Initialize a cgroup ++ * @cgrp: cgroup being initialized ++ * @args: init arguments, see the struct definition ++ * ++ * Either the BPF scheduler is being loaded or @cgrp created, initialize ++ * @cgrp for sched_ext. This operation may block. ++ * ++ * Return 0 for success, -errno for failure. An error return while ++ * loading will abort loading of the BPF scheduler. During cgroup ++ * creation, it will abort the specific cgroup creation. ++ */ ++ s32 (*cgroup_init)(struct cgroup *cgrp, ++ struct scx_cgroup_init_args *args); ++ ++ /** ++ * @cgroup_exit: Exit a cgroup ++ * @cgrp: cgroup being exited ++ * ++ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit ++ * @cgrp for sched_ext. This operation my block. ++ */ ++ void (*cgroup_exit)(struct cgroup *cgrp); ++ ++ /** ++ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Prepare @p for move from cgroup @from to @to. This operation may ++ * block and can be used for allocations. ++ * ++ * Return 0 for success, -errno for failure. An error return aborts the ++ * migration. ++ */ ++ s32 (*cgroup_prep_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_move: Commit cgroup move ++ * @p: task being moved ++ * @from: cgroup @p is being moved from ++ * @to: cgroup @p is being moved to ++ * ++ * Commit the move. @p is dequeued during this operation. ++ */ ++ void (*cgroup_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_cancel_move: Cancel cgroup move ++ * @p: task whose cgroup move is being canceled ++ * @from: cgroup @p was being moved from ++ * @to: cgroup @p was being moved to ++ * ++ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). ++ * Undo the preparation. ++ */ ++ void (*cgroup_cancel_move)(struct task_struct *p, ++ struct cgroup *from, struct cgroup *to); ++ ++ /** ++ * @cgroup_set_weight: A cgroup's weight is being changed ++ * @cgrp: cgroup whose weight is being updated ++ * @weight: new weight [1..10000] ++ * ++ * Update @cgrp's weight to @weight. ++ */ ++ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); ++ ++ /** ++ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed ++ * @cgrp: cgroup whose bandwidth is being updated ++ * @period_us: bandwidth control period ++ * @quota_us: bandwidth control quota ++ * @burst_us: bandwidth control burst ++ * ++ * Update @cgrp's bandwidth control parameters. This is from the cpu.max ++ * cgroup interface. 
++ * ++ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled ++ * to. For example, if @period_us is 1_000_000 and @quota_us is ++ * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be ++ * interpreted in the same fashion and specifies how much @cgrp can ++ * burst temporarily. The specific control mechanism and thus the ++ * interpretation of @period_us and burstiness is upto to the BPF ++ * scheduler. ++ */ ++ void (*cgroup_set_bandwidth)(struct cgroup *cgrp, ++ u64 period_us, u64 quota_us, u64 burst_us); ++ ++#endif /* CONFIG_EXT_GROUP_SCHED */ ++ ++ /* ++ * All online ops must come before ops.cpu_online(). ++ */ ++ ++ /** ++ * @cpu_online: A CPU became online ++ * @cpu: CPU which just came up ++ * ++ * @cpu just came online. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. ++ */ ++ void (*cpu_online)(s32 cpu); ++ ++ /** ++ * @cpu_offline: A CPU is going offline ++ * @cpu: CPU which is going offline ++ * ++ * @cpu is going offline. @cpu will not call ops.enqueue() or ++ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. ++ */ ++ void (*cpu_offline)(s32 cpu); ++ ++ /* ++ * All CPU hotplug ops must come before ops.init(). ++ */ ++ ++ /** ++ * @init: Initialize the BPF scheduler ++ */ ++ s32 (*init)(void); ++ ++ /** ++ * @exit: Clean up after the BPF scheduler ++ * @info: Exit info ++ * ++ * ops.exit() is also called on ops.init() failure, which is a bit ++ * unusual. This is to allow rich reporting through @info on how ++ * ops.init() failed. ++ */ ++ void (*exit)(struct scx_exit_info *info); ++ ++ /** ++ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch ++ */ ++ u32 dispatch_max_batch; ++ ++ /** ++ * @flags: %SCX_OPS_* flags ++ */ ++ u64 flags; ++ ++ /** ++ * @timeout_ms: The maximum amount of time, in milliseconds, that a ++ * runnable task should be able to wait before being scheduled. The ++ * maximum timeout may not exceed the default timeout of 30 seconds. ++ * ++ * Defaults to the maximum allowed timeout value of 30 seconds. ++ */ ++ u32 timeout_ms; ++ ++ /** ++ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default ++ * value of 32768 is used. ++ */ ++ u32 exit_dump_len; ++ ++ /** ++ * @hotplug_seq: A sequence number that may be set by the scheduler to ++ * detect when a hotplug event has occurred during the loading process. ++ * If 0, no detection occurs. Otherwise, the scheduler will fail to ++ * load if the sequence number does not match @scx_hotplug_seq on the ++ * enable path. ++ */ ++ u64 hotplug_seq; ++ ++ /** ++ * @name: BPF scheduler's name ++ * ++ * Must be a non-zero valid BPF object name including only isalnum(), ++ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the ++ * BPF scheduler is enabled. ++ */ ++ char name[SCX_OPS_NAME_LEN]; ++ ++ /* internal use only, must be NULL */ ++ void *priv; ++}; ++ ++enum scx_opi { ++ SCX_OPI_BEGIN = 0, ++ SCX_OPI_NORMAL_BEGIN = 0, ++ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), ++ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), ++ SCX_OPI_END = SCX_OP_IDX(init), ++}; ++ ++/* ++ * Collection of event counters. Event types are placed in descending order. ++ */ ++struct scx_event_stats { ++ /* ++ * If ops.select_cpu() returns a CPU which can't be used by the task, ++ * the core scheduler code silently picks a fallback CPU. 
++ */ ++ s64 SCX_EV_SELECT_CPU_FALLBACK; ++ ++ /* ++ * When dispatching to a local DSQ, the CPU may have gone offline in ++ * the meantime. In this case, the task is bounced to the global DSQ. ++ */ ++ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; ++ ++ /* ++ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task ++ * continued to run because there were no other tasks on the CPU. ++ */ ++ s64 SCX_EV_DISPATCH_KEEP_LAST; ++ ++ /* ++ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task ++ * is dispatched to a local DSQ when exiting. ++ */ ++ s64 SCX_EV_ENQ_SKIP_EXITING; ++ ++ /* ++ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a ++ * migration disabled task skips ops.enqueue() and is dispatched to its ++ * local DSQ. ++ */ ++ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; ++ ++ /* ++ * Total number of times a task's time slice was refilled with the ++ * default value (SCX_SLICE_DFL). ++ */ ++ s64 SCX_EV_REFILL_SLICE_DFL; ++ ++ /* ++ * The total duration of bypass modes in nanoseconds. ++ */ ++ s64 SCX_EV_BYPASS_DURATION; ++ ++ /* ++ * The number of tasks dispatched in the bypassing mode. ++ */ ++ s64 SCX_EV_BYPASS_DISPATCH; ++ ++ /* ++ * The number of times the bypassing mode has been activated. ++ */ ++ s64 SCX_EV_BYPASS_ACTIVATE; ++}; ++ ++struct scx_sched_pcpu { ++ /* ++ * The event counters are in a per-CPU variable to minimize the ++ * accounting overhead. A system-wide view on the event counter is ++ * constructed when requested by scx_bpf_events(). ++ */ ++ struct scx_event_stats event_stats; ++}; ++ ++struct scx_sched { ++ struct sched_ext_ops ops; ++ DECLARE_BITMAP(has_op, SCX_OPI_END); ++ ++ /* ++ * Dispatch queues. ++ * ++ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. ++ * This is to avoid live-locking in bypass mode where all tasks are ++ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If ++ * per-node split isn't sufficient, it can be further split. ++ */ ++ struct rhashtable dsq_hash; ++ struct scx_dispatch_q **global_dsqs; ++ struct scx_sched_pcpu __percpu *pcpu; ++ ++ bool warned_zero_slice; ++ ++ atomic_t exit_kind; ++ struct scx_exit_info *exit_info; ++ ++ struct kobject kobj; ++ ++ struct kthread_worker *helper; ++ struct irq_work error_irq_work; ++ struct kthread_work disable_work; ++ struct rcu_work rcu_work; ++}; ++ ++enum scx_wake_flags { ++ /* expose select WF_* flags as enums */ ++ SCX_WAKE_FORK = WF_FORK, ++ SCX_WAKE_TTWU = WF_TTWU, ++ SCX_WAKE_SYNC = WF_SYNC, ++}; ++ ++enum scx_enq_flags { ++ /* expose select ENQUEUE_* flags as enums */ ++ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, ++ SCX_ENQ_HEAD = ENQUEUE_HEAD, ++ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * Set the following to trigger preemption when calling ++ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the ++ * current task is cleared to zero and the CPU is kicked into the ++ * scheduling path. Implies %SCX_ENQ_HEAD. ++ */ ++ SCX_ENQ_PREEMPT = 1LLU << 32, ++ ++ /* ++ * The task being enqueued was previously enqueued on the current CPU's ++ * %SCX_DSQ_LOCAL, but was removed from it in a call to the ++ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was ++ * invoked in a ->cpu_release() callback, and the task is again ++ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the ++ * task will not be scheduled on the CPU until at least the next invocation ++ * of the ->cpu_acquire() callback. 
++ */ ++ SCX_ENQ_REENQ = 1LLU << 40, ++ ++ /* ++ * The task being enqueued is the only task available for the cpu. By ++ * default, ext core keeps executing such tasks but when ++ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the ++ * %SCX_ENQ_LAST flag set. ++ * ++ * The BPF scheduler is responsible for triggering a follow-up ++ * scheduling event. Otherwise, Execution may stall. ++ */ ++ SCX_ENQ_LAST = 1LLU << 41, ++ ++ /* high 8 bits are internal */ ++ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, ++ ++ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, ++ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, ++}; ++ ++enum scx_deq_flags { ++ /* expose select DEQUEUE_* flags as enums */ ++ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, ++ ++ /* high 32bits are SCX specific */ ++ ++ /* ++ * The generic core-sched layer decided to execute the task even though ++ * it hasn't been dispatched yet. Dequeue from the BPF side. ++ */ ++ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, ++}; ++ ++enum scx_pick_idle_cpu_flags { ++ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ ++ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ ++}; ++ ++enum scx_kick_flags { ++ /* ++ * Kick the target CPU if idle. Guarantees that the target CPU goes ++ * through at least one full scheduling cycle before going idle. If the ++ * target CPU can be determined to be currently not idle and going to go ++ * through a scheduling cycle before going idle, noop. ++ */ ++ SCX_KICK_IDLE = 1LLU << 0, ++ ++ /* ++ * Preempt the current task and execute the dispatch path. If the ++ * current task of the target CPU is an SCX task, its ->scx.slice is ++ * cleared to zero before the scheduling path is invoked so that the ++ * task expires and the dispatch path is invoked. ++ */ ++ SCX_KICK_PREEMPT = 1LLU << 1, ++ ++ /* ++ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will ++ * return after the target CPU finishes picking the next task. ++ */ ++ SCX_KICK_WAIT = 1LLU << 2, ++}; ++ ++enum scx_tg_flags { ++ SCX_TG_ONLINE = 1U << 0, ++ SCX_TG_INITED = 1U << 1, ++}; ++ ++enum scx_enable_state { ++ SCX_ENABLING, ++ SCX_ENABLED, ++ SCX_DISABLING, ++ SCX_DISABLED, ++}; ++ ++static const char *scx_enable_state_str[] = { ++ [SCX_ENABLING] = "enabling", ++ [SCX_ENABLED] = "enabled", ++ [SCX_DISABLING] = "disabling", ++ [SCX_DISABLED] = "disabled", ++}; ++ ++/* ++ * sched_ext_entity->ops_state ++ * ++ * Used to track the task ownership between the SCX core and the BPF scheduler. ++ * State transitions look as follows: ++ * ++ * NONE -> QUEUEING -> QUEUED -> DISPATCHING ++ * ^ | | ++ * | v v ++ * \-------------------------------/ ++ * ++ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call ++ * sites for explanations on the conditions being waited upon and why they are ++ * safe. Transitions out of them into NONE or QUEUED must store_release and the ++ * waiters should load_acquire. ++ * ++ * Tracking scx_ops_state enables sched_ext core to reliably determine whether ++ * any given task can be dispatched by the BPF scheduler at all times and thus ++ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler ++ * to try to dispatch any task anytime regardless of its state as the SCX core ++ * can safely reject invalid dispatches. 
++ */ ++enum scx_ops_state { ++ SCX_OPSS_NONE, /* owned by the SCX core */ ++ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ ++ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ ++ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ ++ ++ /* ++ * QSEQ brands each QUEUED instance so that, when dispatch races ++ * dequeue/requeue, the dispatcher can tell whether it still has a claim ++ * on the task being dispatched. ++ * ++ * As some 32bit archs can't do 64bit store_release/load_acquire, ++ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on ++ * 32bit machines. The dispatch race window QSEQ protects is very narrow ++ * and runs with IRQ disabled. 30 bits should be sufficient. ++ */ ++ SCX_OPSS_QSEQ_SHIFT = 2, ++}; ++ ++/* Use macros to ensure that the type is unsigned long for the masks */ ++#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) ++#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) ++ ++DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); ++ ++/* ++ * Return the rq currently locked from an scx callback, or NULL if no rq is ++ * locked. ++ */ ++static inline struct rq *scx_locked_rq(void) ++{ ++ return __this_cpu_read(scx_locked_rq_state); ++} ++ ++static inline bool scx_kf_allowed_if_unlocked(void) ++{ ++ return !current->scx.kf_mask; ++} ++ ++static inline bool scx_rq_bypassing(struct rq *rq) ++{ ++ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); ++} +diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index 3bbfba30a777a1..25f62867a16d93 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) + } + + #ifdef SECCOMP_ARCH_NATIVE ++static bool seccomp_uprobe_exception(struct seccomp_data *sd) ++{ ++#if defined __NR_uretprobe || defined __NR_uprobe ++#ifdef SECCOMP_ARCH_COMPAT ++ if (sd->arch == SECCOMP_ARCH_NATIVE) ++#endif ++ { ++#ifdef __NR_uretprobe ++ if (sd->nr == __NR_uretprobe) ++ return true; ++#endif ++#ifdef __NR_uprobe ++ if (sd->nr == __NR_uprobe) ++ return true; ++#endif ++ } ++#endif ++ return false; ++} ++ + /** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs +@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + return false; + + /* Our single exception to filtering. 
*/ +-#ifdef __NR_uretprobe +-#ifdef SECCOMP_ARCH_COMPAT +- if (sd->arch == SECCOMP_ARCH_NATIVE) +-#endif +- if (sd->nr == __NR_uretprobe) +- return true; +-#endif ++ if (seccomp_uprobe_exception(sd)) ++ return true; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; +@@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { + __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, + #ifdef __NR_uretprobe + __NR_uretprobe, ++#endif ++#ifdef __NR_uprobe ++ __NR_uprobe, + #endif + -1, /* negative terminated */ + }; +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index b6974fce800cd8..3a4d3b2e3f7409 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -3070,7 +3070,7 @@ static int __init tk_aux_sysfs_init(void) + return -ENOMEM; + } + +- for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { ++ for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + +diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c +index 69d877501cb727..cd50a94326e3a9 100644 +--- a/tools/sched_ext/scx_qmap.bpf.c ++++ b/tools/sched_ext/scx_qmap.bpf.c +@@ -56,7 +56,8 @@ struct qmap { + queue1 SEC(".maps"), + queue2 SEC(".maps"), + queue3 SEC(".maps"), +- queue4 SEC(".maps"); ++ queue4 SEC(".maps"), ++ dump_store SEC(".maps"); + + struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); +@@ -578,11 +579,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) + return; + + scx_bpf_dump("QMAP FIFO[%d]:", i); ++ ++ /* ++ * Dump can be invoked anytime and there is no way to iterate in ++ * a non-destructive way. Pop and store in dump_store and then ++ * restore afterwards. If racing against new enqueues, ordering ++ * can get mixed up. ++ */ + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; ++ bpf_map_push_elem(&dump_store, &pid, 0); + scx_bpf_dump(" %d", pid); + } ++ ++ bpf_repeat(4096) { ++ if (bpf_map_pop_elem(&dump_store, &pid)) ++ break; ++ bpf_map_push_elem(fifo, &pid, 0); ++ } ++ + scx_bpf_dump("\n"); + } + }
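
The sched_ext hunks above replace the bare event_stats_cpu percpu allocation with a struct scx_sched_pcpu wrapper (sch->pcpu), so event counters are still bumped on the local CPU via this_cpu_add()/__this_cpu_add() and only summed across CPUs when scx_read_events() walks for_each_possible_cpu(). Below is a minimal userspace sketch of that update-locally/aggregate-on-read pattern — not kernel code; the names and the fixed NR_CPUS slot array are invented stand-ins for the real percpu storage.

/*
 * Illustrative analogue of the per-CPU event counters kept in
 * struct scx_sched_pcpu: each "CPU" bumps only its own slot, and a
 * system-wide view is built only when a reader asks, mirroring
 * scx_add_event() / scx_read_events(). Hypothetical names, not
 * kernel APIs.
 */
#include <stdio.h>

#define NR_CPUS 4

struct event_stats {
	long long select_cpu_fallback;
	long long refill_slice_dfl;
};

/* one counter block per CPU, standing in for sch->pcpu->event_stats */
static struct event_stats pcpu_stats[NR_CPUS];

/* update path: touch only the local CPU's slot, no shared cacheline */
static void add_event_refill(int cpu, long long cnt)
{
	pcpu_stats[cpu].refill_slice_dfl += cnt;
}

/* read path: aggregate across all CPUs, like scx_read_events() */
static void read_events(struct event_stats *out)
{
	*out = (struct event_stats){ 0 };
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		out->select_cpu_fallback += pcpu_stats[cpu].select_cpu_fallback;
		out->refill_slice_dfl += pcpu_stats[cpu].refill_slice_dfl;
	}
}

int main(void)
{
	struct event_stats snap;

	add_event_refill(0, 3);
	add_event_refill(2, 5);
	read_events(&snap);
	printf("refill_slice_dfl total: %lld\n", snap.refill_slice_dfl);
	return 0;
}

In the kernel the slot array is of course real percpu memory: the patch's error paths in scx_alloc_and_add_sched() now allocate and free sch->pcpu with alloc_percpu()/free_percpu(), and the aggregation loop reads &per_cpu_ptr(sch->pcpu, cpu)->event_stats; wrapping the counters in scx_sched_pcpu only changes the container so further per-CPU fields can share the same allocation.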
