commit: e758eda21d87b283279b863edee5538d33067904 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org> AuthorDate: Sat Nov 16 10:54:45 2019 +0000 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org> CommitDate: Sat Nov 16 10:54:45 2019 +0000 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=e758eda2
Linux patch 4.9.202 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org> 0000_README | 4 + 1201_linux-4.9.202.patch | 3319 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 3323 insertions(+) diff --git a/0000_README b/0000_README index 7d079d7..33f5858 100644 --- a/0000_README +++ b/0000_README @@ -847,6 +847,10 @@ Patch: 1200_linux-4.9.201.patch From: http://www.kernel.org Desc: Linux 4.9.201 +Patch: 1201_linux-4.9.202.patch +From: http://www.kernel.org +Desc: Linux 4.9.202 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. diff --git a/1201_linux-4.9.202.patch b/1201_linux-4.9.202.patch new file mode 100644 index 0000000..bbc562b --- /dev/null +++ b/1201_linux-4.9.202.patch @@ -0,0 +1,3319 @@ +diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu +index cadb7a9a5218..b41046b5713b 100644 +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -358,6 +358,8 @@ What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/spec_store_bypass + /sys/devices/system/cpu/vulnerabilities/l1tf + /sys/devices/system/cpu/vulnerabilities/mds ++ /sys/devices/system/cpu/vulnerabilities/tsx_async_abort ++ /sys/devices/system/cpu/vulnerabilities/itlb_multihit + Date: January 2018 + Contact: Linux kernel mailing list <[email protected]> + Description: Information about CPU vulnerabilities +diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst +index ffc064c1ec68..24f53c501366 100644 +--- a/Documentation/hw-vuln/index.rst ++++ b/Documentation/hw-vuln/index.rst +@@ -11,3 +11,5 @@ are configurable at compile, boot or run time. + + l1tf + mds ++ tsx_async_abort ++ multihit.rst +diff --git a/Documentation/hw-vuln/multihit.rst b/Documentation/hw-vuln/multihit.rst +new file mode 100644 +index 000000000000..ba9988d8bce5 +--- /dev/null ++++ b/Documentation/hw-vuln/multihit.rst +@@ -0,0 +1,163 @@ ++iTLB multihit ++============= ++ ++iTLB multihit is an erratum where some processors may incur a machine check ++error, possibly resulting in an unrecoverable CPU lockup, when an ++instruction fetch hits multiple entries in the instruction TLB. This can ++occur when the page size is changed along with either the physical address ++or cache type. A malicious guest running on a virtualized system can ++exploit this erratum to perform a denial of service attack. ++ ++ ++Affected processors ++------------------- ++ ++Variations of this erratum are present on most Intel Core and Xeon processor ++models. The erratum is not present on: ++ ++ - non-Intel processors ++ ++ - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont) ++ ++ - Intel processors that have the PSCHANGE_MC_NO bit set in the ++ IA32_ARCH_CAPABILITIES MSR. ++ ++ ++Related CVEs ++------------ ++ ++The following CVE entry is related to this issue: ++ ++ ============== ================================================= ++ CVE-2018-12207 Machine Check Error Avoidance on Page Size Change ++ ============== ================================================= ++ ++ ++Problem ++------- ++ ++Privileged software, including OS and virtual machine managers (VMM), are in ++charge of memory management. A key component in memory management is the control ++of the page tables. Modern processors use virtual memory, a technique that creates ++the illusion of a very large memory for processors. This virtual space is split ++into pages of a given size. Page tables translate virtual addresses to physical ++addresses. ++ ++To reduce latency when performing a virtual to physical address translation, ++processors include a structure, called TLB, that caches recent translations. ++There are separate TLBs for instruction (iTLB) and data (dTLB). ++ ++Under this errata, instructions are fetched from a linear address translated ++using a 4 KB translation cached in the iTLB. Privileged software modifies the ++paging structure so that the same linear address using large page size (2 MB, 4 ++MB, 1 GB) with a different physical address or memory type. After the page ++structure modification but before the software invalidates any iTLB entries for ++the linear address, a code fetch that happens on the same linear address may ++cause a machine-check error which can result in a system hang or shutdown. ++ ++ ++Attack scenarios ++---------------- ++ ++Attacks against the iTLB multihit erratum can be mounted from malicious ++guests in a virtualized system. ++ ++ ++iTLB multihit system information ++-------------------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current iTLB ++multihit status of the system:whether the system is vulnerable and which ++mitigations are active. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/itlb_multihit ++ ++The possible values in this file are: ++ ++.. list-table:: ++ ++ * - Not affected ++ - The processor is not vulnerable. ++ * - KVM: Mitigation: Split huge pages ++ - Software changes mitigate this issue. ++ * - KVM: Vulnerable ++ - The processor is vulnerable, but no mitigation enabled ++ ++ ++Enumeration of the erratum ++-------------------------------- ++ ++A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) msr ++and will be set on CPU's which are mitigated against this issue. ++ ++ ======================================= =========== =============================== ++ IA32_ARCH_CAPABILITIES MSR Not present Possibly vulnerable,check model ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '0' Likely vulnerable,check model ++ IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO] '1' Not vulnerable ++ ======================================= =========== =============================== ++ ++ ++Mitigation mechanism ++------------------------- ++ ++This erratum can be mitigated by restricting the use of large page sizes to ++non-executable pages. This forces all iTLB entries to be 4K, and removes ++the possibility of multiple hits. ++ ++In order to mitigate the vulnerability, KVM initially marks all huge pages ++as non-executable. If the guest attempts to execute in one of those pages, ++the page is broken down into 4K pages, which are then marked executable. ++ ++If EPT is disabled or not available on the host, KVM is in control of TLB ++flushes and the problematic situation cannot happen. However, the shadow ++EPT paging mechanism used by nested virtualization is vulnerable, because ++the nested guest can trigger multiple iTLB hits by modifying its own ++(non-nested) page tables. For simplicity, KVM will make large pages ++non-executable in all shadow paging modes. ++ ++Mitigation control on the kernel command line and KVM - module parameter ++------------------------------------------------------------------------ ++ ++The KVM hypervisor mitigation mechanism for marking huge pages as ++non-executable can be controlled with a module parameter "nx_huge_pages=". ++The kernel command line allows to control the iTLB multihit mitigations at ++boot time with the option "kvm.nx_huge_pages=". ++ ++The valid arguments for these options are: ++ ++ ========== ================================================================ ++ force Mitigation is enabled. In this case, the mitigation implements ++ non-executable huge pages in Linux kernel KVM module. All huge ++ pages in the EPT are marked as non-executable. ++ If a guest attempts to execute in one of those pages, the page is ++ broken down into 4K pages, which are then marked executable. ++ ++ off Mitigation is disabled. ++ ++ auto Enable mitigation only if the platform is affected and the kernel ++ was not booted with the "mitigations=off" command line parameter. ++ This is the default option. ++ ========== ================================================================ ++ ++ ++Mitigation selection guide ++-------------------------- ++ ++1. No virtualization in use ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ The system is protected by the kernel unconditionally and no further ++ action is required. ++ ++2. Virtualization with trusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++ If the guest comes from a trusted source, you may assume that the guest will ++ not attempt to maliciously exploit these errata and no further action is ++ required. ++ ++3. Virtualization with untrusted guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ If the guest comes from an untrusted source, the guest host kernel will need ++ to apply iTLB multihit mitigation via the kernel command line or kvm ++ module parameter. +diff --git a/Documentation/hw-vuln/tsx_async_abort.rst b/Documentation/hw-vuln/tsx_async_abort.rst +new file mode 100644 +index 000000000000..fddbd7579c53 +--- /dev/null ++++ b/Documentation/hw-vuln/tsx_async_abort.rst +@@ -0,0 +1,276 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++TAA - TSX Asynchronous Abort ++====================================== ++ ++TAA is a hardware vulnerability that allows unprivileged speculative access to ++data which is available in various CPU internal buffers by using asynchronous ++aborts within an Intel TSX transactional region. ++ ++Affected processors ++------------------- ++ ++This vulnerability only affects Intel processors that support Intel ++Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8) ++is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit ++(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations ++also mitigate against TAA. ++ ++Whether a processor is affected or not can be read out from the TAA ++vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`. ++ ++Related CVEs ++------------ ++ ++The following CVE entry is related to this TAA issue: ++ ++ ============== ===== =================================================== ++ CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some ++ microprocessors utilizing speculative execution may ++ allow an authenticated user to potentially enable ++ information disclosure via a side channel with ++ local access. ++ ============== ===== =================================================== ++ ++Problem ++------- ++ ++When performing store, load or L1 refill operations, processors write ++data into temporary microarchitectural structures (buffers). The data in ++those buffers can be forwarded to load operations as an optimization. ++ ++Intel TSX is an extension to the x86 instruction set architecture that adds ++hardware transactional memory support to improve performance of multi-threaded ++software. TSX lets the processor expose and exploit concurrency hidden in an ++application due to dynamically avoiding unnecessary synchronization. ++ ++TSX supports atomic memory transactions that are either committed (success) or ++aborted. During an abort, operations that happened within the transactional region ++are rolled back. An asynchronous abort takes place, among other options, when a ++different thread accesses a cache line that is also used within the transactional ++region when that access might lead to a data race. ++ ++Immediately after an uncompleted asynchronous abort, certain speculatively ++executed loads may read data from those internal buffers and pass it to dependent ++operations. This can be then used to infer the value via a cache side channel ++attack. ++ ++Because the buffers are potentially shared between Hyper-Threads cross ++Hyper-Thread attacks are possible. ++ ++The victim of a malicious actor does not need to make use of TSX. Only the ++attacker needs to begin a TSX transaction and raise an asynchronous abort ++which in turn potenitally leaks data stored in the buffers. ++ ++More detailed technical information is available in the TAA specific x86 ++architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`. ++ ++ ++Attack scenarios ++---------------- ++ ++Attacks against the TAA vulnerability can be implemented from unprivileged ++applications running on hosts or guests. ++ ++As for MDS, the attacker has no control over the memory addresses that can ++be leaked. Only the victim is responsible for bringing data to the CPU. As ++a result, the malicious actor has to sample as much data as possible and ++then postprocess it to try to infer any useful information from it. ++ ++A potential attacker only has read access to the data. Also, there is no direct ++privilege escalation by using this technique. ++ ++ ++.. _tsx_async_abort_sys_info: ++ ++TAA system information ++----------------------- ++ ++The Linux kernel provides a sysfs interface to enumerate the current TAA status ++of mitigated systems. The relevant sysfs file is: ++ ++/sys/devices/system/cpu/vulnerabilities/tsx_async_abort ++ ++The possible values in this file are: ++ ++.. list-table:: ++ ++ * - 'Vulnerable' ++ - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied. ++ * - 'Vulnerable: Clear CPU buffers attempted, no microcode' ++ - The system tries to clear the buffers but the microcode might not support the operation. ++ * - 'Mitigation: Clear CPU buffers' ++ - The microcode has been updated to clear the buffers. TSX is still enabled. ++ * - 'Mitigation: TSX disabled' ++ - TSX is disabled. ++ * - 'Not affected' ++ - The CPU is not affected by this issue. ++ ++.. _ucode_needed: ++ ++Best effort mitigation mode ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If the processor is vulnerable, but the availability of the microcode-based ++mitigation mechanism is not advertised via CPUID the kernel selects a best ++effort mitigation mode. This mode invokes the mitigation instructions ++without a guarantee that they clear the CPU buffers. ++ ++This is done to address virtualization scenarios where the host has the ++microcode update applied, but the hypervisor is not yet updated to expose the ++CPUID to the guest. If the host has updated microcode the protection takes ++effect; otherwise a few CPU cycles are wasted pointlessly. ++ ++The state in the tsx_async_abort sysfs file reflects this situation ++accordingly. ++ ++ ++Mitigation mechanism ++-------------------- ++ ++The kernel detects the affected CPUs and the presence of the microcode which is ++required. If a CPU is affected and the microcode is available, then the kernel ++enables the mitigation by default. ++ ++ ++The mitigation can be controlled at boot time via a kernel command line option. ++See :ref:`taa_mitigation_control_command_line`. ++ ++.. _virt_mechanism: ++ ++Virtualization mitigation ++^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++Affected systems where the host has TAA microcode and TAA is mitigated by ++having disabled TSX previously, are not vulnerable regardless of the status ++of the VMs. ++ ++In all other cases, if the host either does not have the TAA microcode or ++the kernel is not mitigated, the system might be vulnerable. ++ ++ ++.. _taa_mitigation_control_command_line: ++ ++Mitigation control on the kernel command line ++--------------------------------------------- ++ ++The kernel command line allows to control the TAA mitigations at boot time with ++the option "tsx_async_abort=". The valid arguments for this option are: ++ ++ ============ ============================================================= ++ off This option disables the TAA mitigation on affected platforms. ++ If the system has TSX enabled (see next parameter) and the CPU ++ is affected, the system is vulnerable. ++ ++ full TAA mitigation is enabled. If TSX is enabled, on an affected ++ system it will clear CPU buffers on ring transitions. On ++ systems which are MDS-affected and deploy MDS mitigation, ++ TAA is also mitigated. Specifying this option on those ++ systems will have no effect. ++ ++ full,nosmt The same as tsx_async_abort=full, with SMT disabled on ++ vulnerable CPUs that have TSX enabled. This is the complete ++ mitigation. When TSX is disabled, SMT is not disabled because ++ CPU is not vulnerable to cross-thread TAA attacks. ++ ============ ============================================================= ++ ++Not specifying this option is equivalent to "tsx_async_abort=full". ++ ++The kernel command line also allows to control the TSX feature using the ++parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used ++to control the TSX feature and the enumeration of the TSX feature bits (RTM ++and HLE) in CPUID. ++ ++The valid options are: ++ ++ ============ ============================================================= ++ off Disables TSX on the system. ++ ++ Note that this option takes effect only on newer CPUs which are ++ not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 ++ and which get the new IA32_TSX_CTRL MSR through a microcode ++ update. This new MSR allows for the reliable deactivation of ++ the TSX functionality. ++ ++ on Enables TSX. ++ ++ Although there are mitigations for all known security ++ vulnerabilities, TSX has been known to be an accelerator for ++ several previous speculation-related CVEs, and so there may be ++ unknown security risks associated with leaving it enabled. ++ ++ auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX ++ on the system. ++ ============ ============================================================= ++ ++Not specifying this option is equivalent to "tsx=off". ++ ++The following combinations of the "tsx_async_abort" and "tsx" are possible. For ++affected platforms tsx=auto is equivalent to tsx=off and the result will be: ++ ++ ========= ========================== ========================================= ++ tsx=on tsx_async_abort=full The system will use VERW to clear CPU ++ buffers. Cross-thread attacks are still ++ possible on SMT machines. ++ tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT ++ mitigated. ++ tsx=on tsx_async_abort=off The system is vulnerable. ++ tsx=off tsx_async_abort=full TSX might be disabled if microcode ++ provides a TSX control MSR. If so, ++ system is not vulnerable. ++ tsx=off tsx_async_abort=full,nosmt Ditto ++ tsx=off tsx_async_abort=off ditto ++ ========= ========================== ========================================= ++ ++ ++For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU ++buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0) ++"tsx" command line argument has no effect. ++ ++For the affected platforms below table indicates the mitigation status for the ++combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO ++and TSX_CTRL_MSR. ++ ++ ======= ========= ============= ======================================== ++ MDS_NO MD_CLEAR TSX_CTRL_MSR Status ++ ======= ========= ============= ======================================== ++ 0 0 0 Vulnerable (needs microcode) ++ 0 1 0 MDS and TAA mitigated via VERW ++ 1 1 0 MDS fixed, TAA vulnerable if TSX enabled ++ because MD_CLEAR has no meaning and ++ VERW is not guaranteed to clear buffers ++ 1 X 1 MDS fixed, TAA can be mitigated by ++ VERW or TSX_CTRL_MSR ++ ======= ========= ============= ======================================== ++ ++Mitigation selection guide ++-------------------------- ++ ++1. Trusted userspace and guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If all user space applications are from a trusted source and do not execute ++untrusted code which is supplied externally, then the mitigation can be ++disabled. The same applies to virtualized environments with trusted guests. ++ ++ ++2. Untrusted userspace and guests ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ ++If there are untrusted applications or guests on the system, enabling TSX ++might allow a malicious actor to leak data from the host or from other ++processes running on the same physical core. ++ ++If the microcode is available and the TSX is disabled on the host, attacks ++are prevented in a virtualized environment as well, even if the VMs do not ++explicitly enable the mitigation. ++ ++ ++.. _taa_default_mitigations: ++ ++Default mitigations ++------------------- ++ ++The kernel's default action for vulnerable processors is: ++ ++ - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off). +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 61b73e42f488..c81a008d6512 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -1975,6 +1975,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + KVM MMU at runtime. + Default is 0 (off) + ++ kvm.nx_huge_pages= ++ [KVM] Controls the software workaround for the ++ X86_BUG_ITLB_MULTIHIT bug. ++ force : Always deploy workaround. ++ off : Never deploy workaround. ++ auto : Deploy workaround based on the presence of ++ X86_BUG_ITLB_MULTIHIT. ++ ++ Default is 'auto'. ++ ++ If the software workaround is enabled for the host, ++ guests do need not to enable it for nested guests. ++ ++ kvm.nx_huge_pages_recovery_ratio= ++ [KVM] Controls how many 4KiB pages are periodically zapped ++ back to huge pages. 0 disables the recovery, otherwise if ++ the value is N KVM will zap 1/Nth of the 4KiB pages every ++ minute. The default is 60. ++ + kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. + Default is 1 (enabled) + +@@ -2490,6 +2509,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + spec_store_bypass_disable=off [X86] + l1tf=off [X86] + mds=off [X86] ++ tsx_async_abort=off [X86] ++ kvm.nx_huge_pages=off [X86] ++ ++ Exceptions: ++ This does not have any effect on ++ kvm.nx_huge_pages when ++ kvm.nx_huge_pages=force. + + auto (default) + Mitigate all CPU vulnerabilities, but leave SMT +@@ -2505,6 +2531,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + be fully mitigated, even if it means losing SMT. + Equivalent to: l1tf=flush,nosmt [X86] + mds=full,nosmt [X86] ++ tsx_async_abort=full,nosmt [X86] + + mminit_loglevel= + [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this +@@ -4516,6 +4543,71 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + platforms where RDTSC is slow and this accounting + can add overhead. + ++ tsx= [X86] Control Transactional Synchronization ++ Extensions (TSX) feature in Intel processors that ++ support TSX control. ++ ++ This parameter controls the TSX feature. The options are: ++ ++ on - Enable TSX on the system. Although there are ++ mitigations for all known security vulnerabilities, ++ TSX has been known to be an accelerator for ++ several previous speculation-related CVEs, and ++ so there may be unknown security risks associated ++ with leaving it enabled. ++ ++ off - Disable TSX on the system. (Note that this ++ option takes effect only on newer CPUs which are ++ not vulnerable to MDS, i.e., have ++ MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get ++ the new IA32_TSX_CTRL MSR through a microcode ++ update. This new MSR allows for the reliable ++ deactivation of the TSX functionality.) ++ ++ auto - Disable TSX if X86_BUG_TAA is present, ++ otherwise enable TSX on the system. ++ ++ Not specifying this option is equivalent to tsx=off. ++ ++ See Documentation/hw-vuln/tsx_async_abort.rst ++ for more details. ++ ++ tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async ++ Abort (TAA) vulnerability. ++ ++ Similar to Micro-architectural Data Sampling (MDS) ++ certain CPUs that support Transactional ++ Synchronization Extensions (TSX) are vulnerable to an ++ exploit against CPU internal buffers which can forward ++ information to a disclosure gadget under certain ++ conditions. ++ ++ In vulnerable processors, the speculatively forwarded ++ data can be used in a cache side channel attack, to ++ access data to which the attacker does not have direct ++ access. ++ ++ This parameter controls the TAA mitigation. The ++ options are: ++ ++ full - Enable TAA mitigation on vulnerable CPUs ++ if TSX is enabled. ++ ++ full,nosmt - Enable TAA mitigation and disable SMT on ++ vulnerable CPUs. If TSX is disabled, SMT ++ is not disabled because CPU is not ++ vulnerable to cross-thread TAA attacks. ++ off - Unconditionally disable TAA mitigation ++ ++ Not specifying this option is equivalent to ++ tsx_async_abort=full. On CPUs which are MDS affected ++ and deploy MDS mitigation, TAA mitigation is not ++ required and doesn't provide any additional ++ mitigation. ++ ++ For details see: ++ Documentation/hw-vuln/tsx_async_abort.rst ++ + turbografx.map[2|3]= [HW,JOY] + TurboGraFX parallel port interface + Format: +diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt +index e5dd9f4d6100..46ef3680c8ab 100644 +--- a/Documentation/virtual/kvm/locking.txt ++++ b/Documentation/virtual/kvm/locking.txt +@@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows: + - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything +-else is a leaf: no other lock is taken inside the critical sections. ++Everything else is a leaf: no other lock is taken inside the critical ++sections. + + 2: Exception + ------------ +@@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). + ------------ + + Name: kvm_lock +-Type: spinlock_t ++Type: mutex + Arch: any + Protects: - vm_list + +diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst +index ef389dcf1b1d..0780d55c5aa8 100644 +--- a/Documentation/x86/index.rst ++++ b/Documentation/x86/index.rst +@@ -6,3 +6,4 @@ x86 architecture specifics + :maxdepth: 1 + + mds ++ tsx_async_abort +diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst +new file mode 100644 +index 000000000000..4a4336a89372 +--- /dev/null ++++ b/Documentation/x86/tsx_async_abort.rst +@@ -0,0 +1,117 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++TSX Async Abort (TAA) mitigation ++================================ ++ ++.. _tsx_async_abort: ++ ++Overview ++-------- ++ ++TSX Async Abort (TAA) is a side channel attack on internal buffers in some ++Intel processors similar to Microachitectural Data Sampling (MDS). In this ++case certain loads may speculatively pass invalid data to dependent operations ++when an asynchronous abort condition is pending in a Transactional ++Synchronization Extensions (TSX) transaction. This includes loads with no ++fault or assist condition. Such loads may speculatively expose stale data from ++the same uarch data structures as in MDS, with same scope of exposure i.e. ++same-thread and cross-thread. This issue affects all current processors that ++support TSX. ++ ++Mitigation strategy ++------------------- ++ ++a) TSX disable - one of the mitigations is to disable TSX. A new MSR ++IA32_TSX_CTRL will be available in future and current processors after ++microcode update which can be used to disable TSX. In addition, it ++controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID. ++ ++b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this ++vulnerability. More details on this approach can be found in ++:ref:`Documentation/hw-vuln/mds.rst <mds>`. ++ ++Kernel internal mitigation modes ++-------------------------------- ++ ++ ============= ============================================================ ++ off Mitigation is disabled. Either the CPU is not affected or ++ tsx_async_abort=off is supplied on the kernel command line. ++ ++ tsx disabled Mitigation is enabled. TSX feature is disabled by default at ++ bootup on processors that support TSX control. ++ ++ verw Mitigation is enabled. CPU is affected and MD_CLEAR is ++ advertised in CPUID. ++ ++ ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not ++ advertised in CPUID. That is mainly for virtualization ++ scenarios where the host has the updated microcode but the ++ hypervisor does not expose MD_CLEAR in CPUID. It's a best ++ effort approach without guarantee. ++ ============= ============================================================ ++ ++If the CPU is affected and the "tsx_async_abort" kernel command line parameter is ++not provided then the kernel selects an appropriate mitigation depending on the ++status of RTM and MD_CLEAR CPUID bits. ++ ++Below tables indicate the impact of tsx=on|off|auto cmdline options on state of ++TAA mitigation, VERW behavior and TSX feature for various combinations of ++MSR_IA32_ARCH_CAPABILITIES bits. ++ ++1. "tsx=off" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Disabled Yes TSX disabled TSX disabled ++ 1 X 1 Disabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++2. "tsx=on" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Enabled Yes None Same as MDS ++ 1 X 1 Enabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++3. "tsx=auto" ++ ++========= ========= ============ ============ ============== =================== ====================== ++MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto ++---------------------------------- ------------------------------------------------------------------------- ++TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation ++ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full ++========= ========= ============ ============ ============== =================== ====================== ++ 0 0 0 HW default Yes Same as MDS Same as MDS ++ 0 0 1 Invalid case Invalid case Invalid case Invalid case ++ 0 1 0 HW default No Need ucode update Need ucode update ++ 0 1 1 Disabled Yes TSX disabled TSX disabled ++ 1 X 1 Enabled X None needed None needed ++========= ========= ============ ============ ============== =================== ====================== ++ ++In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that ++indicates whether MSR_IA32_TSX_CTRL is supported. ++ ++There are two control bits in IA32_TSX_CTRL MSR: ++ ++ Bit 0: When set it disables the Restricted Transactional Memory (RTM) ++ sub-feature of TSX (will force all transactions to abort on the ++ XBEGIN instruction). ++ ++ Bit 1: When set it disables the enumeration of the RTM and HLE feature ++ (i.e. it will make CPUID(EAX=7).EBX{bit4} and ++ CPUID(EAX=7).EBX{bit11} read as 0). +diff --git a/Makefile b/Makefile +index 4741bbdfaa10..1e322e669301 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 4 + PATCHLEVEL = 9 +-SUBLEVEL = 201 ++SUBLEVEL = 202 + EXTRAVERSION = + NAME = Roaring Lionus + +diff --git a/arch/mips/bcm63xx/reset.c b/arch/mips/bcm63xx/reset.c +index d1fe51edf5e6..4d411da2497b 100644 +--- a/arch/mips/bcm63xx/reset.c ++++ b/arch/mips/bcm63xx/reset.c +@@ -119,7 +119,7 @@ + #define BCM6368_RESET_DSL 0 + #define BCM6368_RESET_SAR SOFTRESET_6368_SAR_MASK + #define BCM6368_RESET_EPHY SOFTRESET_6368_EPHY_MASK +-#define BCM6368_RESET_ENETSW 0 ++#define BCM6368_RESET_ENETSW SOFTRESET_6368_ENETSW_MASK + #define BCM6368_RESET_PCM SOFTRESET_6368_PCM_MASK + #define BCM6368_RESET_MPI SOFTRESET_6368_MPI_MASK + #define BCM6368_RESET_PCIE 0 +diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c +index 3dc96b455e0c..37c254677ccd 100644 +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -1422,13 +1422,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); + if (!kvm->arch.sca) + goto out_err; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + sca_offset += 16; + if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE) + sca_offset = 0; + kvm->arch.sca = (struct bsca_block *) + ((char *) kvm->arch.sca + sca_offset); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + sprintf(debug_name, "kvm-%u", current->pid); + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index e0055b4302d6..1067f7668c4e 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1755,6 +1755,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS + + If unsure, say y. + ++choice ++ prompt "TSX enable mode" ++ depends on CPU_SUP_INTEL ++ default X86_INTEL_TSX_MODE_OFF ++ help ++ Intel's TSX (Transactional Synchronization Extensions) feature ++ allows to optimize locking protocols through lock elision which ++ can lead to a noticeable performance boost. ++ ++ On the other hand it has been shown that TSX can be exploited ++ to form side channel attacks (e.g. TAA) and chances are there ++ will be more of those attacks discovered in the future. ++ ++ Therefore TSX is not enabled by default (aka tsx=off). An admin ++ might override this decision by tsx=on the command line parameter. ++ Even with TSX enabled, the kernel will attempt to enable the best ++ possible TAA mitigation setting depending on the microcode available ++ for the particular machine. ++ ++ This option allows to set the default tsx mode between tsx=on, =off ++ and =auto. See Documentation/kernel-parameters.txt for more ++ details. ++ ++ Say off if not sure, auto if TSX is in use but it should be used on safe ++ platforms or on if TSX is in use and the security aspect of tsx is not ++ relevant. ++ ++config X86_INTEL_TSX_MODE_OFF ++ bool "off" ++ help ++ TSX is disabled if possible - equals to tsx=off command line parameter. ++ ++config X86_INTEL_TSX_MODE_ON ++ bool "on" ++ help ++ TSX is always enabled on TSX capable HW - equals the tsx=on command ++ line parameter. ++ ++config X86_INTEL_TSX_MODE_AUTO ++ bool "auto" ++ help ++ TSX is enabled on TSX capable HW that is believed to be safe against ++ side channel attacks- equals the tsx=auto command line parameter. ++endchoice ++ + config EFI + bool "EFI runtime service support" + depends on ACPI +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 3a972da155d6..ccc4420f051b 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -357,5 +357,7 @@ + #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ + #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ + #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ ++#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ ++#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 222cb69e1219..d2c14a96ec28 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -261,6 +261,7 @@ struct kvm_rmap_head { + struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; ++ struct list_head lpage_disallowed_link; + + /* + * The following two entries are used to key the shadow page in the +@@ -273,6 +274,7 @@ struct kvm_mmu_page { + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + bool unsync; ++ bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ +@@ -724,6 +726,7 @@ struct kvm_arch { + */ + struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; ++ struct list_head lpage_disallowed_mmu_pages; + struct kvm_page_track_notifier_node mmu_sp_tracker; + struct kvm_page_track_notifier_head track_notifier_head; + +@@ -798,6 +801,8 @@ struct kvm_arch { + + bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; ++ ++ struct task_struct *nx_lpage_recovery_thread; + }; + + struct kvm_vm_stat { +@@ -811,6 +816,7 @@ struct kvm_vm_stat { + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; ++ ulong nx_lpage_splits; + }; + + struct kvm_vcpu_stat { +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 86166868db8c..8d162e0f2881 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -77,6 +77,18 @@ + * Microarchitectural Data + * Sampling (MDS) vulnerabilities. + */ ++#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* ++ * The processor is not susceptible to a ++ * machine check error due to modifying the ++ * code page size along with either the ++ * physical address or cache type ++ * without TLB invalidation. ++ */ ++#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ ++#define ARCH_CAP_TAA_NO BIT(8) /* ++ * Not susceptible to ++ * TSX Async Abort (TAA) vulnerabilities. ++ */ + + #define MSR_IA32_FLUSH_CMD 0x0000010b + #define L1D_FLUSH BIT(0) /* +@@ -87,6 +99,10 @@ + #define MSR_IA32_BBL_CR_CTL 0x00000119 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e + ++#define MSR_IA32_TSX_CTRL 0x00000122 ++#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ ++#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ ++ + #define MSR_IA32_SYSENTER_CS 0x00000174 + #define MSR_IA32_SYSENTER_ESP 0x00000175 + #define MSR_IA32_SYSENTER_EIP 0x00000176 +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 10a48505abb5..8d56d701b5f7 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); + #include <asm/segment.h> + + /** +- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * This uses the otherwise unused and obsolete VERW instruction in + * combination with microcode which triggers a CPU buffer flush when the +@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void) + } + + /** +- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability ++ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability + * + * Clear CPU buffers if the corresponding static key is enabled + */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 155e49fc7010..92703fa09c19 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -880,4 +880,11 @@ enum mds_mitigations { + MDS_MITIGATION_VMWERV, + }; + ++enum taa_mitigations { ++ TAA_MITIGATION_OFF, ++ TAA_MITIGATION_UCODE_NEEDED, ++ TAA_MITIGATION_VERW, ++ TAA_MITIGATION_TSX_DISABLED, ++}; ++ + #endif /* _ASM_X86_PROCESSOR_H */ +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index 33b63670bf09..f6e386fe510c 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -25,7 +25,7 @@ obj-y += bugs.o + obj-$(CONFIG_PROC_FS) += proc.o + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o + +-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o ++obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o + obj-$(CONFIG_CPU_SUP_AMD) += amd.o + obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o + obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index 2a42fef275ad..827fc38df97a 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -31,11 +31,14 @@ + #include <asm/intel-family.h> + #include <asm/e820.h> + ++#include "cpu.h" ++ + static void __init spectre_v1_select_mitigation(void); + static void __init spectre_v2_select_mitigation(void); + static void __init ssb_select_mitigation(void); + static void __init l1tf_select_mitigation(void); + static void __init mds_select_mitigation(void); ++static void __init taa_select_mitigation(void); + + /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ + u64 x86_spec_ctrl_base; +@@ -102,6 +105,7 @@ void __init check_bugs(void) + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); ++ taa_select_mitigation(); + + arch_smt_update(); + +@@ -265,6 +269,100 @@ static int __init mds_cmdline(char *str) + } + early_param("mds", mds_cmdline); + ++#undef pr_fmt ++#define pr_fmt(fmt) "TAA: " fmt ++ ++/* Default mitigation for TAA-affected CPUs */ ++static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; ++static bool taa_nosmt __ro_after_init; ++ ++static const char * const taa_strings[] = { ++ [TAA_MITIGATION_OFF] = "Vulnerable", ++ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", ++ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", ++ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", ++}; ++ ++static void __init taa_select_mitigation(void) ++{ ++ u64 ia32_cap; ++ ++ if (!boot_cpu_has_bug(X86_BUG_TAA)) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ return; ++ } ++ ++ /* TSX previously disabled by tsx=off */ ++ if (!boot_cpu_has(X86_FEATURE_RTM)) { ++ taa_mitigation = TAA_MITIGATION_TSX_DISABLED; ++ goto out; ++ } ++ ++ if (cpu_mitigations_off()) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ return; ++ } ++ ++ /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ ++ if (taa_mitigation == TAA_MITIGATION_OFF) ++ goto out; ++ ++ if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) ++ taa_mitigation = TAA_MITIGATION_VERW; ++ else ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; ++ ++ /* ++ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. ++ * A microcode update fixes this behavior to clear CPU buffers. It also ++ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the ++ * ARCH_CAP_TSX_CTRL_MSR bit. ++ * ++ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode ++ * update is required. ++ */ ++ ia32_cap = x86_read_arch_cap_msr(); ++ if ( (ia32_cap & ARCH_CAP_MDS_NO) && ++ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) ++ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; ++ ++ /* ++ * TSX is enabled, select alternate mitigation for TAA which is ++ * the same as MDS. Enable MDS static branch to clear CPU buffers. ++ * ++ * For guests that can't determine whether the correct microcode is ++ * present on host, enable the mitigation for UCODE_NEEDED as well. ++ */ ++ static_branch_enable(&mds_user_clear); ++ ++ if (taa_nosmt || cpu_mitigations_auto_nosmt()) ++ cpu_smt_disable(false); ++ ++out: ++ pr_info("%s\n", taa_strings[taa_mitigation]); ++} ++ ++static int __init tsx_async_abort_parse_cmdline(char *str) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_TAA)) ++ return 0; ++ ++ if (!str) ++ return -EINVAL; ++ ++ if (!strcmp(str, "off")) { ++ taa_mitigation = TAA_MITIGATION_OFF; ++ } else if (!strcmp(str, "full")) { ++ taa_mitigation = TAA_MITIGATION_VERW; ++ } else if (!strcmp(str, "full,nosmt")) { ++ taa_mitigation = TAA_MITIGATION_VERW; ++ taa_nosmt = true; ++ } ++ ++ return 0; ++} ++early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); ++ + #undef pr_fmt + #define pr_fmt(fmt) "Spectre V1 : " fmt + +@@ -780,13 +878,10 @@ static void update_mds_branch_idle(void) + } + + #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" ++#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" + + void arch_smt_update(void) + { +- /* Enhanced IBRS implies STIBP. No update required. */ +- if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) +- return; +- + mutex_lock(&spec_ctrl_mutex); + + switch (spectre_v2_user) { +@@ -812,6 +907,17 @@ void arch_smt_update(void) + break; + } + ++ switch (taa_mitigation) { ++ case TAA_MITIGATION_VERW: ++ case TAA_MITIGATION_UCODE_NEEDED: ++ if (sched_smt_active()) ++ pr_warn_once(TAA_MSG_SMT); ++ break; ++ case TAA_MITIGATION_TSX_DISABLED: ++ case TAA_MITIGATION_OFF: ++ break; ++ } ++ + mutex_unlock(&spec_ctrl_mutex); + } + +@@ -1127,6 +1233,9 @@ void x86_spec_ctrl_setup_ap(void) + x86_amd_ssb_disable(); + } + ++bool itlb_multihit_kvm_mitigation; ++EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); ++ + #undef pr_fmt + #define pr_fmt(fmt) "L1TF: " fmt + +@@ -1282,11 +1391,24 @@ static ssize_t l1tf_show_state(char *buf) + l1tf_vmx_states[l1tf_vmx_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); + } ++ ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ if (itlb_multihit_kvm_mitigation) ++ return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); ++ else ++ return sprintf(buf, "KVM: Vulnerable\n"); ++} + #else + static ssize_t l1tf_show_state(char *buf) + { + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + } ++ ++static ssize_t itlb_multihit_show_state(char *buf) ++{ ++ return sprintf(buf, "Processor vulnerable\n"); ++} + #endif + + static ssize_t mds_show_state(char *buf) +@@ -1308,6 +1430,21 @@ static ssize_t mds_show_state(char *buf) + sched_smt_active() ? "vulnerable" : "disabled"); + } + ++static ssize_t tsx_async_abort_show_state(char *buf) ++{ ++ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || ++ (taa_mitigation == TAA_MITIGATION_OFF)) ++ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); ++ ++ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { ++ return sprintf(buf, "%s; SMT Host state unknown\n", ++ taa_strings[taa_mitigation]); ++ } ++ ++ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], ++ sched_smt_active() ? "vulnerable" : "disabled"); ++} ++ + static char *stibp_state(void) + { + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) +@@ -1373,6 +1510,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr + case X86_BUG_MDS: + return mds_show_state(buf); + ++ case X86_BUG_TAA: ++ return tsx_async_abort_show_state(buf); ++ ++ case X86_BUG_ITLB_MULTIHIT: ++ return itlb_multihit_show_state(buf); ++ + default: + break; + } +@@ -1409,4 +1552,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu + { + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); + } ++ ++ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_TAA); ++} ++ ++ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); ++} + #endif +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 12fa16051871..477df9782fdf 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -891,13 +891,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) + c->x86_cache_bits = c->x86_phys_bits; + } + +-#define NO_SPECULATION BIT(0) +-#define NO_MELTDOWN BIT(1) +-#define NO_SSB BIT(2) +-#define NO_L1TF BIT(3) +-#define NO_MDS BIT(4) +-#define MSBDS_ONLY BIT(5) +-#define NO_SWAPGS BIT(6) ++#define NO_SPECULATION BIT(0) ++#define NO_MELTDOWN BIT(1) ++#define NO_SSB BIT(2) ++#define NO_L1TF BIT(3) ++#define NO_MDS BIT(4) ++#define MSBDS_ONLY BIT(5) ++#define NO_SWAPGS BIT(6) ++#define NO_ITLB_MULTIHIT BIT(7) + + #define VULNWL(_vendor, _family, _model, _whitelist) \ + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +@@ -915,26 +916,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + + /* Intel Family 6 */ +- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), +- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), +- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), +- +- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), +- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), ++ ++ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(CORE_YONAH, NO_SSB), + +- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + +- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), +- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), ++ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* + * Technically, swapgs isn't serializing on AMD (despite it previously +@@ -945,13 +946,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { + */ + + /* AMD Family 0xf - 0x12 */ +- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), +- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), ++ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ +- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), ++ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + {} + }; + +@@ -962,19 +963,30 @@ static bool __init cpu_matches(unsigned long which) + return m && !!(m->driver_data & which); + } + +-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) ++u64 x86_read_arch_cap_msr(void) + { + u64 ia32_cap = 0; + ++ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); ++ ++ return ia32_cap; ++} ++ ++static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) ++{ ++ u64 ia32_cap = x86_read_arch_cap_msr(); ++ ++ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ ++ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) ++ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); ++ + if (cpu_matches(NO_SPECULATION)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + +- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) +- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); +- + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); +@@ -991,6 +1003,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) + if (!cpu_matches(NO_SWAPGS)) + setup_force_cpu_bug(X86_BUG_SWAPGS); + ++ /* ++ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: ++ * - TSX is supported or ++ * - TSX_CTRL is present ++ * ++ * TSX_CTRL check is needed for cases when TSX could be disabled before ++ * the kernel boot e.g. kexec. ++ * TSX_CTRL check alone is not sufficient for cases when the microcode ++ * update is not present or running as guest that don't get TSX_CTRL. ++ */ ++ if (!(ia32_cap & ARCH_CAP_TAA_NO) && ++ (cpu_has(c, X86_FEATURE_RTM) || ++ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) ++ setup_force_cpu_bug(X86_BUG_TAA); ++ + if (cpu_matches(NO_MELTDOWN)) + return; + +@@ -1409,6 +1436,8 @@ void __init identify_boot_cpu(void) + enable_sep_cpu(); + #endif + cpu_detect_tlb(&boot_cpu_data); ++ ++ tsx_init(); + } + + void identify_secondary_cpu(struct cpuinfo_x86 *c) +diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h +index 2275900d4d1b..4350f50b5deb 100644 +--- a/arch/x86/kernel/cpu/cpu.h ++++ b/arch/x86/kernel/cpu/cpu.h +@@ -44,6 +44,22 @@ struct _tlb_table { + extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; + ++#ifdef CONFIG_CPU_SUP_INTEL ++enum tsx_ctrl_states { ++ TSX_CTRL_ENABLE, ++ TSX_CTRL_DISABLE, ++ TSX_CTRL_NOT_SUPPORTED, ++}; ++ ++extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; ++ ++extern void __init tsx_init(void); ++extern void tsx_enable(void); ++extern void tsx_disable(void); ++#else ++static inline void tsx_init(void) { } ++#endif /* CONFIG_CPU_SUP_INTEL */ ++ + extern void get_cpu_cap(struct cpuinfo_x86 *c); + extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + extern int detect_extended_topology_early(struct cpuinfo_x86 *c); +@@ -51,4 +67,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); + + extern void x86_spec_ctrl_setup_ap(void); + ++extern u64 x86_read_arch_cap_msr(void); ++ + #endif /* ARCH_X86_CPU_H */ +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 860f2fd9f540..476a9d5c2f35 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -642,6 +642,11 @@ static void init_intel(struct cpuinfo_x86 *c) + detect_vmx_virtcap(c); + + init_intel_energy_perf(c); ++ ++ if (tsx_ctrl_state == TSX_CTRL_ENABLE) ++ tsx_enable(); ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) ++ tsx_disable(); + } + + #ifdef CONFIG_X86_32 +diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c +new file mode 100644 +index 000000000000..3e20d322bc98 +--- /dev/null ++++ b/arch/x86/kernel/cpu/tsx.c +@@ -0,0 +1,140 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel Transactional Synchronization Extensions (TSX) control. ++ * ++ * Copyright (C) 2019 Intel Corporation ++ * ++ * Author: ++ * Pawan Gupta <[email protected]> ++ */ ++ ++#include <linux/cpufeature.h> ++ ++#include <asm/cmdline.h> ++ ++#include "cpu.h" ++ ++enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; ++ ++void tsx_disable(void) ++{ ++ u64 tsx; ++ ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); ++ ++ /* Force all transactions to immediately abort */ ++ tsx |= TSX_CTRL_RTM_DISABLE; ++ ++ /* ++ * Ensure TSX support is not enumerated in CPUID. ++ * This is visible to userspace and will ensure they ++ * do not waste resources trying TSX transactions that ++ * will always abort. ++ */ ++ tsx |= TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); ++} ++ ++void tsx_enable(void) ++{ ++ u64 tsx; ++ ++ rdmsrl(MSR_IA32_TSX_CTRL, tsx); ++ ++ /* Enable the RTM feature in the cpu */ ++ tsx &= ~TSX_CTRL_RTM_DISABLE; ++ ++ /* ++ * Ensure TSX support is enumerated in CPUID. ++ * This is visible to userspace and will ensure they ++ * can enumerate and use the TSX feature. ++ */ ++ tsx &= ~TSX_CTRL_CPUID_CLEAR; ++ ++ wrmsrl(MSR_IA32_TSX_CTRL, tsx); ++} ++ ++static bool __init tsx_ctrl_is_supported(void) ++{ ++ u64 ia32_cap = x86_read_arch_cap_msr(); ++ ++ /* ++ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this ++ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. ++ * ++ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a ++ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES ++ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get ++ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, ++ * tsx= cmdline requests will do nothing on CPUs without ++ * MSR_IA32_TSX_CTRL support. ++ */ ++ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); ++} ++ ++static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) ++{ ++ if (boot_cpu_has_bug(X86_BUG_TAA)) ++ return TSX_CTRL_DISABLE; ++ ++ return TSX_CTRL_ENABLE; ++} ++ ++void __init tsx_init(void) ++{ ++ char arg[5] = {}; ++ int ret; ++ ++ if (!tsx_ctrl_is_supported()) ++ return; ++ ++ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); ++ if (ret >= 0) { ++ if (!strcmp(arg, "on")) { ++ tsx_ctrl_state = TSX_CTRL_ENABLE; ++ } else if (!strcmp(arg, "off")) { ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ } else if (!strcmp(arg, "auto")) { ++ tsx_ctrl_state = x86_get_tsx_auto_mode(); ++ } else { ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ pr_err("tsx: invalid option, defaulting to off\n"); ++ } ++ } else { ++ /* tsx= not provided */ ++ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) ++ tsx_ctrl_state = x86_get_tsx_auto_mode(); ++ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) ++ tsx_ctrl_state = TSX_CTRL_DISABLE; ++ else ++ tsx_ctrl_state = TSX_CTRL_ENABLE; ++ } ++ ++ if (tsx_ctrl_state == TSX_CTRL_DISABLE) { ++ tsx_disable(); ++ ++ /* ++ * tsx_disable() will change the state of the ++ * RTM CPUID bit. Clear it here since it is now ++ * expected to be not set. ++ */ ++ setup_clear_cpu_cap(X86_FEATURE_RTM); ++ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { ++ ++ /* ++ * HW defaults TSX to be enabled at bootup. ++ * We may still need the TSX enable support ++ * during init for special cases like ++ * kexec after TSX is disabled. ++ */ ++ tsx_enable(); ++ ++ /* ++ * tsx_enable() will change the state of the ++ * RTM CPUID bit. Force it here since it is now ++ * expected to be set. ++ */ ++ setup_force_cpu_cap(X86_FEATURE_RTM); ++ } ++} +diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c +index fc8236fd2495..18c5b4920e92 100644 +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -466,8 +466,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); ++ + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); ++ if (boot_cpu_has(X86_FEATURE_IBPB) && ++ boot_cpu_has(X86_FEATURE_IBRS)) ++ entry->edx |= F(SPEC_CTRL); ++ if (boot_cpu_has(X86_FEATURE_STIBP)) ++ entry->edx |= F(INTEL_STIBP); ++ if (boot_cpu_has(X86_FEATURE_SSBD)) ++ entry->edx |= F(SPEC_CTRL_SSBD); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index 676edfc19a95..f0f180158c26 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -37,6 +37,7 @@ + #include <linux/srcu.h> + #include <linux/slab.h> + #include <linux/uaccess.h> ++#include <linux/kthread.h> + + #include <asm/page.h> + #include <asm/cmpxchg.h> +@@ -44,6 +45,30 @@ + #include <asm/vmx.h> + #include <asm/kvm_page_track.h> + ++extern bool itlb_multihit_kvm_mitigation; ++ ++static int __read_mostly nx_huge_pages = -1; ++static uint __read_mostly nx_huge_pages_recovery_ratio = 60; ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); ++ ++static struct kernel_param_ops nx_huge_pages_ops = { ++ .set = set_nx_huge_pages, ++ .get = param_get_bool, ++}; ++ ++static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { ++ .set = set_nx_huge_pages_recovery_ratio, ++ .get = param_get_uint, ++}; ++ ++module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages, "bool"); ++module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, ++ &nx_huge_pages_recovery_ratio, 0644); ++__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); ++ + /* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: +@@ -131,9 +156,6 @@ module_param(dbg, bool, 0644); + + #include <trace/events/kvm.h> + +-#define CREATE_TRACE_POINTS +-#include "mmutrace.h" +- + #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +@@ -142,6 +164,20 @@ module_param(dbg, bool, 0644); + /* make pte_list_desc fit well in cache line */ + #define PTE_LIST_EXT 3 + ++/* ++ * Return values of handle_mmio_page_fault and mmu.page_fault: ++ * RET_PF_RETRY: let CPU fault again on the address. ++ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. ++ * ++ * For handle_mmio_page_fault only: ++ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. ++ */ ++enum { ++ RET_PF_RETRY = 0, ++ RET_PF_EMULATE = 1, ++ RET_PF_INVALID = 2, ++}; ++ + struct pte_list_desc { + u64 *sptes[PTE_LIST_EXT]; + struct pte_list_desc *more; +@@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask; + static u64 __read_mostly shadow_present_mask; + + static void mmu_spte_set(u64 *sptep, u64 spte); ++static bool is_executable_pte(u64 spte); + static void mmu_free_roots(struct kvm_vcpu *vcpu); + ++#define CREATE_TRACE_POINTS ++#include "mmutrace.h" ++ + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) + { + shadow_mmio_mask = mmio_mask; + } + EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + ++static bool is_nx_huge_page_enabled(void) ++{ ++ return READ_ONCE(nx_huge_pages); ++} ++ + /* + * the low bit of the generation number is always presumed to be zero. + * This disables mmio caching during memslot updates. The concept is +@@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level) + return 0; + } + ++static bool is_executable_pte(u64 spte) ++{ ++ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; ++} ++ + static kvm_pfn_t spte_to_pfn(u64 pte) + { + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +@@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) + + static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) + { +- if (sp->role.direct) +- BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); +- else ++ if (!sp->role.direct) { + sp->gfns[index] = gfn; ++ return; ++ } ++ ++ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) ++ pr_err_ratelimited("gfn mismatch under direct page %llx " ++ "(expected %llx, got %llx)\n", ++ sp->gfn, ++ kvm_mmu_page_get_gfn(sp, index), gfn); + } + + /* +@@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_disallow_lpage(slot, gfn); + } + ++static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ if (sp->lpage_disallowed) ++ return; ++ ++ ++kvm->stat.nx_lpage_splits; ++ list_add_tail(&sp->lpage_disallowed_link, ++ &kvm->arch.lpage_disallowed_mmu_pages); ++ sp->lpage_disallowed = true; ++} ++ + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + { + struct kvm_memslots *slots; +@@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) + kvm_mmu_gfn_allow_lpage(slot, gfn); + } + ++static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++{ ++ --kvm->stat.nx_lpage_splits; ++ sp->lpage_disallowed = false; ++ list_del(&sp->lpage_disallowed_link); ++} ++ + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) + { +@@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + kvm_reload_remote_mmus(kvm); + } + ++ if (sp->lpage_disallowed) ++ unaccount_huge_nx_page(kvm, sp); ++ + sp->role.invalid = 1; + return ret; + } +@@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + if (!speculative) + spte |= shadow_accessed_mask; + ++ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && ++ is_nx_huge_page_enabled()) { ++ pte_access &= ~ACC_EXEC_MASK; ++ } ++ + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else +@@ -2598,13 +2680,13 @@ done: + return ret; + } + +-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, +- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, +- bool speculative, bool host_writable) ++static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ++ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, ++ bool speculative, bool host_writable) + { + int was_rmapped = 0; + int rmap_count; +- bool emulate = false; ++ int ret = RET_PF_RETRY; + + pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, + *sptep, write_fault, gfn); +@@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, + true, host_writable)) { + if (write_fault) +- emulate = true; ++ ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + if (unlikely(is_mmio_spte(*sptep))) +- emulate = true; ++ ret = RET_PF_EMULATE; + + pgprintk("%s: setting spte %llx\n", __func__, *sptep); +- pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", +- is_large_pte(*sptep)? "2MB" : "4kB", +- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, +- *sptep, sptep); ++ trace_kvm_mmu_set_spte(level, gfn, sptep); + if (!was_rmapped && is_large_pte(*sptep)) + ++vcpu->kvm->stat.lpages; + +@@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + } + } + +- kvm_release_pfn_clean(pfn); +- +- return emulate; ++ return ret; + } + + static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, +@@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + if (ret <= 0) + return -1; + +- for (i = 0; i < ret; i++, gfn++, start++) ++ for (i = 0; i < ret; i++, gfn++, start++) { + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); ++ put_page(pages[i]); ++ } + + return 0; + } +@@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) + __direct_pte_prefetch(vcpu, sp, sptep); + } + +-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, +- int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) ++static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, ++ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) + { +- struct kvm_shadow_walk_iterator iterator; ++ int level = *levelp; ++ u64 spte = *it.sptep; ++ ++ if (it.level == level && level > PT_PAGE_TABLE_LEVEL && ++ is_nx_huge_page_enabled() && ++ is_shadow_present_pte(spte) && ++ !is_large_pte(spte)) { ++ /* ++ * A small SPTE exists for this pfn, but FNAME(fetch) ++ * and __direct_map would like to create a large PTE ++ * instead: just force them to go down another level, ++ * patching back for them into pfn the next 9 bits of ++ * the address. ++ */ ++ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); ++ *pfnp |= gfn & page_mask; ++ (*levelp)--; ++ } ++} ++ ++static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ++ int map_writable, int level, kvm_pfn_t pfn, ++ bool prefault, bool lpage_disallowed) ++{ ++ struct kvm_shadow_walk_iterator it; + struct kvm_mmu_page *sp; +- int emulate = 0; +- gfn_t pseudo_gfn; ++ int ret; ++ gfn_t gfn = gpa >> PAGE_SHIFT; ++ gfn_t base_gfn = gfn; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) +- return 0; ++ return RET_PF_RETRY; + +- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { +- if (iterator.level == level) { +- emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, +- write, level, gfn, pfn, prefault, +- map_writable); +- direct_pte_prefetch(vcpu, iterator.sptep); +- ++vcpu->stat.pf_fixed; +- break; +- } ++ trace_kvm_mmu_spte_requested(gpa, level, pfn); ++ for_each_shadow_entry(vcpu, gpa, it) { ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. ++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &level); + +- drop_large_spte(vcpu, iterator.sptep); +- if (!is_shadow_present_pte(*iterator.sptep)) { +- u64 base_addr = iterator.addr; ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == level) ++ break; + +- base_addr &= PT64_LVL_ADDR_MASK(iterator.level); +- pseudo_gfn = base_addr >> PAGE_SHIFT; +- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, +- iterator.level - 1, 1, ACC_ALL); ++ drop_large_spte(vcpu, it.sptep); ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, ++ it.level - 1, true, ACC_ALL); + +- link_shadow_page(vcpu, iterator.sptep, sp); ++ link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); + } + } +- return emulate; ++ ++ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, ++ write, level, base_gfn, pfn, prefault, ++ map_writable); ++ direct_pte_prefetch(vcpu, it.sptep); ++ ++vcpu->stat.pf_fixed; ++ return ret; + } + + static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +@@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) + * Do not cache the mmio info caused by writing the readonly gfn + * into the spte otherwise read access on readonly gfn also can + * caused mmio page fault and treat it as mmio access. +- * Return 1 to tell kvm to emulate it. + */ + if (pfn == KVM_PFN_ERR_RO_FAULT) +- return 1; ++ return RET_PF_EMULATE; + + if (pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); +- return 0; ++ return RET_PF_RETRY; + } + + return -EFAULT; + } + + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, +- gfn_t *gfnp, kvm_pfn_t *pfnp, ++ gfn_t gfn, kvm_pfn_t *pfnp, + int *levelp) + { + kvm_pfn_t pfn = *pfnp; +- gfn_t gfn = *gfnp; + int level = *levelp; + + /* +@@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { +- gfn &= ~mask; +- *gfnp = gfn; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); +@@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + { + int r; + int level; +- bool force_pt_level = false; ++ bool force_pt_level; + kvm_pfn_t pfn; + unsigned long mmu_seq; + bool map_writable, write = error_code & PFERR_WRITE_MASK; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + ++ force_pt_level = lpage_disallowed; + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + /* +@@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + } + + if (fast_page_fault(vcpu, v, level, error_code)) +- return 0; ++ return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) +- return 0; ++ return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + make_mmu_pages_available(vcpu); + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; +- ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); ++ r = __direct_map(vcpu, v, write, map_writable, level, pfn, ++ prefault, false); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return 0; ++ return r; + } + + +@@ -3383,38 +3490,38 @@ exit: + return reserved; + } + +-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) ++static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) + { + u64 spte; + bool reserved; + + if (mmio_info_in_cache(vcpu, addr, direct)) +- return RET_MMIO_PF_EMULATE; ++ return RET_PF_EMULATE; + + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (WARN_ON(reserved)) +- return RET_MMIO_PF_BUG; ++ return -EINVAL; + + if (is_mmio_spte(spte)) { + gfn_t gfn = get_mmio_spte_gfn(spte); + unsigned access = get_mmio_spte_access(spte); + + if (!check_mmio_spte(vcpu, spte)) +- return RET_MMIO_PF_INVALID; ++ return RET_PF_INVALID; + + if (direct) + addr = 0; + + trace_handle_mmio_page_fault(addr, gfn, access); + vcpu_cache_mmio_info(vcpu, addr, gfn, access); +- return RET_MMIO_PF_EMULATE; ++ return RET_PF_EMULATE; + } + + /* + * If the page table is zapped by other cpus, let CPU fault again on + * the address. + */ +- return RET_MMIO_PF_RETRY; ++ return RET_PF_RETRY; + } + EXPORT_SYMBOL_GPL(handle_mmio_page_fault); + +@@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + + if (page_fault_handle_page_track(vcpu, error_code, gfn)) +- return 1; ++ return RET_PF_EMULATE; + + r = mmu_topup_memory_caches(vcpu); + if (r) +@@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); + + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (page_fault_handle_page_track(vcpu, error_code, gfn)) +- return 1; ++ return RET_PF_EMULATE; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + +- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, +- PT_DIRECTORY_LEVEL); ++ force_pt_level = ++ lpage_disallowed || ++ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + if (level > PT_DIRECTORY_LEVEL && +@@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + } + + if (fast_page_fault(vcpu, gpa, level, error_code)) +- return 0; ++ return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) +- return 0; ++ return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) + return r; + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + make_mmu_pages_available(vcpu); + if (likely(!force_pt_level)) +- transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); +- r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; +- ++ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); ++ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, ++ prefault, lpage_disallowed); + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return 0; ++ return r; + } + + static void nonpaging_init_context(struct kvm_vcpu *vcpu, +@@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, + enum emulation_result er; + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + ++ r = RET_PF_INVALID; + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, cr2, direct); +- if (r == RET_MMIO_PF_EMULATE) { ++ if (r == RET_PF_EMULATE) { + emulation_type = 0; + goto emulate; + } +- if (r == RET_MMIO_PF_RETRY) +- return 1; +- if (r < 0) +- return r; + } + +- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); ++ if (r == RET_PF_INVALID) { ++ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); ++ WARN_ON(r == RET_PF_INVALID); ++ } ++ ++ if (r == RET_PF_RETRY) ++ return 1; + if (r < 0) + return r; +- if (!r) +- return 1; + + if (mmio_info_in_cache(vcpu, cr2, direct)) + emulation_type = 0; +@@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) + int nr_to_scan = sc->nr_to_scan; + unsigned long freed = 0; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + int idx; +@@ -5015,7 +5124,7 @@ unlock: + break; + } + +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return freed; + } + +@@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void) + kmem_cache_destroy(mmu_page_header_cache); + } + ++static bool get_nx_auto_mode(void) ++{ ++ /* Return true when CPU has the bug, and mitigations are ON */ ++ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); ++} ++ ++static void __set_nx_huge_pages(bool val) ++{ ++ nx_huge_pages = itlb_multihit_kvm_mitigation = val; ++} ++ ++static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) ++{ ++ bool old_val = nx_huge_pages; ++ bool new_val; ++ ++ /* In "auto" mode deploy workaround only if CPU has the bug. */ ++ if (sysfs_streq(val, "off")) ++ new_val = 0; ++ else if (sysfs_streq(val, "force")) ++ new_val = 1; ++ else if (sysfs_streq(val, "auto")) ++ new_val = get_nx_auto_mode(); ++ else if (strtobool(val, &new_val) < 0) ++ return -EINVAL; ++ ++ __set_nx_huge_pages(new_val); ++ ++ if (new_val != old_val) { ++ struct kvm *kvm; ++ int idx; ++ ++ mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) { ++ idx = srcu_read_lock(&kvm->srcu); ++ kvm_mmu_invalidate_zap_all_pages(kvm); ++ srcu_read_unlock(&kvm->srcu, idx); ++ ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); ++ } ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return 0; ++} ++ + int kvm_mmu_module_init(void) + { ++ if (nx_huge_pages == -1) ++ __set_nx_huge_pages(get_nx_auto_mode()); ++ + pte_list_desc_cache = kmem_cache_create("pte_list_desc", + sizeof(struct pte_list_desc), + 0, SLAB_ACCOUNT, NULL); +@@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void) + unregister_shrinker(&mmu_shrinker); + mmu_audit_disable(); + } ++ ++static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) ++{ ++ unsigned int old_val; ++ int err; ++ ++ old_val = nx_huge_pages_recovery_ratio; ++ err = param_set_uint(val, kp); ++ if (err) ++ return err; ++ ++ if (READ_ONCE(nx_huge_pages) && ++ !old_val && nx_huge_pages_recovery_ratio) { ++ struct kvm *kvm; ++ ++ mutex_lock(&kvm_lock); ++ ++ list_for_each_entry(kvm, &vm_list, vm_list) ++ wake_up_process(kvm->arch.nx_lpage_recovery_thread); ++ ++ mutex_unlock(&kvm_lock); ++ } ++ ++ return err; ++} ++ ++static void kvm_recover_nx_lpages(struct kvm *kvm) ++{ ++ int rcu_idx; ++ struct kvm_mmu_page *sp; ++ unsigned int ratio; ++ LIST_HEAD(invalid_list); ++ ulong to_zap; ++ ++ rcu_idx = srcu_read_lock(&kvm->srcu); ++ spin_lock(&kvm->mmu_lock); ++ ++ ratio = READ_ONCE(nx_huge_pages_recovery_ratio); ++ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; ++ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { ++ /* ++ * We use a separate list instead of just using active_mmu_pages ++ * because the number of lpage_disallowed pages is expected to ++ * be relatively small compared to the total. ++ */ ++ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, ++ struct kvm_mmu_page, ++ lpage_disallowed_link); ++ WARN_ON_ONCE(!sp->lpage_disallowed); ++ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); ++ WARN_ON_ONCE(sp->lpage_disallowed); ++ ++ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { ++ kvm_mmu_commit_zap_page(kvm, &invalid_list); ++ if (to_zap) ++ cond_resched_lock(&kvm->mmu_lock); ++ } ++ } ++ ++ spin_unlock(&kvm->mmu_lock); ++ srcu_read_unlock(&kvm->srcu, rcu_idx); ++} ++ ++static long get_nx_lpage_recovery_timeout(u64 start_time) ++{ ++ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) ++ ? start_time + 60 * HZ - get_jiffies_64() ++ : MAX_SCHEDULE_TIMEOUT; ++} ++ ++static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) ++{ ++ u64 start_time; ++ long remaining_time; ++ ++ while (true) { ++ start_time = get_jiffies_64(); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ while (!kthread_should_stop() && remaining_time > 0) { ++ schedule_timeout(remaining_time); ++ remaining_time = get_nx_lpage_recovery_timeout(start_time); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ set_current_state(TASK_RUNNING); ++ ++ if (kthread_should_stop()) ++ return 0; ++ ++ kvm_recover_nx_lpages(kvm); ++ } ++} ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm) ++{ ++ int err; ++ ++ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, ++ "kvm-nx-lpage-recovery", ++ &kvm->arch.nx_lpage_recovery_thread); ++ if (!err) ++ kthread_unpark(kvm->arch.nx_lpage_recovery_thread); ++ ++ return err; ++} ++ ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm) ++{ ++ if (kvm->arch.nx_lpage_recovery_thread) ++ kthread_stop(kvm->arch.nx_lpage_recovery_thread); ++} +diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h +index c92834c55c59..e584689e7d46 100644 +--- a/arch/x86/kvm/mmu.h ++++ b/arch/x86/kvm/mmu.h +@@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); + void + reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + +-/* +- * Return values of handle_mmio_page_fault: +- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction +- * directly. +- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page +- * fault path update the mmio spte. +- * RET_MMIO_PF_RETRY: let CPU fault again on the address. +- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). +- */ +-enum { +- RET_MMIO_PF_EMULATE = 1, +- RET_MMIO_PF_INVALID = 2, +- RET_MMIO_PF_RETRY = 0, +- RET_MMIO_PF_BUG = -1 +-}; +- +-int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); + void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); + bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); +@@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); + void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); ++ ++int kvm_mmu_post_init_vm(struct kvm *kvm); ++void kvm_mmu_pre_destroy_vm(struct kvm *kvm); ++ + #endif +diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h +index 5a24b846a1cb..756b14ecc957 100644 +--- a/arch/x86/kvm/mmutrace.h ++++ b/arch/x86/kvm/mmutrace.h +@@ -322,6 +322,65 @@ TRACE_EVENT( + __entry->kvm_gen == __entry->spte_gen + ) + ); ++ ++TRACE_EVENT( ++ kvm_mmu_set_spte, ++ TP_PROTO(int level, gfn_t gfn, u64 *sptep), ++ TP_ARGS(level, gfn, sptep), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, spte) ++ __field(u64, sptep) ++ __field(u8, level) ++ /* These depend on page entry type, so compute them now. */ ++ __field(bool, r) ++ __field(bool, x) ++ __field(u8, u) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = gfn; ++ __entry->spte = *sptep; ++ __entry->sptep = virt_to_phys(sptep); ++ __entry->level = level; ++ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); ++ __entry->x = is_executable_pte(__entry->spte); ++ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; ++ ), ++ ++ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", ++ __entry->gfn, __entry->spte, ++ __entry->r ? "r" : "-", ++ __entry->spte & PT_PRESENT_MASK ? "w" : "-", ++ __entry->x ? "x" : "-", ++ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), ++ __entry->level, __entry->sptep ++ ) ++); ++ ++TRACE_EVENT( ++ kvm_mmu_spte_requested, ++ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), ++ TP_ARGS(addr, level, pfn), ++ ++ TP_STRUCT__entry( ++ __field(u64, gfn) ++ __field(u64, pfn) ++ __field(u8, level) ++ ), ++ ++ TP_fast_assign( ++ __entry->gfn = addr >> PAGE_SHIFT; ++ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); ++ __entry->level = level; ++ ), ++ ++ TP_printk("gfn %llx pfn %llx level %d", ++ __entry->gfn, __entry->pfn, __entry->level ++ ) ++); ++ + #endif /* _TRACE_KVMMMU_H */ + + #undef TRACE_INCLUDE_PATH +diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h +index 37363900297d..e03225e707b2 100644 +--- a/arch/x86/kvm/paging_tmpl.h ++++ b/arch/x86/kvm/paging_tmpl.h +@@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + true, true); + ++ kvm_release_pfn_clean(pfn); + return true; + } + +@@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *gw, + int write_fault, int hlevel, +- kvm_pfn_t pfn, bool map_writable, bool prefault) ++ kvm_pfn_t pfn, bool map_writable, bool prefault, ++ bool lpage_disallowed) + { + struct kvm_mmu_page *sp = NULL; + struct kvm_shadow_walk_iterator it; + unsigned direct_access, access = gw->pt_access; +- int top_level, emulate; ++ int top_level, ret; ++ gfn_t gfn, base_gfn; + + direct_access = gw->pte_access; + +@@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + link_shadow_page(vcpu, it.sptep, sp); + } + +- for (; +- shadow_walk_okay(&it) && it.level > hlevel; +- shadow_walk_next(&it)) { +- gfn_t direct_gfn; ++ /* ++ * FNAME(page_fault) might have clobbered the bottom bits of ++ * gw->gfn, restore them from the virtual address. ++ */ ++ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); ++ base_gfn = gfn; + ++ trace_kvm_mmu_spte_requested(addr, gw->level, pfn); ++ ++ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { + clear_sp_write_flooding_count(it.sptep); +- validate_direct_spte(vcpu, it.sptep, direct_access); + +- drop_large_spte(vcpu, it.sptep); ++ /* ++ * We cannot overwrite existing page tables with an NX ++ * large page, as the leaf could be executable. ++ */ ++ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + +- if (is_shadow_present_pte(*it.sptep)) +- continue; ++ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ if (it.level == hlevel) ++ break; + +- direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); ++ validate_direct_spte(vcpu, it.sptep, direct_access); + +- sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, +- true, direct_access); +- link_shadow_page(vcpu, it.sptep, sp); ++ drop_large_spte(vcpu, it.sptep); ++ ++ if (!is_shadow_present_pte(*it.sptep)) { ++ sp = kvm_mmu_get_page(vcpu, base_gfn, addr, ++ it.level - 1, true, direct_access); ++ link_shadow_page(vcpu, it.sptep, sp); ++ if (lpage_disallowed) ++ account_huge_nx_page(vcpu->kvm, sp); ++ } + } + +- clear_sp_write_flooding_count(it.sptep); +- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, +- it.level, gw->gfn, pfn, prefault, map_writable); ++ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, ++ it.level, base_gfn, pfn, prefault, map_writable); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); +- +- return emulate; ++ ++vcpu->stat.pf_fixed; ++ return ret; + + out_gpte_changed: +- kvm_release_pfn_clean(pfn); +- return 0; ++ return RET_PF_RETRY; + } + + /* +@@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + int r; + kvm_pfn_t pfn; + int level = PT_PAGE_TABLE_LEVEL; +- bool force_pt_level = false; + unsigned long mmu_seq; + bool map_writable, is_self_change_mapping; ++ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && ++ is_nx_huge_page_enabled(); ++ bool force_pt_level = lpage_disallowed; + + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + +@@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + if (!prefault) + inject_page_fault(vcpu, &walker.fault); + +- return 0; ++ return RET_PF_RETRY; + } + + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { + shadow_page_table_clear_flood(vcpu, addr); +- return 1; ++ return RET_PF_EMULATE; + } + + vcpu->arch.write_fault_to_shadow_pgtable = false; +@@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + + if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, + &map_writable)) +- return 0; ++ return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, + walker.gfn, pfn, walker.pte_access, &r)) +@@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + walker.pte_access &= ~ACC_EXEC_MASK; + } + ++ r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; +@@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, + kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); + make_mmu_pages_available(vcpu); + if (!force_pt_level) +- transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); ++ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, +- level, pfn, map_writable, prefault); +- ++vcpu->stat.pf_fixed; ++ level, pfn, map_writable, prefault, lpage_disallowed); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); +- spin_unlock(&vcpu->kvm->mmu_lock); +- +- return r; + + out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); +- return 0; ++ return r; + } + + static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index f7a7b98b3271..1079228e4fef 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -590,8 +590,14 @@ static int get_npt_level(void) + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) + { + vcpu->arch.efer = efer; +- if (!npt_enabled && !(efer & EFER_LMA)) +- efer &= ~EFER_LME; ++ ++ if (!npt_enabled) { ++ /* Shadow paging assumes NX to be available. */ ++ efer |= EFER_NX; ++ ++ if (!(efer & EFER_LMA)) ++ efer &= ~EFER_LME; ++ } + + to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 6b66d1f0d185..4c0d6d0d6337 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2219,17 +2219,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; + +- if (!enable_ept) { +- /* +- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing +- * host CPUID is more efficient than testing guest CPUID +- * or CR4. Host SMEP is anyway a requirement for guest SMEP. +- */ +- if (boot_cpu_has(X86_FEATURE_SMEP)) +- guest_efer |= EFER_NX; +- else if (!(guest_efer & EFER_NX)) +- ignore_bits |= EFER_NX; +- } ++ /* Shadow paging assumes NX to be available. */ ++ if (!enable_ept) ++ guest_efer |= EFER_NX; + + /* + * LMA and LME handled by hardware; SCE meaningless outside long mode. +@@ -6556,16 +6548,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) + NULL, 0) == EMULATE_DONE; + } + +- ret = handle_mmio_page_fault(vcpu, gpa, true); +- if (likely(ret == RET_MMIO_PF_EMULATE)) +- return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == +- EMULATE_DONE; +- +- if (unlikely(ret == RET_MMIO_PF_INVALID)) +- return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); +- +- if (unlikely(ret == RET_MMIO_PF_RETRY)) +- return 1; ++ ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); ++ if (ret >= 0) ++ return ret; + + /* It is the real ept misconfig */ + WARN_ON(1); +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 0b6517f5821b..06cd710e1d45 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -191,6 +191,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { + { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { "largepages", VM_STAT(lpages) }, ++ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) }, + { NULL } + }; + +@@ -587,7 +588,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) + gfn_t gfn; + int r; + +- if (is_long_mode(vcpu) || !is_pae(vcpu)) ++ if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) + return false; + + if (!test_bit(VCPU_EXREG_PDPTR, +@@ -1031,6 +1032,14 @@ u64 kvm_get_arch_capabilities(void) + + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + ++ /* ++ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that ++ * the nested hypervisor runs with NX huge pages. If it is not, ++ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other ++ * L1 guests, so it need not worry about its own (L2) guests. ++ */ ++ data |= ARCH_CAP_PSCHANGE_MC_NO; ++ + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. +@@ -1043,8 +1052,35 @@ u64 kvm_get_arch_capabilities(void) + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) ++ data |= ARCH_CAP_RDCL_NO; ++ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) ++ data |= ARCH_CAP_SSB_NO; ++ if (!boot_cpu_has_bug(X86_BUG_MDS)) ++ data |= ARCH_CAP_MDS_NO; ++ ++ /* ++ * On TAA affected systems, export MDS_NO=0 when: ++ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. ++ * - Updated microcode is present. This is detected by ++ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures ++ * that VERW clears CPU buffers. ++ * ++ * When MDS_NO=0 is exported, guests deploy clear CPU buffer ++ * mitigation and don't complain: ++ * ++ * "Vulnerable: Clear CPU buffers attempted, no microcode" ++ * ++ * If TSX is disabled on the system, guests are also mitigated against ++ * TAA and clear CPU buffer mitigation is not required for guests. ++ */ ++ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && ++ (data & ARCH_CAP_TSX_CTRL_MSR)) ++ data &= ~ARCH_CAP_MDS_NO; ++ + return data; + } ++ + EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); + + static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +@@ -5951,17 +5987,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->cpu != freq->cpu) + continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); +- if (vcpu->cpu != smp_processor_id()) ++ if (vcpu->cpu != raw_smp_processor_id()) + send_ipi = 1; + } + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + if (freq->old < freq->new && send_ipi) { + /* +@@ -6099,12 +6135,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) + struct kvm_vcpu *vcpu; + int i; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + atomic_set(&kvm_guest_has_master_clock, 0); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + } + + static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); +@@ -7491,7 +7527,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + kvm_update_cpuid(vcpu); + + idx = srcu_read_lock(&vcpu->kvm->srcu); +- if (!is_long_mode(vcpu) && is_pae(vcpu)) { ++ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + mmu_reset_needed = 1; + } +@@ -8072,6 +8108,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); ++ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); + +@@ -8100,6 +8137,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) + return 0; + } + ++int kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return kvm_mmu_post_init_vm(kvm); ++} ++ + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) + { + int r; +@@ -8206,6 +8248,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) + } + EXPORT_SYMBOL_GPL(x86_set_memory_region); + ++void kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++ kvm_mmu_pre_destroy_vm(kvm); ++} ++ + void kvm_arch_destroy_vm(struct kvm *kvm) + { + if (current->mm == kvm->mm) { +diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c +index 3b123735a1c4..677c5f36674b 100644 +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -537,12 +537,27 @@ ssize_t __weak cpu_show_mds(struct device *dev, + return sprintf(buf, "Not affected\n"); + } + ++ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ ++ssize_t __weak cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); + static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); + static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); + static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); + static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); ++static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); ++static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); + + static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, +@@ -551,6 +566,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_spec_store_bypass.attr, + &dev_attr_l1tf.attr, + &dev_attr_mds.attr, ++ &dev_attr_tsx_async_abort.attr, ++ &dev_attr_itlb_multihit.attr, + NULL + }; + +diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c +index a2f6953a86f5..0a21fb86fd67 100644 +--- a/drivers/bluetooth/hci_ldisc.c ++++ b/drivers/bluetooth/hci_ldisc.c +@@ -653,15 +653,14 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id) + return err; + + hu->proto = p; +- set_bit(HCI_UART_PROTO_READY, &hu->flags); + + err = hci_uart_register_dev(hu); + if (err) { +- clear_bit(HCI_UART_PROTO_READY, &hu->flags); + p->close(hu); + return err; + } + ++ set_bit(HCI_UART_PROTO_READY, &hu->flags); + return 0; + } + +diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c +index 95e28ecfde0a..99c7cf4822c3 100644 +--- a/drivers/usb/gadget/udc/core.c ++++ b/drivers/usb/gadget/udc/core.c +@@ -817,6 +817,8 @@ int usb_gadget_map_request_by_dev(struct device *dev, + dev_err(dev, "failed to map buffer\n"); + return -EFAULT; + } ++ ++ req->dma_mapped = 1; + } + + return 0; +@@ -841,9 +843,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev, + is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + req->num_mapped_sgs = 0; +- } else { ++ } else if (req->dma_mapped) { + dma_unmap_single(dev, req->dma, req->length, + is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); ++ req->dma_mapped = 0; + } + } + EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); +diff --git a/include/linux/cpu.h b/include/linux/cpu.h +index b27c9b2e683f..e19bbc38a722 100644 +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -56,6 +56,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev, + struct device_attribute *attr, char *buf); + extern ssize_t cpu_show_mds(struct device *dev, + struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_tsx_async_abort(struct device *dev, ++ struct device_attribute *attr, ++ char *buf); ++extern ssize_t cpu_show_itlb_multihit(struct device *dev, ++ struct device_attribute *attr, char *buf); + + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, +@@ -282,28 +287,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } + static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } + #endif + +-/* +- * These are used for a global "mitigations=" cmdline option for toggling +- * optional CPU mitigations. +- */ +-enum cpu_mitigations { +- CPU_MITIGATIONS_OFF, +- CPU_MITIGATIONS_AUTO, +- CPU_MITIGATIONS_AUTO_NOSMT, +-}; +- +-extern enum cpu_mitigations cpu_mitigations; +- +-/* mitigations=off */ +-static inline bool cpu_mitigations_off(void) +-{ +- return cpu_mitigations == CPU_MITIGATIONS_OFF; +-} +- +-/* mitigations=auto,nosmt */ +-static inline bool cpu_mitigations_auto_nosmt(void) +-{ +- return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +-} ++extern bool cpu_mitigations_off(void); ++extern bool cpu_mitigations_auto_nosmt(void); + + #endif /* _LINUX_CPU_H_ */ +diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h +index eb55374b73f3..0590e7d47b02 100644 +--- a/include/linux/kvm_host.h ++++ b/include/linux/kvm_host.h +@@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page) + + extern struct kmem_cache *kvm_vcpu_cache; + +-extern spinlock_t kvm_lock; ++extern struct mutex kvm_lock; + extern struct list_head vm_list; + + struct kvm_io_range { +@@ -1208,4 +1208,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) + } + #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ + ++typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr); ++ + #endif +diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h +index e4516e9ded0f..4b810bc7ae63 100644 +--- a/include/linux/usb/gadget.h ++++ b/include/linux/usb/gadget.h +@@ -48,6 +48,7 @@ struct usb_ep; + * by adding a zero length packet as needed; + * @short_not_ok: When reading data, makes short packets be + * treated as errors (queue stops advancing till cleanup). ++ * @dma_mapped: Indicates if request has been mapped to DMA (internal) + * @complete: Function called when request completes, so this request and + * its buffer may be re-used. The function will always be called with + * interrupts disabled, and it must not sleep. +@@ -103,6 +104,7 @@ struct usb_request { + unsigned no_interrupt:1; + unsigned zero:1; + unsigned short_not_ok:1; ++ unsigned dma_mapped:1; + + void (*complete)(struct usb_ep *ep, + struct usb_request *req); +diff --git a/kernel/cpu.c b/kernel/cpu.c +index c947bb35b89f..0ed3e9deda30 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -2235,7 +2235,18 @@ void __init boot_cpu_hotplug_init(void) + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); + } + +-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; ++/* ++ * These are used for a global "mitigations=" cmdline option for toggling ++ * optional CPU mitigations. ++ */ ++enum cpu_mitigations { ++ CPU_MITIGATIONS_OFF, ++ CPU_MITIGATIONS_AUTO, ++ CPU_MITIGATIONS_AUTO_NOSMT, ++}; ++ ++static enum cpu_mitigations cpu_mitigations __ro_after_init = ++ CPU_MITIGATIONS_AUTO; + + static int __init mitigations_parse_cmdline(char *arg) + { +@@ -2252,3 +2263,17 @@ static int __init mitigations_parse_cmdline(char *arg) + return 0; + } + early_param("mitigations", mitigations_parse_cmdline); ++ ++/* mitigations=off */ ++bool cpu_mitigations_off(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_OFF; ++} ++EXPORT_SYMBOL_GPL(cpu_mitigations_off); ++ ++/* mitigations=auto,nosmt */ ++bool cpu_mitigations_auto_nosmt(void) ++{ ++ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; ++} ++EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); +diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c +index c72586a094ed..0fc93519e63e 100644 +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -49,6 +49,7 @@ + #include <linux/slab.h> + #include <linux/sort.h> + #include <linux/bsearch.h> ++#include <linux/kthread.h> + + #include <asm/processor.h> + #include <asm/io.h> +@@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR); + * kvm->lock --> kvm->slots_lock --> kvm->irq_lock + */ + +-DEFINE_SPINLOCK(kvm_lock); ++DEFINE_MUTEX(kvm_lock); + static DEFINE_RAW_SPINLOCK(kvm_count_lock); + LIST_HEAD(vm_list); + +@@ -612,6 +613,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) + return 0; + } + ++/* ++ * Called after the VM is otherwise initialized, but just before adding it to ++ * the vm_list. ++ */ ++int __weak kvm_arch_post_init_vm(struct kvm *kvm) ++{ ++ return 0; ++} ++ ++/* ++ * Called just after removing the VM from the vm_list, but before doing any ++ * other destruction. ++ */ ++void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) ++{ ++} ++ + static struct kvm *kvm_create_vm(unsigned long type) + { + int r, i; +@@ -659,22 +677,31 @@ static struct kvm *kvm_create_vm(unsigned long type) + kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), + GFP_KERNEL); + if (!kvm->buses[i]) +- goto out_err; ++ goto out_err_no_mmu_notifier; + } + + r = kvm_init_mmu_notifier(kvm); ++ if (r) ++ goto out_err_no_mmu_notifier; ++ ++ r = kvm_arch_post_init_vm(kvm); + if (r) + goto out_err; + +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + + preempt_notifier_inc(); + + return kvm; + + out_err: ++#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) ++ if (kvm->mmu_notifier.ops) ++ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); ++#endif ++out_err_no_mmu_notifier: + cleanup_srcu_struct(&kvm->irq_srcu); + out_err_no_irq_srcu: + cleanup_srcu_struct(&kvm->srcu); +@@ -724,9 +751,11 @@ static void kvm_destroy_vm(struct kvm *kvm) + + kvm_destroy_vm_debugfs(kvm); + kvm_arch_sync_events(kvm); +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_del(&kvm->vm_list); +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); ++ kvm_arch_pre_destroy_vm(kvm); ++ + kvm_free_irq_routing(kvm); + for (i = 0; i < KVM_NR_BUSES; i++) { + if (kvm->buses[i]) +@@ -3752,13 +3781,13 @@ static int vm_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3772,13 +3801,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) + u64 tmp_val; + + *val = 0; +- spin_lock(&kvm_lock); ++ mutex_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) { + stat_tmp.kvm = kvm; + vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); + *val += tmp_val; + } +- spin_unlock(&kvm_lock); ++ mutex_unlock(&kvm_lock); + return 0; + } + +@@ -3987,3 +4016,86 @@ void kvm_exit(void) + kvm_vfio_ops_exit(); + } + EXPORT_SYMBOL_GPL(kvm_exit); ++ ++struct kvm_vm_worker_thread_context { ++ struct kvm *kvm; ++ struct task_struct *parent; ++ struct completion init_done; ++ kvm_vm_thread_fn_t thread_fn; ++ uintptr_t data; ++ int err; ++}; ++ ++static int kvm_vm_worker_thread(void *context) ++{ ++ /* ++ * The init_context is allocated on the stack of the parent thread, so ++ * we have to locally copy anything that is needed beyond initialization ++ */ ++ struct kvm_vm_worker_thread_context *init_context = context; ++ struct kvm *kvm = init_context->kvm; ++ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; ++ uintptr_t data = init_context->data; ++ int err; ++ ++ err = kthread_park(current); ++ /* kthread_park(current) is never supposed to return an error */ ++ WARN_ON(err != 0); ++ if (err) ++ goto init_complete; ++ ++ err = cgroup_attach_task_all(init_context->parent, current); ++ if (err) { ++ kvm_err("%s: cgroup_attach_task_all failed with err %d\n", ++ __func__, err); ++ goto init_complete; ++ } ++ ++ set_user_nice(current, task_nice(init_context->parent)); ++ ++init_complete: ++ init_context->err = err; ++ complete(&init_context->init_done); ++ init_context = NULL; ++ ++ if (err) ++ return err; ++ ++ /* Wait to be woken up by the spawner before proceeding. */ ++ kthread_parkme(); ++ ++ if (!kthread_should_stop()) ++ err = thread_fn(kvm, data); ++ ++ return err; ++} ++ ++int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, ++ uintptr_t data, const char *name, ++ struct task_struct **thread_ptr) ++{ ++ struct kvm_vm_worker_thread_context init_context = {}; ++ struct task_struct *thread; ++ ++ *thread_ptr = NULL; ++ init_context.kvm = kvm; ++ init_context.parent = current; ++ init_context.thread_fn = thread_fn; ++ init_context.data = data; ++ init_completion(&init_context.init_done); ++ ++ thread = kthread_run(kvm_vm_worker_thread, &init_context, ++ "%s-%d", name, task_pid_nr(current)); ++ if (IS_ERR(thread)) ++ return PTR_ERR(thread); ++ ++ /* kthread_run is never supposed to return NULL */ ++ WARN_ON(thread == NULL); ++ ++ wait_for_completion(&init_context.init_done); ++ ++ if (!init_context.err) ++ *thread_ptr = thread; ++ ++ return init_context.err; ++}
