commit:     c7d7705101af05e259b3b84ffc59a60ff2b96142
Author:     Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
AuthorDate: Fri Jan  5 15:05:38 2018 +0000
Commit:     Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
CommitDate: Fri Jan  5 15:05:38 2018 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=c7d77051

linux kernel 4.4.110

 0000_README              |    4 +
 1109_linux-4.4.110.patch | 2814 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2818 insertions(+)

diff --git a/0000_README b/0000_README
index 3be106c..46149de 100644
--- a/0000_README
+++ b/0000_README
@@ -479,6 +479,10 @@ Patch:  1108_linux-4.4.109.patch
 From:   http://www.kernel.org
 Desc:   Linux 4.4.109
 
+Patch:  1109_linux-4.4.110.patch
+From:   http://www.kernel.org
+Desc:   Linux 4.4.110
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1109_linux-4.4.110.patch b/1109_linux-4.4.110.patch
new file mode 100644
index 0000000..1c226ed
--- /dev/null
+++ b/1109_linux-4.4.110.patch
@@ -0,0 +1,2814 @@
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index b4a83a490212..5977c4d71356 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+ 
+       nojitter        [IA-64] Disables jitter checking for ITC timers.
+ 
++      nopti           [X86-64] Disable KAISER isolation of kernel from user.
++
+       no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
+ 
+       no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
+@@ -3054,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+       pt.             [PARIDE]
+                       See Documentation/blockdev/paride.txt.
+ 
++      pti=            [X86_64]
++                      Control KAISER user/kernel address space isolation:
++                      on - enable
++                      off - disable
++                      auto - default setting
++
+       pty.legacy_count=
+                       [KNL] Number of legacy pty's. Overwrites compiled-in
+                       default number.
+diff --git a/Makefile b/Makefile
+index 5d67056e24dd..b028c106535b 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 4
+ PATCHLEVEL = 4
+-SUBLEVEL = 109
++SUBLEVEL = 110
+ EXTRAVERSION =
+ NAME = Blurry Fish Butt
+ 
+diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
+index 3783dc3e10b3..4abb284a5b9c 100644
+--- a/arch/x86/boot/compressed/misc.h
++++ b/arch/x86/boot/compressed/misc.h
+@@ -9,6 +9,7 @@
+  */
+ #undef CONFIG_PARAVIRT
+ #undef CONFIG_PARAVIRT_SPINLOCKS
++#undef CONFIG_PAGE_TABLE_ISOLATION
+ #undef CONFIG_KASAN
+ 
+ #include <linux/linkage.h>
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index cc0f2f5da19b..952b23b5d4e9 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -35,6 +35,7 @@
+ #include <asm/asm.h>
+ #include <asm/smap.h>
+ #include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+ #include <linux/err.h>
+ 
+ /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
+        * it is too small to ever cause noticeable irq latency.
+        */
+       SWAPGS_UNSAFE_STACK
++      SWITCH_KERNEL_CR3_NO_STACK
+       /*
+        * A hypervisor implementation might want to use a label
+        * after the swapgs, so that it can do the swapgs
+@@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
+      testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       jnz     int_ret_from_sys_call_irqs_off  /* Go to the slow path */
+ 
+-      RESTORE_C_REGS_EXCEPT_RCX_R11
+       movq    RIP(%rsp), %rcx
+       movq    EFLAGS(%rsp), %r11
++      RESTORE_C_REGS_EXCEPT_RCX_R11
++      /*
++       * This opens a window where we have a user CR3, but are
++       * running in the kernel.  This makes using the CS
++       * register useless for telling whether or not we need to
++       * switch CR3 in NMIs.  Normal interrupts are OK because
++       * they are off here.
++       */
++      SWITCH_USER_CR3
+       movq    RSP(%rsp), %rsp
+       /*
+        * 64-bit SYSRET restores rip from rcx,
+@@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
+ syscall_return_via_sysret:
+       /* rcx and r11 are already restored (see code above) */
+       RESTORE_C_REGS_EXCEPT_RCX_R11
++      /*
++       * This opens a window where we have a user CR3, but are
++       * running in the kernel.  This makes using the CS
++       * register useless for telling whether or not we need to
++       * switch CR3 in NMIs.  Normal interrupts are OK because
++       * they are off here.
++       */
++      SWITCH_USER_CR3
+       movq    RSP(%rsp), %rsp
+       USERGS_SYSRET64
+ 
+ opportunistic_sysret_failed:
++      /*
++       * This opens a window where we have a user CR3, but are
++       * running in the kernel.  This makes using the CS
++       * register useless for telling whether or not we need to
++       * switch CR3 in NMIs.  Normal interrupts are OK because
++       * they are off here.
++       */
++      SWITCH_USER_CR3
+       SWAPGS
+       jmp     restore_c_regs_and_iret
+ END(entry_SYSCALL_64)
+@@ -509,6 +535,7 @@ END(irq_entries_start)
+        * tracking that we're in kernel mode.
+        */
+       SWAPGS
++      SWITCH_KERNEL_CR3
+ 
+       /*
+        * We need to tell lockdep that IRQs are off.  We can't do this until
+@@ -568,6 +595,7 @@ GLOBAL(retint_user)
+       mov     %rsp,%rdi
+       call    prepare_exit_to_usermode
+       TRACE_IRQS_IRETQ
++      SWITCH_USER_CR3
+       SWAPGS
+       jmp     restore_regs_and_iret
+ 
+@@ -625,6 +653,7 @@ native_irq_return_ldt:
+       pushq   %rax
+       pushq   %rdi
+       SWAPGS
++      SWITCH_KERNEL_CR3
+       movq    PER_CPU_VAR(espfix_waddr), %rdi
+       movq    %rax, (0*8)(%rdi)               /* RAX */
+       movq    (2*8)(%rsp), %rax               /* RIP */
+@@ -640,6 +669,7 @@ native_irq_return_ldt:
+       andl    $0xffff0000, %eax
+       popq    %rdi
+       orq     PER_CPU_VAR(espfix_stack), %rax
++      SWITCH_USER_CR3
+       SWAPGS
+       movq    %rax, %rsp
+       popq    %rax
+@@ -995,7 +1025,11 @@ idtentry machine_check                                   has_error_code=0        paranoid=1 do_sym=*machine_check_vec
+ /*
+  * Save all registers in pt_regs, and switch gs if needed.
+  * Use slow, but surefire "are we in kernel?" check.
+- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
++ *
++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
++ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
++ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
++ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+  */
+ ENTRY(paranoid_entry)
+       cld
+@@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry)
+       js      1f                              /* negative -> in kernel */
+       SWAPGS
+       xorl    %ebx, %ebx
+-1:    ret
++1:
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /*
++       * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
++       * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
++       * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
++       * unconditionally, but we need to find out whether the reverse
++       * should be done on return (conveyed to paranoid_exit in %ebx).
++       */
++      ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++      testl   $KAISER_SHADOW_PGD_OFFSET, %eax
++      jz      2f
++      orl     $2, %ebx
++      andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++      /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++      ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++      movq    %rax, %cr3
++2:
++#endif
++      ret
+ END(paranoid_entry)
+ 
+ /*
+@@ -1021,19 +1074,26 @@ END(paranoid_entry)
+  * be complicated.  Fortunately, we there's no good reason
+  * to try to handle preemption here.
+  *
+- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
++ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
++ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
++ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
+  */
+ ENTRY(paranoid_exit)
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF_DEBUG
+-      testl   %ebx, %ebx                      /* swapgs needed? */
++      TRACE_IRQS_IRETQ_DEBUG
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
++      testl   $2, %ebx                        /* SWITCH_USER_CR3 needed? */
++      jz      paranoid_exit_no_switch
++      SWITCH_USER_CR3
++paranoid_exit_no_switch:
++#endif
++      testl   $1, %ebx                        /* swapgs needed? */
+       jnz     paranoid_exit_no_swapgs
+-      TRACE_IRQS_IRETQ
+       SWAPGS_UNSAFE_STACK
+-      jmp     paranoid_exit_restore
+ paranoid_exit_no_swapgs:
+-      TRACE_IRQS_IRETQ_DEBUG
+-paranoid_exit_restore:
+       RESTORE_EXTRA_REGS
+       RESTORE_C_REGS
+       REMOVE_PT_GPREGS_FROM_STACK 8
+@@ -1048,6 +1108,13 @@ ENTRY(error_entry)
+       cld
+       SAVE_C_REGS 8
+       SAVE_EXTRA_REGS 8
++      /*
++       * error_entry() always returns with a kernel gsbase and
++       * CR3.  We must also have a kernel CR3/gsbase before
++       * calling TRACE_IRQS_*.  Just unconditionally switch to
++       * the kernel CR3 here.
++       */
++      SWITCH_KERNEL_CR3
+       xorl    %ebx, %ebx
+       testb   $3, CS+8(%rsp)
+       jz      .Lerror_kernelspace
+@@ -1210,6 +1277,10 @@ ENTRY(nmi)
+        */
+ 
+       SWAPGS_UNSAFE_STACK
++      /*
++       * percpu variables are mapped with user CR3, so no need
++       * to switch CR3 here.
++       */
+       cld
+       movq    %rsp, %rdx
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+@@ -1243,12 +1314,34 @@ ENTRY(nmi)
+ 
+       movq    %rsp, %rdi
+       movq    $-1, %rsi
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /* Unconditionally use kernel CR3 for do_nmi() */
++      /* %rax is saved above, so OK to clobber here */
++      ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++      /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
++      ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++      pushq   %rax
++      /* mask off "user" bit of pgd address and 12 PCID bits: */
++      andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++      movq    %rax, %cr3
++2:
++#endif
+       call    do_nmi
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /*
++       * Unconditionally restore CR3.  I know we return to
++       * kernel code that needs user CR3, but do we ever return
++       * to "user mode" where we need the kernel CR3?
++       */
++      ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
++#endif
++
+       /*
+        * Return back to user mode.  We must *not* do the normal exit
+-       * work, because we don't want to enable interrupts.  Fortunately,
+-       * do_nmi doesn't modify pt_regs.
++       * work, because we don't want to enable interrupts.  Do not
++       * switch to user CR3: we might be going back to kernel code
++       * that had a user CR3 set.
+        */
+       SWAPGS
+       jmp     restore_c_regs_and_iret
+@@ -1445,22 +1538,55 @@ end_repeat_nmi:
+       ALLOC_PT_GPREGS_ON_STACK
+ 
+       /*
+-       * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+-       * as we should not be calling schedule in NMI context.
+-       * Even with normal interrupts enabled. An NMI should not be
+-       * setting NEED_RESCHED or anything that normal interrupts and
+-       * exceptions might do.
++       * Use the same approach as paranoid_entry to handle SWAPGS, but
++       * without CR3 handling since we do that differently in NMIs.  No
++       * need to use paranoid_exit as we should not be calling schedule
++       * in NMI context.  Even with normal interrupts enabled. An NMI
++       * should not be setting NEED_RESCHED or anything that normal
++       * interrupts and exceptions might do.
+        */
+-      call    paranoid_entry
+-
+-      /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
++      cld
++      SAVE_C_REGS
++      SAVE_EXTRA_REGS
++      movl    $1, %ebx
++      movl    $MSR_GS_BASE, %ecx
++      rdmsr
++      testl   %edx, %edx
++      js      1f                              /* negative -> in kernel */
++      SWAPGS
++      xorl    %ebx, %ebx
++1:
+       movq    %rsp, %rdi
+       movq    $-1, %rsi
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /* Unconditionally use kernel CR3 for do_nmi() */
++      /* %rax is saved above, so OK to clobber here */
++      ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
++      /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
++      ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
++      pushq   %rax
++      /* mask off "user" bit of pgd address and 12 PCID bits: */
++      andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++      movq    %rax, %cr3
++2:
++#endif
++
++      /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+       call    do_nmi
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      /*
++       * Unconditionally restore CR3.  We might be returning to
++       * kernel code that needs user CR3, like just before
++       * a sysret.
++       */
++      ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
++#endif
++
+       testl   %ebx, %ebx                      /* swapgs needed? */
+       jnz     nmi_restore
+ nmi_swapgs:
++      /* We fixed up CR3 above, so no need to switch it here */
+       SWAPGS_UNSAFE_STACK
+ nmi_restore:
+       RESTORE_EXTRA_REGS
+diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
+index 15cfebaa7688..d03bf0e28b8b 100644
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -13,6 +13,8 @@
+ #include <asm/irqflags.h>
+ #include <asm/asm.h>
+ #include <asm/smap.h>
++#include <asm/pgtable_types.h>
++#include <asm/kaiser.h>
+ #include <linux/linkage.h>
+ #include <linux/err.h>
+ 
+@@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
+ ENTRY(entry_SYSENTER_compat)
+       /* Interrupts are off on entry. */
+       SWAPGS_UNSAFE_STACK
++      SWITCH_KERNEL_CR3_NO_STACK
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ 
+       /*
+@@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
+ ENTRY(entry_SYSCALL_compat)
+       /* Interrupts are off on entry. */
+       SWAPGS_UNSAFE_STACK
++      SWITCH_KERNEL_CR3_NO_STACK
+ 
+       /* Stash user ESP and switch to the kernel stack. */
+       movl    %esp, %r8d
+@@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
+       /* Opportunistic SYSRET */
+ sysret32_from_system_call:
+       TRACE_IRQS_ON                   /* User mode traces as IRQs on. */
++      SWITCH_USER_CR3
+       movq    RBX(%rsp), %rbx         /* pt_regs->rbx */
+       movq    RBP(%rsp), %rbp         /* pt_regs->rbp */
+       movq    EFLAGS(%rsp), %r11      /* pt_regs->flags (in r11) */
+@@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
+       PARAVIRT_ADJUST_EXCEPTION_FRAME
+       ASM_CLAC                        /* Do this early to minimize exposure */
+       SWAPGS
++      SWITCH_KERNEL_CR3_NO_STACK
+ 
+       /*
+        * User tracing code (ptrace or signal handlers) might assume that
+@@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
+ 
+       /* Go back to user mode. */
+       TRACE_IRQS_ON
++      SWITCH_USER_CR3
+       SWAPGS
+       jmp     restore_regs_and_iret
+ END(entry_INT80_compat)
+diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
+index ca94fa649251..5dd363d54348 100644
+--- a/arch/x86/entry/vdso/vclock_gettime.c
++++ b/arch/x86/entry/vdso/vclock_gettime.c
+@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
+ }
+ #endif
+ 
++#ifdef CONFIG_PARAVIRT_CLOCK
++extern u8 pvclock_page
++      __attribute__((visibility("hidden")));
++#endif
++
+ #ifndef BUILD_VDSO32
+ 
+ #include <linux/kernel.h>
+@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+ 
+ #ifdef CONFIG_PARAVIRT_CLOCK
+ 
+-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
++static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+ {
+-      const struct pvclock_vsyscall_time_info *pvti_base;
+-      int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+-      int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+-
+-      BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+-
+-      pvti_base = (struct pvclock_vsyscall_time_info *)
+-                  __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+-
+-      return &pvti_base[offset];
++      return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+ }
+ 
+ static notrace cycle_t vread_pvclock(int *mode)
+ {
+-      const struct pvclock_vsyscall_time_info *pvti;
++      const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+       cycle_t ret;
+-      u64 last;
+-      u32 version;
+-      u8 flags;
+-      unsigned cpu, cpu1;
+-
++      u64 tsc, pvti_tsc;
++      u64 last, delta, pvti_system_time;
++      u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+ 
+       /*
+-       * Note: hypervisor must guarantee that:
+-       * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+-       * 2. that per-CPU pvclock time info is updated if the
+-       *    underlying CPU changes.
+-       * 3. that version is increased whenever underlying CPU
+-       *    changes.
++       * Note: The kernel and hypervisor must guarantee that cpu ID
++       * number maps 1:1 to per-CPU pvclock time info.
++       *
++       * Because the hypervisor is entirely unaware of guest userspace
++       * preemption, it cannot guarantee that per-CPU pvclock time
++       * info is updated if the underlying CPU changes or that the
++       * version is increased whenever underlying CPU changes.
+        *
++       * On KVM, we are guaranteed that pvti updates for any vCPU are
++       * atomic as seen by *all* vCPUs.  This is an even stronger
++       * guarantee than we get with a normal seqlock.
++       *
++       * On Xen, we don't appear to have that guarantee, but Xen still
++       * supplies a valid seqlock using the version field.
++       *
++       * We only do pvclock vdso timing at all if
++       * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
++       * mean that all vCPUs have matching pvti and that the TSC is
++       * synced, so we can just look at vCPU 0's pvti.
+        */
+-      do {
+-              cpu = __getcpu() & VGETCPU_CPU_MASK;
+-              /* TODO: We can put vcpu id into higher bits of pvti.version.
+-               * This will save a couple of cycles by getting rid of
+-               * __getcpu() calls (Gleb).
+-               */
+-
+-              pvti = get_pvti(cpu);
+-
+-              version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+-
+-              /*
+-               * Test we're still on the cpu as well as the version.
+-               * We could have been migrated just after the first
+-               * vgetcpu but before fetching the version, so we
+-               * wouldn't notice a version change.
+-               */
+-              cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+-      } while (unlikely(cpu != cpu1 ||
+-                        (pvti->pvti.version & 1) ||
+-                        pvti->pvti.version != version));
+-
+-      if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
++
++      if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+               *mode = VCLOCK_NONE;
++              return 0;
++      }
++
++      do {
++              version = pvti->version;
++
++              /* This is also a read barrier, so we'll read version first. */
++              tsc = rdtsc_ordered();
++
++              pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
++              pvti_tsc_shift = pvti->tsc_shift;
++              pvti_system_time = pvti->system_time;
++              pvti_tsc = pvti->tsc_timestamp;
++
++              /* Make sure that the version double-check is last. */
++              smp_rmb();
++      } while (unlikely((version & 1) || version != pvti->version));
++
++      delta = tsc - pvti_tsc;
++      ret = pvti_system_time +
++              pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
++                                  pvti_tsc_shift);
+ 
+       /* refer to tsc.c read_tsc() comment for rationale */
+       last = gtod->cycle_last;
+diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
+index de2c921025f5..4158acc17df0 100644
+--- a/arch/x86/entry/vdso/vdso-layout.lds.S
++++ b/arch/x86/entry/vdso/vdso-layout.lds.S
+@@ -25,7 +25,7 @@ SECTIONS
+        * segment.
+        */
+ 
+-      vvar_start = . - 2 * PAGE_SIZE;
++      vvar_start = . - 3 * PAGE_SIZE;
+       vvar_page = vvar_start;
+ 
+       /* Place all vvars at the offsets in asm/vvar.h. */
+@@ -36,6 +36,7 @@ SECTIONS
+ #undef EMIT_VVAR
+ 
+       hpet_page = vvar_start + PAGE_SIZE;
++      pvclock_page = vvar_start + 2 * PAGE_SIZE;
+ 
+       . = SIZEOF_HEADERS;
+ 
+diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
+index 785d9922b106..491020b2826d 100644
+--- a/arch/x86/entry/vdso/vdso2c.c
++++ b/arch/x86/entry/vdso/vdso2c.c
+@@ -73,6 +73,7 @@ enum {
+       sym_vvar_start,
+       sym_vvar_page,
+       sym_hpet_page,
++      sym_pvclock_page,
+       sym_VDSO_FAKE_SECTION_TABLE_START,
+       sym_VDSO_FAKE_SECTION_TABLE_END,
+ };
+@@ -80,6 +81,7 @@ enum {
+ const int special_pages[] = {
+       sym_vvar_page,
+       sym_hpet_page,
++      sym_pvclock_page,
+ };
+ 
+ struct vdso_sym {
+@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
+       [sym_vvar_start] = {"vvar_start", true},
+       [sym_vvar_page] = {"vvar_page", true},
+       [sym_hpet_page] = {"hpet_page", true},
++      [sym_pvclock_page] = {"pvclock_page", true},
+       [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+               "VDSO_FAKE_SECTION_TABLE_START", false
+       },
+diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
+index 64df47148160..aa828191c654 100644
+--- a/arch/x86/entry/vdso/vma.c
++++ b/arch/x86/entry/vdso/vma.c
+@@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
+               .name = "[vvar]",
+               .pages = no_pages,
+       };
++      struct pvclock_vsyscall_time_info *pvti;
+ 
+       if (calculate_addr) {
+               addr = vdso_addr(current->mm->start_stack,
+@@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
+       }
+ #endif
+ 
++      pvti = pvclock_pvti_cpu0_va();
++      if (pvti && image->sym_pvclock_page) {
++              ret = remap_pfn_range(vma,
++                                    text_start + image->sym_pvclock_page,
++                                    __pa(pvti) >> PAGE_SHIFT,
++                                    PAGE_SIZE,
++                                    PAGE_READONLY);
++
++              if (ret)
++                      goto up_fail;
++      }
++
+ up_fail:
+       if (ret)
+               current->mm->context.vdso = NULL;
+diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
+index e01f7f7ccb0c..84ae170bc3d0 100644
+--- a/arch/x86/include/asm/cmdline.h
++++ b/arch/x86/include/asm/cmdline.h
+@@ -2,5 +2,7 @@
+ #define _ASM_X86_CMDLINE_H
+ 
+ int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
++int cmdline_find_option(const char *cmdline_ptr, const char *option,
++                      char *buffer, int bufsize);
+ 
+ #endif /* _ASM_X86_CMDLINE_H */
+diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
+index f7ba9fbf12ee..f6605712ca90 100644
+--- a/arch/x86/include/asm/cpufeature.h
++++ b/arch/x86/include/asm/cpufeature.h
+@@ -187,6 +187,7 @@
+ #define X86_FEATURE_ARAT      ( 7*32+ 1) /* Always Running APIC Timer */
+ #define X86_FEATURE_CPB               ( 7*32+ 2) /* AMD Core Performance Boost */
+ #define X86_FEATURE_EPB               ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
+ #define X86_FEATURE_PLN               ( 7*32+ 5) /* Intel Power Limit Notification */
+ #define X86_FEATURE_PTS               ( 7*32+ 6) /* Intel Package Thermal Status */
+ #define X86_FEATURE_DTHERM    ( 7*32+ 7) /* Digital Thermal Sensor */
+@@ -199,6 +200,9 @@
+ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+ #define X86_FEATURE_INTEL_PT  ( 7*32+15) /* Intel Processor Trace */
+ 
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER    ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
+diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
+index 4e10d73cf018..880db91d9457 100644
+--- a/arch/x86/include/asm/desc.h
++++ b/arch/x86/include/asm/desc.h
+@@ -43,7 +43,7 @@ struct gdt_page {
+       struct desc_struct gdt[GDT_ENTRIES];
+ } __attribute__((aligned(PAGE_SIZE)));
+ 
+-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+ 
+ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+ {
+diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
+index 59caa55fb9b5..ee52ff858699 100644
+--- a/arch/x86/include/asm/hw_irq.h
++++ b/arch/x86/include/asm/hw_irq.h
+@@ -187,7 +187,7 @@ extern char irq_entries_start[];
+ #define VECTOR_RETRIGGERED    ((void *)~0UL)
+ 
+ typedef struct irq_desc* vector_irq_t[NR_VECTORS];
+-DECLARE_PER_CPU(vector_irq_t, vector_irq);
++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
+ 
+ #endif /* !ASSEMBLY_ */
+ 
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
+new file mode 100644
+index 000000000000..802bbbdfe143
+--- /dev/null
++++ b/arch/x86/include/asm/kaiser.h
+@@ -0,0 +1,141 @@
++#ifndef _ASM_X86_KAISER_H
++#define _ASM_X86_KAISER_H
++
++#include <uapi/asm/processor-flags.h> /* For PCID constants */
++
++/*
++ * This file includes the definitions for the KAISER feature.
++ * KAISER is a counter measure against x86_64 side channel attacks on
++ * the kernel virtual memory.  It has a shadow pgd for every process: the
++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
++ * user memory. Within a kernel context switch, or when an interrupt is handled,
++ * the pgd is switched to the normal one. When the system switches to user mode,
++ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
++ * and the user may not attack the whole kernel memory.
++ *
++ * A minimalistic kernel mapping holds the parts needed to be mapped in user
++ * mode, such as the entry/exit functions of the user space, or the stacks.
++ */
++
++#define KAISER_SHADOW_PGD_OFFSET 0x1000
++
++#ifdef __ASSEMBLY__
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++
++.macro _SWITCH_TO_KERNEL_CR3 reg
++movq %cr3, \reg
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
++movq \reg, %cr3
++.endm
++
++.macro _SWITCH_TO_USER_CR3 reg regb
++/*
++ * regb must be the low byte portion of reg: because we have arranged
++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
++ * not enabled): so that the one register can update both memory and cr3.
++ */
++movq %cr3, \reg
++orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
++js   9f
++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
++9:
++movq \reg, %cr3
++.endm
++
++.macro SWITCH_KERNEL_CR3
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
++_SWITCH_TO_KERNEL_CR3 %rax
++popq %rax
++8:
++.endm
++
++.macro SWITCH_USER_CR3
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
++_SWITCH_TO_USER_CR3 %rax %al
++popq %rax
++8:
++.endm
++
++.macro SWITCH_KERNEL_CR3_NO_STACK
++ALTERNATIVE "jmp 8f", \
++      __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
++      X86_FEATURE_KAISER
++_SWITCH_TO_KERNEL_CR3 %rax
++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
++8:
++.endm
++
++#else /* CONFIG_PAGE_TABLE_ISOLATION */
++
++.macro SWITCH_KERNEL_CR3
++.endm
++.macro SWITCH_USER_CR3
++.endm
++.macro SWITCH_KERNEL_CR3_NO_STACK
++.endm
++
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
++#else /* __ASSEMBLY__ */
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Upon kernel/user mode switch, it may happen that the address
++ * space has to be switched before the registers have been
++ * stored.  To change the address space, another register is
++ * needed.  A register therefore has to be stored/restored.
++*/
++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
++
++extern int kaiser_enabled;
++extern void __init kaiser_check_boottime_disable(void);
++#else
++#define kaiser_enabled        0
++static inline void __init kaiser_check_boottime_disable(void) {}
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
++/*
++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
++ */
++
++/**
++ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
++ *  @addr: the start address of the range
++ *  @size: the size of the range
++ *  @flags: The mapping flags of the pages
++ *
++ *  The mapping is done on a global scope, so no bigger
++ *  synchronization has to be done.  the pages have to be
++ *  manually unmapped again when they are not needed any longer.
++ */
++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
++
++/**
++ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
++ *  @start: the start address of the range
++ *  @size: the size of the range
++ */
++extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
++
++/**
++ *  kaiser_init - Initialize the shadow mapping
++ *
++ *  Most parts of the shadow mapping can be mapped upon boot
++ *  time.  Only per-process things like the thread stacks
++ *  or a new LDT have to be mapped at runtime.  These boot-
++ *  time mappings are permanent and never unmapped.
++ */
++extern void kaiser_init(void);
++
++#endif /* __ASSEMBLY__ */
++
++#endif /* _ASM_X86_KAISER_H */
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 6ec0c8b2e9df..84c62d950023 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -18,6 +18,12 @@
+ #ifndef __ASSEMBLY__
+ #include <asm/x86_init.h>
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif
++
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+ void ptdump_walk_pgd_level_checkwx(void);
+ 
+@@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+ 
+ static inline int pgd_bad(pgd_t pgd)
+ {
+-      return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++      pgdval_t ignore_flags = _PAGE_USER;
++      /*
++       * We set NX on KAISER pgds that map userspace memory so
++       * that userspace can not meaningfully use the kernel
++       * page table by accident; it will fault on the first
++       * instruction it tries to run.  See native_set_pgd().
++       */
++      if (kaiser_enabled)
++              ignore_flags |= _PAGE_NX;
++
++      return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+ 
+ static inline int pgd_none(pgd_t pgd)
+@@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+  */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+-       memcpy(dst, src, count * sizeof(pgd_t));
++      memcpy(dst, src, count * sizeof(pgd_t));
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      if (kaiser_enabled) {
++              /* Clone the shadow pgd part as well */
++              memcpy(native_get_shadow_pgd(dst),
++                      native_get_shadow_pgd(src),
++                      count * sizeof(pgd_t));
++      }
++#endif
+ }
+ 
+ #define PTE_SHIFT ilog2(PTRS_PER_PTE)
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index 2ee781114d34..c810226e741a 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
+       native_set_pud(pud, native_make_pud(0));
+ }
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
++
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++#ifdef CONFIG_DEBUG_VM
++      /* linux/mmdebug.h may not have been included at this point */
++      BUG_ON(!kaiser_enabled);
++#endif
++      return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
++}
++#else
++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++      return pgd;
++}
++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
++{
++      BUILD_BUG_ON(1);
++      return NULL;
++}
++#endif /* CONFIG_PAGE_TABLE_ISOLATION */
++
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+-      *pgdp = pgd;
++      *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
+ }
+ 
+ static inline void native_pgd_clear(pgd_t *pgd)
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index 79c91853e50e..8dba273da25a 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -89,7 +89,7 @@
+ #define _PAGE_NX      (_AT(pteval_t, 0))
+ #endif
+ 
+-#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
++#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+ 
+ #define _PAGE_TABLE   (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
+                        _PAGE_ACCESSED | _PAGE_DIRTY)
+@@ -102,6 +102,33 @@
+                        _PAGE_SOFT_DIRTY)
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+ 
++/* The ASID is the lower 12 bits of CR3 */
++#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
++
++/* Mask for all the PCID-related bits in CR3: */
++#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
++#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
++
++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
++#define X86_CR3_PCID_ASID_USER        (_AC(0x80,UL))
++
++#define X86_CR3_PCID_KERN_FLUSH               (X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_FLUSH               (X86_CR3_PCID_ASID_USER)
++#define X86_CR3_PCID_KERN_NOFLUSH     (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_NOFLUSH     (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
++#else
++#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
++/*
++ * PCIDs are unsupported on 32-bit and none of these bits can be
++ * set in CR3:
++ */
++#define X86_CR3_PCID_KERN_FLUSH               (0)
++#define X86_CR3_PCID_USER_FLUSH               (0)
++#define X86_CR3_PCID_KERN_NOFLUSH     (0)
++#define X86_CR3_PCID_USER_NOFLUSH     (0)
++#endif
++
+ /*
+  * The cache modes defined here are used to translate between pure SW usage
+  * and the HW defined cache mode bits and/or PAT entries.
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 2d5a50cb61a2..f3bdaed0188f 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -305,7 +305,7 @@ struct tss_struct {
+ 
+ } ____cacheline_aligned;
+ 
+-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
+ 
+ #ifdef CONFIG_X86_32
+ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
+index baad72e4c100..6045cef376c2 100644
+--- a/arch/x86/include/asm/pvclock.h
++++ b/arch/x86/include/asm/pvclock.h
+@@ -4,6 +4,15 @@
+ #include <linux/clocksource.h>
+ #include <asm/pvclock-abi.h>
+ 
++#ifdef CONFIG_PARAVIRT_CLOCK
++extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
++#else
++static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
++{
++      return NULL;
++}
++#endif
++
+ /* some helper functions for xen and kvm pv clock sources */
+ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index 9fc5968da820..a691b66cc40a 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -131,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+       cr4_set_bits(mask);
+ }
+ 
++/*
++ * Declare a couple of kaiser interfaces here for convenience,
++ * to avoid the need for asm/kaiser.h in unexpected places.
++ */
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++extern int kaiser_enabled;
++extern void kaiser_setup_pcid(void);
++extern void kaiser_flush_tlb_on_return_to_user(void);
++#else
++#define kaiser_enabled 0
++static inline void kaiser_setup_pcid(void)
++{
++}
++static inline void kaiser_flush_tlb_on_return_to_user(void)
++{
++}
++#endif
++
+ static inline void __native_flush_tlb(void)
+ {
+       /*
+@@ -139,6 +157,8 @@ static inline void __native_flush_tlb(void)
+        * back:
+        */
+       preempt_disable();
++      if (kaiser_enabled)
++              kaiser_flush_tlb_on_return_to_user();
+       native_write_cr3(native_read_cr3());
+       preempt_enable();
+ }
+@@ -148,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
+       unsigned long cr4;
+ 
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);
+-      /* clear PGE */
+-      native_write_cr4(cr4 & ~X86_CR4_PGE);
+-      /* write old PGE again and flush TLBs */
+-      native_write_cr4(cr4);
++      if (cr4 & X86_CR4_PGE) {
++              /* clear PGE and flush TLB of all entries */
++              native_write_cr4(cr4 & ~X86_CR4_PGE);
++              /* restore PGE as it was before */
++              native_write_cr4(cr4);
++      } else {
++              /* do it with cr3, letting kaiser flush user PCID */
++              __native_flush_tlb();
++      }
+ }
+ 
+ static inline void __native_flush_tlb_global(void)
+ {
+       unsigned long flags;
+ 
+-      if (static_cpu_has(X86_FEATURE_INVPCID)) {
++      if (this_cpu_has(X86_FEATURE_INVPCID)) {
+               /*
+                * Using INVPCID is considerably faster than a pair of writes
+                * to CR4 sandwiched inside an IRQ flag save/restore.
++               *
++               * Note, this works with CR4.PCIDE=0 or 1.
+                */
+               invpcid_flush_all();
+               return;
+@@ -173,24 +200,45 @@ static inline void __native_flush_tlb_global(void)
+        * be called from deep inside debugging code.)
+        */
+       raw_local_irq_save(flags);
+-
+       __native_flush_tlb_global_irq_disabled();
+-
+       raw_local_irq_restore(flags);
+ }
+ 
+ static inline void __native_flush_tlb_single(unsigned long addr)
+ {
+-      asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++      /*
++       * SIMICS #GP's if you run INVPCID with type 2/3
++       * and X86_CR4_PCIDE clear.  Shame!
++       *
++       * The ASIDs used below are hard-coded.  But, we must not
++       * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
++       * invlpg in the case we are called early.
++       */
++
++      if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
++              if (kaiser_enabled)
++                      kaiser_flush_tlb_on_return_to_user();
++              asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++              return;
++      }
++      /* Flush the address out of both PCIDs. */
++      /*
++       * An optimization here might be to determine addresses
++       * that are only kernel-mapped and only flush the kernel
++       * ASID.  But, userspace flushes are probably much more
++       * important performance-wise.
++       *
++       * Make sure to do only a single invpcid when KAISER is
++       * disabled and we have only a single ASID.
++       */
++      if (kaiser_enabled)
++              invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++      invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ }
+ 
+ static inline void __flush_tlb_all(void)
+ {
+-      if (cpu_has_pge)
+-              __flush_tlb_global();
+-      else
+-              __flush_tlb();
+-
++      __flush_tlb_global();
+       /*
+        * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+        * we'd end up flushing kernel translations for the current ASID but
+diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
+index 756de9190aec..deabaf9759b6 100644
+--- a/arch/x86/include/asm/vdso.h
++++ b/arch/x86/include/asm/vdso.h
+@@ -22,6 +22,7 @@ struct vdso_image {
+ 
+       long sym_vvar_page;
+       long sym_hpet_page;
++      long sym_pvclock_page;
+       long sym_VDSO32_NOTE_MASK;
+       long sym___kernel_sigreturn;
+       long sym___kernel_rt_sigreturn;
+diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
+index 79887abcb5e1..1361779f44fe 100644
+--- a/arch/x86/include/uapi/asm/processor-flags.h
++++ b/arch/x86/include/uapi/asm/processor-flags.h
+@@ -77,7 +77,8 @@
+ #define X86_CR3_PWT           _BITUL(X86_CR3_PWT_BIT)
+ #define X86_CR3_PCD_BIT               4 /* Page Cache Disable */
+ #define X86_CR3_PCD           _BITUL(X86_CR3_PCD_BIT)
+-#define X86_CR3_PCID_MASK     _AC(0x00000fff,UL) /* PCID Mask */
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
++#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+ 
+ /*
+  * Intel CPU features in CR4
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index aa1e7246b06b..cc154ac64f00 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
+ 
+ static const struct cpu_dev *this_cpu = &default_cpu;
+ 
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+ #ifdef CONFIG_X86_64
+       /*
+        * We need valid kernel segments for data and code in long mode too
+@@ -324,8 +324,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+ static void setup_pcid(struct cpuinfo_x86 *c)
+ {
+       if (cpu_has(c, X86_FEATURE_PCID)) {
+-              if (cpu_has(c, X86_FEATURE_PGE)) {
++              if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+                       cr4_set_bits(X86_CR4_PCIDE);
++                      /*
++                       * INVPCID has two "groups" of types:
++                       * 1/2: Invalidate an individual address
++                       * 3/4: Invalidate all contexts
++                       *
++                       * 1/2 take a PCID, but 3/4 do not.  So, 3/4
++                       * ignore the PCID argument in the descriptor.
++                       * But, we have to be careful not to call 1/2
++                       * with an actual non-zero PCID in them before
++                       * we do the above cr4_set_bits().
++                       */
++                      if (cpu_has(c, X86_FEATURE_INVPCID))
++                              set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+               } else {
+                       /*
+                        * flush_tlb_all(), as currently implemented, won't
+@@ -338,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
+                       clear_cpu_cap(c, X86_FEATURE_PCID);
+               }
+       }
++      kaiser_setup_pcid();
+ }
+ 
+ /*
+@@ -1229,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+ };
+ 
+-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+ 
+ /* May not be marked __init: used by software suspend */
+@@ -1392,6 +1406,14 @@ void cpu_init(void)
+        * try to read it.
+        */
+       cr4_init_shadow();
++      if (!kaiser_enabled) {
++              /*
++               * secondary_startup_64() deferred setting PGE in cr4:
++               * probe_page_size_mask() sets it on the boot cpu,
++               * but it needs to be set on each secondary cpu.
++               */
++              cr4_set_bits(X86_CR4_PGE);
++      }
+ 
+       /*
+        * Load microcode on this cpu if a valid microcode is available.
+diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
+index 1e7de3cefc9c..f01b3a12dce0 100644
+--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
+@@ -2,11 +2,15 @@
+ #include <linux/types.h>
+ #include <linux/slab.h>
+ 
++#include <asm/kaiser.h>
+ #include <asm/perf_event.h>
+ #include <asm/insn.h>
+ 
+ #include "perf_event.h"
+ 
++static
++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
++
+ /* The size of a BTS record in bytes: */
+ #define BTS_RECORD_SIZE               24
+ 
+@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
+ 
+ static DEFINE_PER_CPU(void *, insn_buffer);
+ 
++static void *dsalloc(size_t size, gfp_t flags, int node)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      unsigned int order = get_order(size);
++      struct page *page;
++      unsigned long addr;
++
++      page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
++      if (!page)
++              return NULL;
++      addr = (unsigned long)page_address(page);
++      if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
++              __free_pages(page, order);
++              addr = 0;
++      }
++      return (void *)addr;
++#else
++      return kmalloc_node(size, flags | __GFP_ZERO, node);
++#endif
++}
++
++static void dsfree(const void *buffer, size_t size)
++{
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++      if (!buffer)
++              return;
++      kaiser_remove_mapping((unsigned long)buffer, size);
++      free_pages((unsigned long)buffer, get_order(size));
++#else
++      kfree(buffer);
++#endif
++}
++
+ static int alloc_pebs_buffer(int cpu)
+ {
+       struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
+       if (!x86_pmu.pebs)
+               return 0;
+ 
+-      buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
++      buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+       if (unlikely(!buffer))
+               return -ENOMEM;
+ 
+@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
+       if (x86_pmu.intel_cap.pebs_format < 2) {
+               ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+               if (!ibuffer) {
+-                      kfree(buffer);
++                      dsfree(buffer, x86_pmu.pebs_buffer_size);
+                       return -ENOMEM;
+               }
+               per_cpu(insn_buffer, cpu) = ibuffer;
+@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
+       kfree(per_cpu(insn_buffer, cpu));
+       per_cpu(insn_buffer, cpu) = NULL;
+ 
+-      kfree((void *)(unsigned long)ds->pebs_buffer_base);
++      dsfree((void *)(unsigned long)ds->pebs_buffer_base,
++                      x86_pmu.pebs_buffer_size);
+       ds->pebs_buffer_base = 0;
+ }
+ 
+@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
+       if (!x86_pmu.bts)
+               return 0;
+ 
+-      buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
++      buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+       if (unlikely(!buffer)) {
+               WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+               return -ENOMEM;
+@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
+       if (!ds || !x86_pmu.bts)
+               return;
+ 
+-      kfree((void *)(unsigned long)ds->bts_buffer_base);
++      dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
+       ds->bts_buffer_base = 0;
+ }
+ 
+ static int alloc_ds_buffer(int cpu)
+ {
+-      int node = cpu_to_node(cpu);
+-      struct debug_store *ds;
+-
+-      ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
+-      if (unlikely(!ds))
+-              return -ENOMEM;
++      struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+ 
++      memset(ds, 0, sizeof(*ds));
+       per_cpu(cpu_hw_events, cpu).ds = ds;
+ 
+       return 0;
+@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
+               return;
+ 
+       per_cpu(cpu_hw_events, cpu).ds = NULL;
+-      kfree(ds);
+ }
+ 
+ void release_ds_buffers(void)
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
+index 4d38416e2a7f..b02cb2ec6726 100644
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -41,6 +41,7 @@
+ #include <asm/pgalloc.h>
+ #include <asm/setup.h>
+ #include <asm/espfix.h>
++#include <asm/kaiser.h>
+ 
+ /*
+  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
+       /* Install the espfix pud into the kernel page directory */
+       pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+       pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
++      /*
++       * Just copy the top-level PGD that is mapping the espfix
++       * area to ensure it is mapped into the shadow user page
++       * tables.
++       */
++      if (kaiser_enabled) {
++              set_pgd(native_get_shadow_pgd(pgd_p),
++                      __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
++      }
+ 
+       /* Randomize the locations */
+       init_espfix_random();
+diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
+index ffdc0e860390..4034e905741a 100644
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
+       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+ 1:
+ 
+-      /* Enable PAE mode and PGE */
+-      movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
++      /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
++      movl    $(X86_CR4_PAE | X86_CR4_PSE), %ecx
+       movq    %rcx, %cr4
+ 
+       /* Setup early boot stage 4 level pagetables. */
+@@ -441,6 +441,27 @@ early_idt_ripmsg:
+       .balign PAGE_SIZE; \
+ GLOBAL(name)
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++/*
++ * Each PGD needs to be 8k long and 8k aligned.  We do not
++ * ever go out to userspace with these, so we do not
++ * strictly *need* the second page, but this allows us to
++ * have a single set_pgd() implementation that does not
++ * need to worry about whether it has 4k or 8k to work
++ * with.
++ *
++ * This ensures PGDs are 8k long:
++ */
++#define KAISER_USER_PGD_FILL  512
++/* This ensures they are 8k-aligned: */
++#define NEXT_PGD_PAGE(name) \
++      .balign 2 * PAGE_SIZE; \
++GLOBAL(name)
++#else
++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
++#define KAISER_USER_PGD_FILL  0
++#endif
++
+ /* Automate the creation of 1 to 1 mapping pmd entries */
+ #define PMDS(START, PERM, COUNT)                      \
+       i = 0 ;                                         \
+@@ -450,9 +471,10 @@ GLOBAL(name)
+       .endr
+ 
+       __INITDATA
+-NEXT_PAGE(early_level4_pgt)
++NEXT_PGD_PAGE(early_level4_pgt)
+       .fill   511,8,0
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++      .fill   KAISER_USER_PGD_FILL,8,0
+ 
+ NEXT_PAGE(early_dynamic_pgts)
+       .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
+       .data
+ 
+ #ifndef CONFIG_XEN
+-NEXT_PAGE(init_level4_pgt)
++NEXT_PGD_PAGE(init_level4_pgt)
+       .fill   512,8,0
++      .fill   KAISER_USER_PGD_FILL,8,0
+ #else
+-NEXT_PAGE(init_level4_pgt)
++NEXT_PGD_PAGE(init_level4_pgt)
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .org    init_level4_pgt + L4_START_KERNEL*8, 0
+       /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++      .fill   KAISER_USER_PGD_FILL,8,0
+ 
+ NEXT_PAGE(level3_ident_pgt)
+       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
+        */
+       PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ #endif
++      .fill   KAISER_USER_PGD_FILL,8,0
+ 
+ NEXT_PAGE(level3_kernel_pgt)
+       .fill   L3_START_KERNEL,8,0
+diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
+index 1423ab1b0312..f480b38a03c3 100644
+--- a/arch/x86/kernel/irqinit.c
++++ b/arch/x86/kernel/irqinit.c
+@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
+       .flags = IRQF_NO_THREAD,
+ };
+ 
+-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
+       [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+ };
+ 
+diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
+index 2bd81e302427..ec1b06dc82d2 100644
+--- a/arch/x86/kernel/kvmclock.c
++++ b/arch/x86/kernel/kvmclock.c
+@@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
+ static struct pvclock_vsyscall_time_info *hv_clock;
+ static struct pvclock_wall_clock wall_clock;
+ 
++struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
++{
++      return hv_clock;
++}
++
+ /*
+  * The wallclock is the time of day when we booted. Since then, some time may
+  * have elapsed since the hypervisor wrote the data. So we try to account for
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index d6279593bcdd..bc429365b72a 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -16,6 +16,7 @@
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
++#include <linux/kaiser.h>
+ 
+ #include <asm/ldt.h>
+ #include <asm/desc.h>
+@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
+       set_ldt(pc->ldt->entries, pc->ldt->size);
+ }
+ 
++static void __free_ldt_struct(struct ldt_struct *ldt)
++{
++      if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++              vfree(ldt->entries);
++      else
++              free_page((unsigned long)ldt->entries);
++      kfree(ldt);
++}
++
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+ static struct ldt_struct *alloc_ldt_struct(int size)
+ {
+       struct ldt_struct *new_ldt;
+       int alloc_size;
++      int ret;
+ 
+       if (size > LDT_ENTRIES)
+               return NULL;
+@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
+               return NULL;
+       }
+ 
++      ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
++                               __PAGE_KERNEL);
+       new_ldt->size = size;
++      if (ret) {
++              __free_ldt_struct(new_ldt);
++              return NULL;
++      }
+       return new_ldt;
+ }
+ 
+@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
+       if (likely(!ldt))
+               return;
+ 
++      kaiser_remove_mapping((unsigned long)ldt->entries,
++                            ldt->size * LDT_ENTRY_SIZE);
+       paravirt_free_ldt(ldt->entries, ldt->size);
+-      if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+-              vfree(ldt->entries);
+-      else
+-              free_page((unsigned long)ldt->entries);
+-      kfree(ldt);
++      __free_ldt_struct(ldt);
+ }
+ 
+ /*
+diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
+index 8aa05583bc42..0677bf8d3a42 100644
+--- a/arch/x86/kernel/paravirt_patch_64.c
++++ b/arch/x86/kernel/paravirt_patch_64.c
+@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+ DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+ DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
+ DEF_NATIVE(pv_cpu_ops, clts, "clts");
+ DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+ 
+@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+               PATCH_SITE(pv_mmu_ops, read_cr3);
+               PATCH_SITE(pv_mmu_ops, write_cr3);
+               PATCH_SITE(pv_cpu_ops, clts);
+-              PATCH_SITE(pv_mmu_ops, flush_tlb_single);
+               PATCH_SITE(pv_cpu_ops, wbinvd);
+ #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+               case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
+index 9f7c21c22477..7c5c5dc90ffa 100644
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -39,7 +39,7 @@
+  * section. Since TSS's are completely CPU-local, we want them
+  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+  */
+-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
+       .x86_tss = {
+               .sp0 = TOP_OF_INIT_STACK,
+ #ifdef CONFIG_X86_32
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index e67b834279b2..bbaae4cf9e8e 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -112,6 +112,7 @@
+ #include <asm/alternative.h>
+ #include <asm/prom.h>
+ #include <asm/microcode.h>
++#include <asm/kaiser.h>
+ 
+ /*
+  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
+        */
+       init_hypervisor_platform();
+ 
++      /*
++       * This needs to happen right after XENPV is set on xen and
++       * kaiser_enabled is checked below in cleanup_highmap().
++       */
++      kaiser_check_boottime_disable();
++
+       x86_init.resources.probe_roms();
+ 
+       /* after parse_early_param, so could debug it */
+diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
+index 1c113db9ed57..2bb5ee464df3 100644
+--- a/arch/x86/kernel/tracepoint.c
++++ b/arch/x86/kernel/tracepoint.c
+@@ -9,10 +9,12 @@
+ #include <linux/atomic.h>
+ 
+ atomic_t trace_idt_ctr = ATOMIC_INIT(0);
++__aligned(PAGE_SIZE)
+ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
+                               (unsigned long) trace_idt_table };
+ 
+ /* No need to be aligned, but done to keep all IDTs defined the same way. */
++__aligned(PAGE_SIZE)
+ gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+ 
+ static int trace_irq_vector_refcount;
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 796f1ec67469..ccf17dbfea09 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+                       return 1;
+ 
+               /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+-              if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
++              if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
++                  !is_long_mode(vcpu))
+                       return 1;
+       }
+ 
+diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
+index 422db000d727..a744506856b1 100644
+--- a/arch/x86/lib/cmdline.c
++++ b/arch/x86/lib/cmdline.c
+@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
+ 
+       return 0;       /* Buffer overrun */
+ }
++
++/*
++ * Find a non-boolean option (i.e. option=argument). In accordance with
++ * standard Linux practice, if this option is repeated, this returns the
++ * last instance on the command line.
++ *
++ * @cmdline: the cmdline string
++ * @max_cmdline_size: the maximum size of cmdline
++ * @option: option string to look for
++ * @buffer: memory buffer to return the option argument
++ * @bufsize: size of the supplied memory buffer
++ *
++ * Returns the length of the argument (regardless of if it was
++ * truncated to fit in the buffer), or -1 on not found.
++ */
++static int
++__cmdline_find_option(const char *cmdline, int max_cmdline_size,
++                    const char *option, char *buffer, int bufsize)
++{
++      char c;
++      int pos = 0, len = -1;
++      const char *opptr = NULL;
++      char *bufptr = buffer;
++      enum {
++              st_wordstart = 0,       /* Start of word/after whitespace */
++              st_wordcmp,     /* Comparing this word */
++              st_wordskip,    /* Miscompare, skip */
++              st_bufcpy,      /* Copying this to buffer */
++      } state = st_wordstart;
++
++      if (!cmdline)
++              return -1;      /* No command line */
++
++      /*
++       * This 'pos' check ensures we do not overrun
++       * a non-NULL-terminated 'cmdline'
++       */
++      while (pos++ < max_cmdline_size) {
++              c = *(char *)cmdline++;
++              if (!c)
++                      break;
++
++              switch (state) {
++              case st_wordstart:
++                      if (myisspace(c))
++                              break;
++
++                      state = st_wordcmp;
++                      opptr = option;
++                      /* fall through */
++
++              case st_wordcmp:
++                      if ((c == '=') && !*opptr) {
++                              /*
++                               * We matched all the way to the end of the
++                               * option we were looking for, prepare to
++                               * copy the argument.
++                               */
++                              len = 0;
++                              bufptr = buffer;
++                              state = st_bufcpy;
++                              break;
++                      } else if (c == *opptr++) {
++                              /*
++                               * We are currently matching, so continue
++                               * to the next character on the cmdline.
++                               */
++                              break;
++                      }
++                      state = st_wordskip;
++                      /* fall through */
++
++              case st_wordskip:
++                      if (myisspace(c))
++                              state = st_wordstart;
++                      break;
++
++              case st_bufcpy:
++                      if (myisspace(c)) {
++                              state = st_wordstart;
++                      } else {
++                              /*
++                               * Increment len, but don't overrun the
++                               * supplied buffer and leave room for the
++                               * NULL terminator.
++                               */
++                              if (++len < bufsize)
++                                      *bufptr++ = c;
++                      }
++                      break;
++              }
++      }
++
++      if (bufsize)
++              *bufptr = '\0';
++
++      return len;
++}
++
++int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
++                      int bufsize)
++{
++      return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
++                                   buffer, bufsize);
++}
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
+index 1ae7c141f778..61e6cead9c4a 100644
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA)              += srat.o
+ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
+ 
+ obj-$(CONFIG_X86_INTEL_MPX)   += mpx.o
++obj-$(CONFIG_PAGE_TABLE_ISOLATION)            += kaiser.o
+diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
+index ed4b372860e4..2bd45ae91eb3 100644
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
+               cr4_set_bits_and_update_boot(X86_CR4_PSE);
+ 
+       /* Enable PGE if available */
+-      if (cpu_has_pge) {
++      if (cpu_has_pge && !kaiser_enabled) {
+               cr4_set_bits_and_update_boot(X86_CR4_PGE);
+               __supported_pte_mask |= _PAGE_GLOBAL;
+       } else
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
+index ec081fe0ce2c..d76ec9348cff 100644
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
+                       continue;
+               if (vaddr < (unsigned long) _text || vaddr > end)
+                       set_pmd(pmd, __pmd(0));
++              else if (kaiser_enabled) {
++                      /*
++                       * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
++                       * clear that now.  This is not important, so long as
++                       * CR4.PGE remains clear, but it removes an anomaly.
++                       * Physical mapping setup below avoids _PAGE_GLOBAL
++                       * by use of massage_pgprot() inside pfn_pte() etc.
++                       */
++                      set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
++              }
+       }
+ }
+ 
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
+new file mode 100644
+index 000000000000..b0b3a69f1c7f
+--- /dev/null
++++ b/arch/x86/mm/kaiser.c
+@@ -0,0 +1,456 @@
++#include <linux/bug.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/string.h>
++#include <linux/types.h>
++#include <linux/bug.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <linux/mm.h>
++#include <linux/uaccess.h>
++#include <linux/ftrace.h>
++
++#undef pr_fmt
++#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
++
++#include <asm/kaiser.h>
++#include <asm/tlbflush.h>     /* to verify its kaiser declarations */
++#include <asm/pgtable.h>
++#include <asm/pgalloc.h>
++#include <asm/desc.h>
++#include <asm/cmdline.h>
++
++int kaiser_enabled __read_mostly = 1;
++EXPORT_SYMBOL(kaiser_enabled);        /* for inlined TLB flush functions */
++
++__visible
++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++
++/*
++ * These can have bit 63 set, so we can not just use a plain "or"
++ * instruction to get their value or'd into CR3.  It would take
++ * another register.  So, we use a memory reference to these instead.
++ *
++ * This is also handy because systems that do not support PCIDs
++ * just end up or'ing a 0 into their CR3, which does no harm.
++ */
++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
++
++/*
++ * At runtime, the only things we map are some things for CPU
++ * hotplug, and stacks for new processes.  No two CPUs will ever
++ * be populating the same addresses, so we only need to ensure
++ * that we protect between two CPUs trying to allocate and
++ * populate the same page table page.
++ *
++ * Only take this lock when doing a set_p[4um]d(), but it is not
++ * needed for doing a set_pte().  We assume that only the *owner*
++ * of a given allocation will be doing this for _their_
++ * allocation.
++ *
++ * This ensures that once a system has been running for a while
++ * and there have been stacks all over and these page tables
++ * are fully populated, there will be no further acquisitions of
++ * this lock.
++ */
++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
++
++/*
++ * Returns -1 on error.
++ */
++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
++{
++      pgd_t *pgd;
++      pud_t *pud;
++      pmd_t *pmd;
++      pte_t *pte;
++
++      pgd = pgd_offset_k(vaddr);
++      /*
++       * We made all the kernel PGDs present in kaiser_init().
++       * We expect them to stay that way.
++       */
++      BUG_ON(pgd_none(*pgd));
++      /*
++       * PGDs are either 512GB or 128TB on all x86_64
++       * configurations.  We don't handle these.
++       */
++      BUG_ON(pgd_large(*pgd));
++
++      pud = pud_offset(pgd, vaddr);
++      if (pud_none(*pud)) {
++              WARN_ON_ONCE(1);
++              return -1;
++      }
++
++      if (pud_large(*pud))
++              return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
++
++      pmd = pmd_offset(pud, vaddr);
++      if (pmd_none(*pmd)) {
++              WARN_ON_ONCE(1);
++              return -1;
++      }
++
++      if (pmd_large(*pmd))
++              return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
++
++      pte = pte_offset_kernel(pmd, vaddr);
++      if (pte_none(*pte)) {
++              WARN_ON_ONCE(1);
++              return -1;
++      }
++
++      return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
++}
++
++/*
++ * This is a relatively normal page table walk, except that it
++ * also tries to allocate page tables pages along the way.
++ *
++ * Returns a pointer to a PTE on success, or NULL on failure.
++ */
++static pte_t *kaiser_pagetable_walk(unsigned long address)
++{
++      pmd_t *pmd;
++      pud_t *pud;
++      pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
++      gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
++
++      if (pgd_none(*pgd)) {
++              WARN_ONCE(1, "All shadow pgds should have been populated");
++              return NULL;
++      }
++      BUILD_BUG_ON(pgd_large(*pgd) != 0);
++
++      pud = pud_offset(pgd, address);
++      /* The shadow page tables do not use large mappings: */
++      if (pud_large(*pud)) {
++              WARN_ON(1);
++              return NULL;
++      }
++      if (pud_none(*pud)) {
++              unsigned long new_pmd_page = __get_free_page(gfp);
++              if (!new_pmd_page)
++                      return NULL;
++              spin_lock(&shadow_table_allocation_lock);
++              if (pud_none(*pud)) {
++                      set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++                      __inc_zone_page_state(virt_to_page((void *)
++                                              new_pmd_page), NR_KAISERTABLE);
++              } else
++                      free_page(new_pmd_page);
++              spin_unlock(&shadow_table_allocation_lock);
++      }
++
++      pmd = pmd_offset(pud, address);
++      /* The shadow page tables do not use large mappings: */
++      if (pmd_large(*pmd)) {
++              WARN_ON(1);
++              return NULL;
++      }
++      if (pmd_none(*pmd)) {
++              unsigned long new_pte_page = __get_free_page(gfp);
++              if (!new_pte_page)
++                      return NULL;
++              spin_lock(&shadow_table_allocation_lock);
++              if (pmd_none(*pmd)) {
++                      set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++                      __inc_zone_page_state(virt_to_page((void *)
++                                              new_pte_page), NR_KAISERTABLE);
++              } else
++                      free_page(new_pte_page);
++              spin_unlock(&shadow_table_allocation_lock);
++      }
++
++      return pte_offset_kernel(pmd, address);
++}
++
++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++                             unsigned long flags)
++{
++      int ret = 0;
++      pte_t *pte;
++      unsigned long start_addr = (unsigned long )__start_addr;
++      unsigned long address = start_addr & PAGE_MASK;
++      unsigned long end_addr = PAGE_ALIGN(start_addr + size);
++      unsigned long target_address;
++
++      /*
++       * It is convenient for callers to pass in __PAGE_KERNEL etc,
++       * and there is no actual harm from setting _PAGE_GLOBAL, so
++       * long as CR4.PGE is not set.  But it is nonetheless troubling
++       * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
++       * requires that not to be #defined to 0): so mask it off here.
++       */
++      flags &= ~_PAGE_GLOBAL;
++
++      for (; address < end_addr; address += PAGE_SIZE) {
++              target_address = get_pa_from_mapping(address);
++              if (target_address == -1) {
++                      ret = -EIO;
++                      break;
++              }
++              pte = kaiser_pagetable_walk(address);
++              if (!pte) {
++                      ret = -ENOMEM;
++                      break;
++              }
++              if (pte_none(*pte)) {
++                      set_pte(pte, __pte(flags | target_address));
++              } else {
++                      pte_t tmp;
++                      set_pte(&tmp, __pte(flags | target_address));
++                      WARN_ON_ONCE(!pte_same(*pte, tmp));
++              }
++      }
++      return ret;
++}
++
++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
++{
++      unsigned long size = end - start;
++
++      return kaiser_add_user_map(start, size, flags);
++}
++
++/*
++ * Ensure that the top level of the (shadow) page tables are
++ * entirely populated.  This ensures that all processes that get
++ * forked have the same entries.  This way, we do not have to
++ * ever go set up new entries in older processes.
++ *
++ * Note: we never free these, so there are no updates to them
++ * after this.
++ */
++static void __init kaiser_init_all_pgds(void)
++{
++      pgd_t *pgd;
++      int i = 0;
++
++      pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
++      for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
++              pgd_t new_pgd;
++              pud_t *pud = pud_alloc_one(&init_mm,
++                                         PAGE_OFFSET + i * PGDIR_SIZE);
++              if (!pud) {
++                      WARN_ON(1);
++                      break;
++              }
++              inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
++              new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
++              /*
++               * Make sure not to stomp on some other pgd entry.
++               */
++              if (!pgd_none(pgd[i])) {
++                      WARN_ON(1);
++                      continue;
++              }
++              set_pgd(pgd + i, new_pgd);
++      }
++}
++
++#define kaiser_add_user_map_early(start, size, flags) do {    \
++      int __ret = kaiser_add_user_map(start, size, flags);    \
++      WARN_ON(__ret);                                         \
++} while (0)
++
++#define kaiser_add_user_map_ptrs_early(start, end, flags) do {                \
++      int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
++      WARN_ON(__ret);                                                 \
++} while (0)
++
++void __init kaiser_check_boottime_disable(void)
++{
++      bool enable = true;
++      char arg[5];
++      int ret;
++
++      if (boot_cpu_has(X86_FEATURE_XENPV))
++              goto silent_disable;
++
++      ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
++      if (ret > 0) {
++              if (!strncmp(arg, "on", 2))
++                      goto enable;
++
++              if (!strncmp(arg, "off", 3))
++                      goto disable;
++
++              if (!strncmp(arg, "auto", 4))
++                      goto skip;
++      }
++
++      if (cmdline_find_option_bool(boot_command_line, "nopti"))
++              goto disable;
++
++skip:
++      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
++              goto disable;
++
++enable:
++      if (enable)
++              setup_force_cpu_cap(X86_FEATURE_KAISER);
++
++      return;
++
++disable:
++      pr_info("disabled\n");
++
++silent_disable:
++      kaiser_enabled = 0;
++      setup_clear_cpu_cap(X86_FEATURE_KAISER);
++}
++
++/*
++ * If anything in here fails, we will likely die on one of the
++ * first kernel->user transitions and init will die.  But, we
++ * will have most of the kernel up by then and should be able to
++ * get a clean warning out of it.  If we BUG_ON() here, we run
++ * the risk of being before we have good console output.
++ */
++void __init kaiser_init(void)
++{
++      int cpu;
++
++      if (!kaiser_enabled)
++              return;
++
++      kaiser_init_all_pgds();
++
++      for_each_possible_cpu(cpu) {
++              void *percpu_vaddr = __per_cpu_user_mapped_start +
++                                   per_cpu_offset(cpu);
++              unsigned long percpu_sz = __per_cpu_user_mapped_end -
++                                        __per_cpu_user_mapped_start;
++              kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
++                                        __PAGE_KERNEL);
++      }
++
++      /*
++       * Map the entry/exit text section, which is needed at
++       * switches from user to and from kernel.
++       */
++      kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
++                                     __PAGE_KERNEL_RX);
++
++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
++      kaiser_add_user_map_ptrs_early(__irqentry_text_start,
++                                     __irqentry_text_end,
++                                     __PAGE_KERNEL_RX);
++#endif
++      kaiser_add_user_map_early((void *)idt_descr.address,
++                                sizeof(gate_desc) * NR_VECTORS,
++                                __PAGE_KERNEL_RO);
++#ifdef CONFIG_TRACING
++      kaiser_add_user_map_early(&trace_idt_descr,
++                                sizeof(trace_idt_descr),
++                                __PAGE_KERNEL);
++      kaiser_add_user_map_early(&trace_idt_table,
++                                sizeof(gate_desc) * NR_VECTORS,
++                                __PAGE_KERNEL);
++#endif
++      kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
++                                __PAGE_KERNEL);
++      kaiser_add_user_map_early(&debug_idt_table,
++                                sizeof(gate_desc) * NR_VECTORS,
++                                __PAGE_KERNEL);
++
++      pr_info("enabled\n");
++}
++
++/* Add a mapping to the shadow mapping, and synchronize the mappings */
++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
++{
++      if (!kaiser_enabled)
++              return 0;
++      return kaiser_add_user_map((const void *)addr, size, flags);
++}
++
++void kaiser_remove_mapping(unsigned long start, unsigned long size)
++{
++      extern void unmap_pud_range_nofree(pgd_t *pgd,
++                              unsigned long start, unsigned long end);
++      unsigned long end = start + size;
++      unsigned long addr, next;
++      pgd_t *pgd;
++
++      if (!kaiser_enabled)
++              return;
++      pgd = native_get_shadow_pgd(pgd_offset_k(start));
++      for (addr = start; addr < end; pgd++, addr = next) {
++              next = pgd_addr_end(addr, end);
++              unmap_pud_range_nofree(pgd, addr, next);
++      }
++}
++
++/*
++ * Page table pages are page-aligned.  The lower half of the top
++ * level is used for userspace and the top half for the kernel.
++ * This returns true for user pages that need to get copied into
++ * both the user and kernel copies of the page tables, and false
++ * for kernel pages that should only be in the kernel copy.
++ */
++static inline bool is_userspace_pgd(pgd_t *pgdp)
++{
++      return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
++}
++
++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
++{
++      if (!kaiser_enabled)
++              return pgd;
++      /*
++       * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
++       * skip cases like kexec and EFI which make temporary low mappings.
++       */
++      if (pgd.pgd & _PAGE_USER) {
++              if (is_userspace_pgd(pgdp)) {
++                      native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++                      /*
++                       * Even if the entry is *mapping* userspace, ensure
++                       * that userspace can not use it.  This way, if we
++                       * get out to userspace running on the kernel CR3,
++                       * userspace will crash instead of running.
++                       */
++                      if (__supported_pte_mask & _PAGE_NX)
++                              pgd.pgd |= _PAGE_NX;
++              }
++      } else if (!pgd.pgd) {
++              /*
++               * pgd_clear() cannot check _PAGE_USER, and is even used to
++               * clear corrupted pgd entries: so just rely on cases like
++               * kexec and EFI never to be using pgd_clear().
++               */
++              if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
++                  is_userspace_pgd(pgdp))
++                      native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++      }
++      return pgd;
++}
++
++void kaiser_setup_pcid(void)
++{
++      unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
++
++      if (this_cpu_has(X86_FEATURE_PCID))
++              user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
++      /*
++       * These variables are used by the entry/exit
++       * code to change PCID and pgd and TLB flushing.
++       */
++      this_cpu_write(x86_cr3_pcid_user, user_cr3);
++}
++
++/*
++ * Make a note that this cpu will need to flush USER tlb on return to user.
++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
++ */
++void kaiser_flush_tlb_on_return_to_user(void)
++{
++      if (this_cpu_has(X86_FEATURE_PCID))
++              this_cpu_write(x86_cr3_pcid_user,
++                      X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
++}
++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
+index 4e5ac46adc9d..81ec7c02f968 100644
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -121,11 +121,16 @@ void __init kasan_init(void)
+       kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+                       (void *)KASAN_SHADOW_END);
+ 
+-      memset(kasan_zero_page, 0, PAGE_SIZE);
+-
+       load_cr3(init_level4_pgt);
+       __flush_tlb_all();
+-      init_task.kasan_depth = 0;
+ 
++      /*
++       * kasan_zero_page has been used as early shadow memory, thus it may
++       * contain some garbage. Now we can clear it, since after the TLB flush
++       * no one should write to it.
++       */
++      memset(kasan_zero_page, 0, PAGE_SIZE);
++
++      init_task.kasan_depth = 0;
+       pr_info("KernelAddressSanitizer initialized\n");
+ }
+diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
+index b599a780a5a9..79377e2a7bcd 100644
+--- a/arch/x86/mm/pageattr.c
++++ b/arch/x86/mm/pageattr.c
+@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
+ #define CPA_FLUSHTLB 1
+ #define CPA_ARRAY 2
+ #define CPA_PAGES_ARRAY 4
++#define CPA_FREE_PAGETABLES 8
+ 
+ #ifdef CONFIG_PROC_FS
+ static unsigned long direct_pages_count[PG_LEVEL_NUM];
+@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+       return 0;
+ }
+ 
+-static bool try_to_free_pte_page(pte_t *pte)
++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
+ {
+       int i;
+ 
++      if (!(cpa->flags & CPA_FREE_PAGETABLES))
++              return false;
++
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               if (!pte_none(pte[i]))
+                       return false;
+@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
+       return true;
+ }
+ 
+-static bool try_to_free_pmd_page(pmd_t *pmd)
++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
+ {
+       int i;
+ 
++      if (!(cpa->flags & CPA_FREE_PAGETABLES))
++              return false;
++
+       for (i = 0; i < PTRS_PER_PMD; i++)
+               if (!pmd_none(pmd[i]))
+                       return false;
+@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
+       return true;
+ }
+ 
+-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
++                          unsigned long start,
++                          unsigned long end)
+ {
+       pte_t *pte = pte_offset_kernel(pmd, start);
+ 
+@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+               pte++;
+       }
+ 
+-      if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
++      if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
+               pmd_clear(pmd);
+               return true;
+       }
+       return false;
+ }
+ 
+-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
+                             unsigned long start, unsigned long end)
+ {
+-      if (unmap_pte_range(pmd, start, end))
+-              if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++      if (unmap_pte_range(cpa, pmd, start, end))
++              if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+                       pud_clear(pud);
+ }
+ 
+-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
++                          unsigned long start, unsigned long end)
+ {
+       pmd_t *pmd = pmd_offset(pud, start);
+ 
+@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+               unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+               unsigned long pre_end = min_t(unsigned long, end, next_page);
+ 
+-              __unmap_pmd_range(pud, pmd, start, pre_end);
++              __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
+ 
+               start = pre_end;
+               pmd++;
+@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+               if (pmd_large(*pmd))
+                       pmd_clear(pmd);
+               else
+-                      __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
++                      __unmap_pmd_range(cpa, pud, pmd,
++                                        start, start + PMD_SIZE);
+ 
+               start += PMD_SIZE;
+               pmd++;
+@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+        * 4K leftovers?
+        */
+       if (start < end)
+-              return __unmap_pmd_range(pud, pmd, start, end);
++              return __unmap_pmd_range(cpa, pud, pmd, start, end);
+ 
+       /*
+        * Try again to free the PMD page if haven't succeeded above.
+        */
+       if (!pud_none(*pud))
+-              if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++              if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+                       pud_clear(pud);
+ }
+ 
+-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
++                            unsigned long start,
++                            unsigned long end)
+ {
+       pud_t *pud = pud_offset(pgd, start);
+ 
+@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+               unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+               unsigned long pre_end   = min_t(unsigned long, end, next_page);
+ 
+-              unmap_pmd_range(pud, start, pre_end);
++              unmap_pmd_range(cpa, pud, start, pre_end);
+ 
+               start = pre_end;
+               pud++;
+@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+               if (pud_large(*pud))
+                       pud_clear(pud);
+               else
+-                      unmap_pmd_range(pud, start, start + PUD_SIZE);
++                      unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
+ 
+               start += PUD_SIZE;
+               pud++;
+@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+        * 2M leftovers?
+        */
+       if (start < end)
+-              unmap_pmd_range(pud, start, end);
++              unmap_pmd_range(cpa, pud, start, end);
+ 
+       /*
+        * No need to try to free the PUD page because we'll free it in
+@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+        */
+ }
+ 
++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++      struct cpa_data cpa = {
++              .flags = CPA_FREE_PAGETABLES,
++      };
++
++      __unmap_pud_range(&cpa, pgd, start, end);
++}
++
++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++      struct cpa_data cpa = {
++              .flags = 0,
++      };
++
++      __unmap_pud_range(&cpa, pgd, start, end);
++}
++
+ static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+ {
+       pgd_t *pgd_entry = root + pgd_index(addr);
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index fb0a9dd1d6e4..dbc27a2b4ad5 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -6,7 +6,7 @@
+ #include <asm/fixmap.h>
+ #include <asm/mtrr.h>
+ 
+-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
++#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+ 
+ #ifdef CONFIG_HIGHPTE
+ #define PGALLOC_USER_GFP __GFP_HIGHMEM
+@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
+               kmem_cache_free(pgd_cache, pgd);
+ }
+ #else
++
++/*
++ * Instead of one pgd, Kaiser acquires two pgds.  Being order-1, it is
++ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
++ * in a pointer to swap between the two 4k halves.
++ */
++#define PGD_ALLOCATION_ORDER  kaiser_enabled
++
+ static inline pgd_t *_pgd_alloc(void)
+ {
+-      return (pgd_t *)__get_free_page(PGALLOC_GFP);
++      /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
++      return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
++                                       PGD_ALLOCATION_ORDER);
+ }
+ 
+ static inline void _pgd_free(pgd_t *pgd)
+ {
+-      free_page((unsigned long)pgd);
++      free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ }
+ #endif /* CONFIG_X86_PAE */
+ 
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 7a4cdb632508..7cad01af6dcd 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -6,13 +6,14 @@
+ #include <linux/interrupt.h>
+ #include <linux/module.h>
+ #include <linux/cpu.h>
++#include <linux/debugfs.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/mmu_context.h>
+ #include <asm/cache.h>
+ #include <asm/apic.h>
+ #include <asm/uv/uv.h>
+-#include <linux/debugfs.h>
++#include <asm/kaiser.h>
+ 
+ /*
+  *    TLB flushing, formerly SMP-only
+@@ -34,6 +35,36 @@ struct flush_tlb_info {
+       unsigned long flush_end;
+ };
+ 
++static void load_new_mm_cr3(pgd_t *pgdir)
++{
++      unsigned long new_mm_cr3 = __pa(pgdir);
++
++      if (kaiser_enabled) {
++              /*
++               * We reuse the same PCID for different tasks, so we must
++               * flush all the entries for the PCID out when we change tasks.
++               * Flush KERN below, flush USER when returning to userspace in
++               * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
++               *
++               * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
++               * do it here, but can only be used if X86_FEATURE_INVPCID is
++               * available - and many machines support pcid without invpcid.
++               *
++               * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
++               * would be needed in the write_cr3() below - if PCIDs enabled.
++               */
++              BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
++              kaiser_flush_tlb_on_return_to_user();
++      }
++
++      /*
++       * Caution: many callers of this function expect
++       * that load_cr3() is serializing and orders TLB
++       * fills with respect to the mm_cpumask writes.
++       */
++      write_cr3(new_mm_cr3);
++}
++
+ /*
+  * We cannot call mmdrop() because we are in interrupt context,
+  * instead update mm->cpu_vm_mask.
+@@ -45,7 +76,7 @@ void leave_mm(int cpu)
+               BUG();
+       if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+               cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+-              load_cr3(swapper_pg_dir);
++              load_new_mm_cr3(swapper_pg_dir);
+               /*
+                * This gets called in the idle path where RCU
+                * functions differently.  Tracing normally
+@@ -105,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+                * ordering guarantee we need.
+                *
+                */
+-              load_cr3(next->pgd);
++              load_new_mm_cr3(next->pgd);
+ 
+               trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ 
+@@ -152,7 +183,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+                        * As above, load_cr3() is serializing and orders TLB
+                        * fills with respect to the mm_cpumask write.
+                        */
+-                      load_cr3(next->pgd);
++                      load_new_mm_cr3(next->pgd);
+                       trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+                       load_mm_cr4(next);
+                       load_mm_ldt(next);
+diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
+index ef2e8c97e183..a461b6604fd9 100644
+--- a/include/asm-generic/vmlinux.lds.h
++++ b/include/asm-generic/vmlinux.lds.h
+@@ -725,7 +725,14 @@
+  */
+ #define PERCPU_INPUT(cacheline)                                               \
+       VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
++      VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;                \
+       *(.data..percpu..first)                                         \
++      . = ALIGN(cacheline);                                           \
++      *(.data..percpu..user_mapped)                                   \
++      *(.data..percpu..user_mapped..shared_aligned)                   \
++      . = ALIGN(PAGE_SIZE);                                           \
++      *(.data..percpu..user_mapped..page_aligned)                     \
++      VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;                  \
+       . = ALIGN(PAGE_SIZE);                                           \
+       *(.data..percpu..page_aligned)                                  \
+       . = ALIGN(cacheline);                                           \
+diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
+new file mode 100644
+index 000000000000..58c55b1589d0
+--- /dev/null
++++ b/include/linux/kaiser.h
+@@ -0,0 +1,52 @@
++#ifndef _LINUX_KAISER_H
++#define _LINUX_KAISER_H
++
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++#include <asm/kaiser.h>
++
++static inline int kaiser_map_thread_stack(void *stack)
++{
++      /*
++       * Map that page of kernel stack on which we enter from user context.
++       */
++      return kaiser_add_mapping((unsigned long)stack +
++                      THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
++}
++
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++      /*
++       * Note: may be called even when kaiser_map_thread_stack() failed.
++       */
++      kaiser_remove_mapping((unsigned long)stack +
++                      THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
++}
++#else
++
++/*
++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
++ * includes architectures that support KAISER, but have it disabled.
++ */
++
++static inline void kaiser_init(void)
++{
++}
++static inline int kaiser_add_mapping(unsigned long addr,
++                                   unsigned long size, unsigned long flags)
++{
++      return 0;
++}
++static inline void kaiser_remove_mapping(unsigned long start,
++                                       unsigned long size)
++{
++}
++static inline int kaiser_map_thread_stack(void *stack)
++{
++      return 0;
++}
++static inline void kaiser_unmap_thread_stack(void *stack)
++{
++}
++
++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
++#endif /* _LINUX_KAISER_H */
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index ff88d6189411..b93b578cfa42 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -131,8 +131,9 @@ enum zone_stat_item {
+       NR_SLAB_RECLAIMABLE,
+       NR_SLAB_UNRECLAIMABLE,
+       NR_PAGETABLE,           /* used for pagetables */
+-      NR_KERNEL_STACK,
+       /* Second 128 byte cacheline */
++      NR_KERNEL_STACK,
++      NR_KAISERTABLE,
+       NR_UNSTABLE_NFS,        /* NFS unstable pages */
+       NR_BOUNCE,
+       NR_VMSCAN_WRITE,
+diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
+index 8f16299ca068..8902f23bb770 100644
+--- a/include/linux/percpu-defs.h
++++ b/include/linux/percpu-defs.h
+@@ -35,6 +35,12 @@
+ 
+ #endif
+ 
++#ifdef CONFIG_PAGE_TABLE_ISOLATION
++#define USER_MAPPED_SECTION "..user_mapped"
++#else
++#define USER_MAPPED_SECTION ""
++#endif
++
+ /*
+  * Base implementations of per-CPU variable declarations and definitions, where
+  * the section in which the variable is to be placed is provided by the
+@@ -115,6 +121,12 @@
+ #define DEFINE_PER_CPU(type, name)                                    \
+       DEFINE_PER_CPU_SECTION(type, name, "")
+ 
++#define DECLARE_PER_CPU_USER_MAPPED(type, name)                               \
++      DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
++#define DEFINE_PER_CPU_USER_MAPPED(type, name)                                \
++      DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
++
+ /*
+  * Declaration/definition used for per-CPU variables that must come first in
+  * the set of variables.
+@@ -144,6 +156,14 @@
+       DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+ 
++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                \
++      DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++      ____cacheline_aligned_in_smp
++
++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)         \
++      DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
++      ____cacheline_aligned_in_smp
++
+ #define DECLARE_PER_CPU_ALIGNED(type, name)                           \
+       DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
+       ____cacheline_aligned
+@@ -162,11 +182,21 @@
+ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                               \
+       DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
+       __aligned(PAGE_SIZE)
++/*
++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
++ */
++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)          \
++      DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++      __aligned(PAGE_SIZE)
++
++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)           \
++      DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
++      __aligned(PAGE_SIZE)
+ 
+ /*
+  * Declaration/definition used for per-CPU variables that must be read mostly.
+  */
+-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                       \
++#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                               \
+       DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
+ 
+ #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                                \
+diff --git a/init/main.c b/init/main.c
+index 9e64d7097f1a..49926d95442f 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -81,6 +81,7 @@
+ #include <linux/integrity.h>
+ #include <linux/proc_ns.h>
+ #include <linux/io.h>
++#include <linux/kaiser.h>
+ 
+ #include <asm/io.h>
+ #include <asm/bugs.h>
+@@ -492,6 +493,7 @@ static void __init mm_init(void)
+       pgtable_init();
+       vmalloc_init();
+       ioremap_huge_init();
++      kaiser_init();
+ }
+ 
+ asmlinkage __visible void __init start_kernel(void)
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 68cfda1c1800..ac00f14208b7 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -58,6 +58,7 @@
+ #include <linux/tsacct_kern.h>
+ #include <linux/cn_proc.h>
+ #include <linux/freezer.h>
++#include <linux/kaiser.h>
+ #include <linux/delayacct.h>
+ #include <linux/taskstats_kern.h>
+ #include <linux/random.h>
+@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ 
+ static inline void free_thread_info(struct thread_info *ti)
+ {
++      kaiser_unmap_thread_stack(ti);
+       free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ }
+ # else
+@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
+               goto free_ti;
+ 
+       tsk->stack = ti;
++
++      err = kaiser_map_thread_stack(tsk->stack);
++      if (err)
++              goto free_ti;
+ #ifdef CONFIG_SECCOMP
+       /*
+        * We must handle setting up seccomp filters once we're under
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index c344e3609c53..324b7e90b4c5 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -736,6 +736,7 @@ const char * const vmstat_text[] = {
+       "nr_slab_unreclaimable",
+       "nr_page_table_pages",
+       "nr_kernel_stack",
++      "nr_overhead",
+       "nr_unstable",
+       "nr_bounce",
+       "nr_vmscan_write",
+diff --git a/security/Kconfig b/security/Kconfig
+index e45237897b43..a3ebb6ee5bd5 100644
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -31,6 +31,16 @@ config SECURITY
+ 
+         If you are unsure how to answer this question, answer N.
+ 
++config PAGE_TABLE_ISOLATION
++      bool "Remove the kernel mapping in user mode"
++      default y
++      depends on X86_64 && SMP
++      help
++        This enforces a strict kernel and user space isolation, in order
++        to close hardware side channels on kernel address information.
++
++        If you are unsure how to answer this question, answer Y.
++
+ config SECURITYFS
+       bool "Enable the securityfs filesystem"
+       help

Reply via email to