Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+8

Documentation/admin-guide/kernel-parameters.txt

··· 2708 2708 steal time is computed, but won't influence scheduler 2709 2709 behaviour 2710 2710 2711 + nopti [X86-64] Disable kernel page table isolation 2712 + 2711 2713 nolapic [X86-32,APIC] Do not enable or use the local APIC. 2712 2714 2713 2715 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. ··· 3283 3281 3284 3282 pt. [PARIDE] 3285 3283 See Documentation/blockdev/paride.txt. 3284 + 3285 + pti= [X86-64] 3286 + Control user/kernel address space isolation: 3287 + on - enable 3288 + off - disable 3289 + auto - default setting 3286 3290 3287 3291 pty.legacy_count= 3288 3292 [KNL] Number of legacy pty's. Overwrites compiled-in

+3 -2

Documentation/x86/x86_64/mm.txt

··· 12 12 ... unused hole ... 13 13 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) 14 14 ... unused hole ... 15 + fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI 15 16 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping 16 17 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks 17 18 ... unused hole ... ··· 30 29 hole caused by [56:63] sign extension 31 30 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor 32 31 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory 33 - ff90000000000000 - ff91ffffffffffff (=49 bits) hole 34 - ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space 32 + ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI 33 + ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) 35 34 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole 36 35 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) 37 36 ... unused hole ...

+3

arch/x86/boot/compressed/pagetable.c

··· 23 23 */ 24 24 #undef CONFIG_AMD_MEM_ENCRYPT 25 25 26 + /* No PAGE_TABLE_ISOLATION support needed either: */ 27 + #undef CONFIG_PAGE_TABLE_ISOLATION 28 + 26 29 #include "misc.h" 27 30 28 31 /* These actually do the work of building the kernel identity maps. */

+145

arch/x86/entry/calling.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #include <linux/jump_label.h> 3 3 #include <asm/unwind_hints.h> 4 + #include <asm/cpufeatures.h> 5 + #include <asm/page_types.h> 6 + #include <asm/percpu.h> 7 + #include <asm/asm-offsets.h> 8 + #include <asm/processor-flags.h> 4 9 5 10 /* 6 11 ··· 191 186 orq $0x1, %rbp 192 187 #endif 193 188 .endm 189 + 190 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 191 + 192 + /* 193 + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two 194 + * halves: 195 + */ 196 + #define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT) 197 + #define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT)) 198 + 199 + .macro SET_NOFLUSH_BIT reg:req 200 + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg 201 + .endm 202 + 203 + .macro ADJUST_KERNEL_CR3 reg:req 204 + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID 205 + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ 206 + andq $(~PTI_SWITCH_MASK), \reg 207 + .endm 208 + 209 + .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req 210 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI 211 + mov %cr3, \scratch_reg 212 + ADJUST_KERNEL_CR3 \scratch_reg 213 + mov \scratch_reg, %cr3 214 + .Lend_\@: 215 + .endm 216 + 217 + #define THIS_CPU_user_pcid_flush_mask \ 218 + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask 219 + 220 + .macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req 221 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI 222 + mov %cr3, \scratch_reg 223 + 224 + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID 225 + 226 + /* 227 + * Test if the ASID needs a flush. 
228 + */ 229 + movq \scratch_reg, \scratch_reg2 230 + andq $(0x7FF), \scratch_reg /* mask ASID */ 231 + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask 232 + jnc .Lnoflush_\@ 233 + 234 + /* Flush needed, clear the bit */ 235 + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask 236 + movq \scratch_reg2, \scratch_reg 237 + jmp .Lwrcr3_\@ 238 + 239 + .Lnoflush_\@: 240 + movq \scratch_reg2, \scratch_reg 241 + SET_NOFLUSH_BIT \scratch_reg 242 + 243 + .Lwrcr3_\@: 244 + /* Flip the PGD and ASID to the user version */ 245 + orq $(PTI_SWITCH_MASK), \scratch_reg 246 + mov \scratch_reg, %cr3 247 + .Lend_\@: 248 + .endm 249 + 250 + .macro SWITCH_TO_USER_CR3_STACK scratch_reg:req 251 + pushq %rax 252 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax 253 + popq %rax 254 + .endm 255 + 256 + .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req 257 + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI 258 + movq %cr3, \scratch_reg 259 + movq \scratch_reg, \save_reg 260 + /* 261 + * Is the "switch mask" all zero? That means that both of 262 + * these are zero: 263 + * 264 + * 1. The user/kernel PCID bit, and 265 + * 2. The user/kernel "bit" that points CR3 to the 266 + * bottom half of the 8k PGD 267 + * 268 + * That indicates a kernel CR3 value, not a user CR3. 269 + */ 270 + testq $(PTI_SWITCH_MASK), \scratch_reg 271 + jz .Ldone_\@ 272 + 273 + ADJUST_KERNEL_CR3 \scratch_reg 274 + movq \scratch_reg, %cr3 275 + 276 + .Ldone_\@: 277 + .endm 278 + 279 + .macro RESTORE_CR3 scratch_reg:req save_reg:req 280 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI 281 + 282 + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID 283 + 284 + /* 285 + * KERNEL pages can always resume with NOFLUSH as we do 286 + * explicit flushes. 287 + */ 288 + bt $X86_CR3_PTI_SWITCH_BIT, \save_reg 289 + jnc .Lnoflush_\@ 290 + 291 + /* 292 + * Check if there's a pending flush for the user ASID we're 293 + * about to set. 
294 + */ 295 + movq \save_reg, \scratch_reg 296 + andq $(0x7FF), \scratch_reg 297 + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask 298 + jnc .Lnoflush_\@ 299 + 300 + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask 301 + jmp .Lwrcr3_\@ 302 + 303 + .Lnoflush_\@: 304 + SET_NOFLUSH_BIT \save_reg 305 + 306 + .Lwrcr3_\@: 307 + /* 308 + * The CR3 write could be avoided when not changing its value, 309 + * but would require a CR3 read *and* a scratch register. 310 + */ 311 + movq \save_reg, %cr3 312 + .Lend_\@: 313 + .endm 314 + 315 + #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ 316 + 317 + .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req 318 + .endm 319 + .macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req 320 + .endm 321 + .macro SWITCH_TO_USER_CR3_STACK scratch_reg:req 322 + .endm 323 + .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req 324 + .endm 325 + .macro RESTORE_CR3 scratch_reg:req save_reg:req 326 + .endm 327 + 328 + #endif 194 329 195 330 #endif /* CONFIG_X86_64 */ 196 331

+41 -7

arch/x86/entry/entry_64.S

··· 23 23 #include <asm/segment.h> 24 24 #include <asm/cache.h> 25 25 #include <asm/errno.h> 26 - #include "calling.h" 27 26 #include <asm/asm-offsets.h> 28 27 #include <asm/msr.h> 29 28 #include <asm/unistd.h> ··· 38 39 #include <asm/export.h> 39 40 #include <asm/frame.h> 40 41 #include <linux/err.h> 42 + 43 + #include "calling.h" 41 44 42 45 .code64 43 46 .section .entry.text, "ax" ··· 169 168 /* Stash the user RSP. */ 170 169 movq %rsp, RSP_SCRATCH 171 170 171 + /* Note: using %rsp as a scratch reg. */ 172 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp 173 + 172 174 /* Load the top of the task stack into RSP */ 173 175 movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp 174 176 ··· 211 207 */ 212 208 213 209 swapgs 210 + /* 211 + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it 212 + * is not required to switch CR3. 213 + */ 214 214 movq %rsp, PER_CPU_VAR(rsp_scratch) 215 215 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 216 216 ··· 411 403 * We are on the trampoline stack. All regs except RDI are live. 412 404 * We can do future final exit work right here. 413 405 */ 406 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi 414 407 415 408 popq %rdi 416 409 popq %rsp ··· 749 740 * We can do future final exit work right here. 750 741 */ 751 742 743 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi 744 + 752 745 /* Restore RDI. */ 753 746 popq %rdi 754 747 SWAPGS ··· 833 822 */ 834 823 835 824 pushq %rdi /* Stash user RDI */ 836 - SWAPGS 825 + SWAPGS /* to kernel GS */ 826 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ 827 + 837 828 movq PER_CPU_VAR(espfix_waddr), %rdi 838 829 movq %rax, (0*8)(%rdi) /* user RAX */ 839 830 movq (1*8)(%rsp), %rax /* user RIP */ ··· 851 838 /* Now RAX == RSP. */ 852 839 853 840 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ 854 - popq %rdi /* Restore user RDI */ 855 841 856 842 /* 857 843 * espfix_stack[31:16] == 0. 
The page tables are set up such that ··· 861 849 * still points to an RO alias of the ESPFIX stack. 862 850 */ 863 851 orq PER_CPU_VAR(espfix_stack), %rax 864 - SWAPGS 852 + 853 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi 854 + SWAPGS /* to user GS */ 855 + popq %rdi /* Restore user RDI */ 856 + 865 857 movq %rax, %rsp 866 858 UNWIND_HINT_IRET_REGS offset=8 867 859 ··· 965 949 UNWIND_HINT_FUNC 966 950 967 951 pushq %rdi 952 + /* Need to switch before accessing the thread stack. */ 953 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi 968 954 movq %rsp, %rdi 969 955 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 970 956 UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI ··· 1268 1250 js 1f /* negative -> in kernel */ 1269 1251 SWAPGS 1270 1252 xorl %ebx, %ebx 1271 - 1: ret 1253 + 1254 + 1: 1255 + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 1256 + 1257 + ret 1272 1258 END(paranoid_entry) 1273 1259 1274 1260 /* ··· 1294 1272 testl %ebx, %ebx /* swapgs needed? */ 1295 1273 jnz .Lparanoid_exit_no_swapgs 1296 1274 TRACE_IRQS_IRETQ 1275 + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 1297 1276 SWAPGS_UNSAFE_STACK 1298 1277 jmp .Lparanoid_exit_restore 1299 1278 .Lparanoid_exit_no_swapgs: ··· 1322 1299 * from user mode due to an IRET fault. 1323 1300 */ 1324 1301 SWAPGS 1302 + /* We have user CR3. Change to kernel CR3. */ 1303 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax 1325 1304 1326 1305 .Lerror_entry_from_usermode_after_swapgs: 1327 1306 /* Put us onto the real thread stack. */ ··· 1370 1345 * .Lgs_change's error handler with kernel gsbase. 1371 1346 */ 1372 1347 SWAPGS 1348 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax 1373 1349 jmp .Lerror_entry_done 1374 1350 1375 1351 .Lbstep_iret: ··· 1380 1354 1381 1355 .Lerror_bad_iret: 1382 1356 /* 1383 - * We came from an IRET to user mode, so we have user gsbase. 1384 - * Switch to kernel gsbase: 1357 + * We came from an IRET to user mode, so we have user 1358 + * gsbase and CR3. 
Switch to kernel gsbase and CR3: 1385 1359 */ 1386 1360 SWAPGS 1361 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax 1387 1362 1388 1363 /* 1389 1364 * Pretend that the exception came from user mode: set up pt_regs ··· 1416 1389 /* 1417 1390 * Runs on exception stack. Xen PV does not go through this path at all, 1418 1391 * so we can use real assembly here. 1392 + * 1393 + * Registers: 1394 + * %r14: Used to save/restore the CR3 of the interrupted context 1395 + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. 1419 1396 */ 1420 1397 ENTRY(nmi) 1421 1398 UNWIND_HINT_IRET_REGS ··· 1483 1452 1484 1453 swapgs 1485 1454 cld 1455 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx 1486 1456 movq %rsp, %rdx 1487 1457 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 1488 1458 UNWIND_HINT_IRET_REGS base=%rdx offset=8 ··· 1735 1703 movq %rsp, %rdi 1736 1704 movq $-1, %rsi 1737 1705 call do_nmi 1706 + 1707 + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 1738 1708 1739 1709 testl %ebx, %ebx /* swapgs needed? */ 1740 1710 jnz nmi_restore

+23 -1

arch/x86/entry/entry_64_compat.S

··· 49 49 ENTRY(entry_SYSENTER_compat) 50 50 /* Interrupts are off on entry. */ 51 51 SWAPGS 52 + 53 + /* We are about to clobber %rsp anyway, clobbering here is OK */ 54 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp 55 + 52 56 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 53 57 54 58 /* ··· 220 216 pushq $0 /* pt_regs->r15 = 0 */ 221 217 222 218 /* 219 + * We just saved %rdi so it is safe to clobber. It is not 220 + * preserved during the C calls inside TRACE_IRQS_OFF anyway. 221 + */ 222 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi 223 + 224 + /* 223 225 * User mode is traced as though IRQs are on, and SYSENTER 224 226 * turned them off. 225 227 */ ··· 266 256 * when the system call started, which is already known to user 267 257 * code. We zero R8-R10 to avoid info leaks. 268 258 */ 259 + movq RSP-ORIG_RAX(%rsp), %rsp 260 + 261 + /* 262 + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored 263 + * on the process stack which is not mapped to userspace and 264 + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 265 + * switch until after the last reference to the process 266 + * stack. 267 + * 268 + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. 269 + */ 270 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 271 + 269 272 xorq %r8, %r8 270 273 xorq %r9, %r9 271 274 xorq %r10, %r10 272 - movq RSP-ORIG_RAX(%rsp), %rsp 273 275 swapgs 274 276 sysretl 275 277 END(entry_SYSCALL_compat)

+3 -3

arch/x86/entry/vsyscall/vsyscall_64.c

··· 344 344 * vsyscalls but leave the page not present. If so, we skip calling 345 345 * this. 346 346 */ 347 - static void __init set_vsyscall_pgtable_user_bits(void) 347 + void __init set_vsyscall_pgtable_user_bits(pgd_t *root) 348 348 { 349 349 pgd_t *pgd; 350 350 p4d_t *p4d; 351 351 pud_t *pud; 352 352 pmd_t *pmd; 353 353 354 - pgd = pgd_offset_k(VSYSCALL_ADDR); 354 + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); 355 355 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); 356 356 p4d = p4d_offset(pgd, VSYSCALL_ADDR); 357 357 #if CONFIG_PGTABLE_LEVELS >= 5 ··· 373 373 vsyscall_mode == NATIVE 374 374 ? PAGE_KERNEL_VSYSCALL 375 375 : PAGE_KERNEL_VVAR); 376 - set_vsyscall_pgtable_user_bits(); 376 + set_vsyscall_pgtable_user_bits(swapper_pg_dir); 377 377 } 378 378 379 379 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=

+83 -47

arch/x86/events/intel/ds.c

··· 3 3 #include <linux/types.h> 4 4 #include <linux/slab.h> 5 5 6 + #include <asm/cpu_entry_area.h> 6 7 #include <asm/perf_event.h> 7 8 #include <asm/insn.h> 8 9 9 10 #include "../perf_event.h" 10 11 12 + /* Waste a full page so it can be mapped into the cpu_entry_area */ 13 + DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); 14 + 11 15 /* The size of a BTS record in bytes: */ 12 16 #define BTS_RECORD_SIZE 24 13 17 14 - #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) 15 - #define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) 16 18 #define PEBS_FIXUP_SIZE PAGE_SIZE 17 19 18 20 /* ··· 281 279 282 280 static DEFINE_PER_CPU(void *, insn_buffer); 283 281 282 + static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) 283 + { 284 + phys_addr_t pa; 285 + size_t msz = 0; 286 + 287 + pa = virt_to_phys(addr); 288 + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) 289 + cea_set_pte(cea, pa, prot); 290 + } 291 + 292 + static void ds_clear_cea(void *cea, size_t size) 293 + { 294 + size_t msz = 0; 295 + 296 + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) 297 + cea_set_pte(cea, 0, PAGE_NONE); 298 + } 299 + 300 + static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) 301 + { 302 + unsigned int order = get_order(size); 303 + int node = cpu_to_node(cpu); 304 + struct page *page; 305 + 306 + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); 307 + return page ? 
page_address(page) : NULL; 308 + } 309 + 310 + static void dsfree_pages(const void *buffer, size_t size) 311 + { 312 + if (buffer) 313 + free_pages((unsigned long)buffer, get_order(size)); 314 + } 315 + 284 316 static int alloc_pebs_buffer(int cpu) 285 317 { 286 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 287 - int node = cpu_to_node(cpu); 288 - int max; 289 - void *buffer, *ibuffer; 318 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 319 + struct debug_store *ds = hwev->ds; 320 + size_t bsiz = x86_pmu.pebs_buffer_size; 321 + int max, node = cpu_to_node(cpu); 322 + void *buffer, *ibuffer, *cea; 290 323 291 324 if (!x86_pmu.pebs) 292 325 return 0; 293 326 294 - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); 327 + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); 295 328 if (unlikely(!buffer)) 296 329 return -ENOMEM; 297 330 ··· 337 300 if (x86_pmu.intel_cap.pebs_format < 2) { 338 301 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); 339 302 if (!ibuffer) { 340 - kfree(buffer); 303 + dsfree_pages(buffer, bsiz); 341 304 return -ENOMEM; 342 305 } 343 306 per_cpu(insn_buffer, cpu) = ibuffer; 344 307 } 345 - 346 - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; 347 - 348 - ds->pebs_buffer_base = (u64)(unsigned long)buffer; 308 + hwev->ds_pebs_vaddr = buffer; 309 + /* Update the cpu entry area mapping */ 310 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; 311 + ds->pebs_buffer_base = (unsigned long) cea; 312 + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); 349 313 ds->pebs_index = ds->pebs_buffer_base; 350 - ds->pebs_absolute_maximum = ds->pebs_buffer_base + 351 - max * x86_pmu.pebs_record_size; 352 - 314 + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); 315 + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; 353 316 return 0; 354 317 } 355 318 356 319 static void release_pebs_buffer(int cpu) 357 320 { 358 - struct debug_store *ds = per_cpu(cpu_hw_events, 
cpu).ds; 321 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 322 + struct debug_store *ds = hwev->ds; 323 + void *cea; 359 324 360 325 if (!ds || !x86_pmu.pebs) 361 326 return; ··· 365 326 kfree(per_cpu(insn_buffer, cpu)); 366 327 per_cpu(insn_buffer, cpu) = NULL; 367 328 368 - kfree((void *)(unsigned long)ds->pebs_buffer_base); 329 + /* Clear the fixmap */ 330 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; 331 + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); 369 332 ds->pebs_buffer_base = 0; 333 + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); 334 + hwev->ds_pebs_vaddr = NULL; 370 335 } 371 336 372 337 static int alloc_bts_buffer(int cpu) 373 338 { 374 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 375 - int node = cpu_to_node(cpu); 376 - int max, thresh; 377 - void *buffer; 339 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 340 + struct debug_store *ds = hwev->ds; 341 + void *buffer, *cea; 342 + int max; 378 343 379 344 if (!x86_pmu.bts) 380 345 return 0; 381 346 382 - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); 347 + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); 383 348 if (unlikely(!buffer)) { 384 349 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); 385 350 return -ENOMEM; 386 351 } 387 - 388 - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; 389 - thresh = max / 16; 390 - 391 - ds->bts_buffer_base = (u64)(unsigned long)buffer; 352 + hwev->ds_bts_vaddr = buffer; 353 + /* Update the fixmap */ 354 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; 355 + ds->bts_buffer_base = (unsigned long) cea; 356 + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); 392 357 ds->bts_index = ds->bts_buffer_base; 393 - ds->bts_absolute_maximum = ds->bts_buffer_base + 394 - max * BTS_RECORD_SIZE; 395 - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - 396 - thresh * BTS_RECORD_SIZE; 397 - 358 + max = BTS_RECORD_SIZE * 
(BTS_BUFFER_SIZE / BTS_RECORD_SIZE); 359 + ds->bts_absolute_maximum = ds->bts_buffer_base + max; 360 + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); 398 361 return 0; 399 362 } 400 363 401 364 static void release_bts_buffer(int cpu) 402 365 { 403 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 366 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 367 + struct debug_store *ds = hwev->ds; 368 + void *cea; 404 369 405 370 if (!ds || !x86_pmu.bts) 406 371 return; 407 372 408 - kfree((void *)(unsigned long)ds->bts_buffer_base); 373 + /* Clear the fixmap */ 374 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; 375 + ds_clear_cea(cea, BTS_BUFFER_SIZE); 409 376 ds->bts_buffer_base = 0; 377 + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); 378 + hwev->ds_bts_vaddr = NULL; 410 379 } 411 380 412 381 static int alloc_ds_buffer(int cpu) 413 382 { 414 - int node = cpu_to_node(cpu); 415 - struct debug_store *ds; 383 + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; 416 384 417 - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); 418 - if (unlikely(!ds)) 419 - return -ENOMEM; 420 - 385 + memset(ds, 0, sizeof(*ds)); 421 386 per_cpu(cpu_hw_events, cpu).ds = ds; 422 - 423 387 return 0; 424 388 } 425 389 426 390 static void release_ds_buffer(int cpu) 427 391 { 428 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 429 - 430 - if (!ds) 431 - return; 432 - 433 392 per_cpu(cpu_hw_events, cpu).ds = NULL; 434 - kfree(ds); 435 393 } 436 394 437 395 void release_ds_buffers(void)

+4 -19

arch/x86/events/perf_event.h

··· 14 14 15 15 #include <linux/perf_event.h> 16 16 17 + #include <asm/intel_ds.h> 18 + 17 19 /* To enable MSR tracing please use the generic trace points. */ 18 20 19 21 /* ··· 79 77 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 80 78 }; 81 79 82 - /* The maximal number of PEBS events: */ 83 - #define MAX_PEBS_EVENTS 8 84 80 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) 85 81 86 82 /* ··· 94 94 PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ 95 95 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ 96 96 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) 97 - 98 - /* 99 - * A debug store configuration. 100 - * 101 - * We only support architectures that use 64bit fields. 102 - */ 103 - struct debug_store { 104 - u64 bts_buffer_base; 105 - u64 bts_index; 106 - u64 bts_absolute_maximum; 107 - u64 bts_interrupt_threshold; 108 - u64 pebs_buffer_base; 109 - u64 pebs_index; 110 - u64 pebs_absolute_maximum; 111 - u64 pebs_interrupt_threshold; 112 - u64 pebs_event_reset[MAX_PEBS_EVENTS]; 113 - }; 114 97 115 98 #define PEBS_REGS \ 116 99 (PERF_REG_X86_AX | \ ··· 199 216 * Intel DebugStore bits 200 217 */ 201 218 struct debug_store *ds; 219 + void *ds_pebs_vaddr; 220 + void *ds_bts_vaddr; 202 221 u64 pebs_enabled; 203 222 int n_pebs; 204 223 int n_large_pebs;

+13

arch/x86/include/asm/cpu_entry_area.h

··· 5 5 6 6 #include <linux/percpu-defs.h> 7 7 #include <asm/processor.h> 8 + #include <asm/intel_ds.h> 8 9 9 10 /* 10 11 * cpu_entry_area is a percpu region that contains things needed by the CPU ··· 40 39 * with guard pages between them. 41 40 */ 42 41 char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; 42 + #endif 43 + #ifdef CONFIG_CPU_SUP_INTEL 44 + /* 45 + * Per CPU debug store for Intel performance monitoring. Wastes a 46 + * full page at the moment. 47 + */ 48 + struct debug_store cpu_debug_store; 49 + /* 50 + * The actual PEBS/BTS buffers must be mapped to user space. 51 + * Reserve enough fixmap PTEs. 52 + */ 53 + struct debug_store_buffers cpu_debug_buffers; 43 54 #endif 44 55 }; 45 56

+3 -1

arch/x86/include/asm/cpufeatures.h

··· 197 197 #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ 198 198 #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ 199 199 #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ 200 + #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ 200 201 201 202 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 202 203 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 203 204 #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ 204 - 205 + #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ 205 206 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 206 207 #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ 207 208 #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ ··· 341 340 #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ 342 341 #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ 343 342 #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ 343 + #define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ 344 344 345 345 #endif /* _ASM_X86_CPUFEATURES_H */

+2

arch/x86/include/asm/desc.h

··· 21 21 22 22 desc->type = (info->read_exec_only ^ 1) << 1; 23 23 desc->type |= info->contents << 2; 24 + /* Set the ACCESS bit so it can be mapped RO */ 25 + desc->type |= 1; 24 26 25 27 desc->s = 1; 26 28 desc->dpl = 0x3;

+7 -1

arch/x86/include/asm/disabled-features.h

··· 50 50 # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) 51 51 #endif 52 52 53 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 54 + # define DISABLE_PTI 0 55 + #else 56 + # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) 57 + #endif 58 + 53 59 /* 54 60 * Make sure to add features to the correct mask 55 61 */ ··· 66 60 #define DISABLED_MASK4 (DISABLE_PCID) 67 61 #define DISABLED_MASK5 0 68 62 #define DISABLED_MASK6 0 69 - #define DISABLED_MASK7 0 63 + #define DISABLED_MASK7 (DISABLE_PTI) 70 64 #define DISABLED_MASK8 0 71 65 #define DISABLED_MASK9 (DISABLE_MPX) 72 66 #define DISABLED_MASK10 0

+36

arch/x86/include/asm/intel_ds.h

··· 1 + #ifndef _ASM_INTEL_DS_H 2 + #define _ASM_INTEL_DS_H 3 + 4 + #include <linux/percpu-defs.h> 5 + 6 + #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) 7 + #define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) 8 + 9 + /* The maximal number of PEBS events: */ 10 + #define MAX_PEBS_EVENTS 8 11 + 12 + /* 13 + * A debug store configuration. 14 + * 15 + * We only support architectures that use 64bit fields. 16 + */ 17 + struct debug_store { 18 + u64 bts_buffer_base; 19 + u64 bts_index; 20 + u64 bts_absolute_maximum; 21 + u64 bts_interrupt_threshold; 22 + u64 pebs_buffer_base; 23 + u64 pebs_index; 24 + u64 pebs_absolute_maximum; 25 + u64 pebs_interrupt_threshold; 26 + u64 pebs_event_reset[MAX_PEBS_EVENTS]; 27 + } __aligned(PAGE_SIZE); 28 + 29 + DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); 30 + 31 + struct debug_store_buffers { 32 + char bts_buffer[BTS_BUFFER_SIZE]; 33 + char pebs_buffer[PEBS_BUFFER_SIZE]; 34 + }; 35 + 36 + #endif

+53 -6

arch/x86/include/asm/mmu_context.h

··· 50 50 * call gates. On native, we could merge the ldt_struct and LDT 51 51 * allocations, but it's not worth trying to optimize. 52 52 */ 53 - struct desc_struct *entries; 54 - unsigned int nr_entries; 53 + struct desc_struct *entries; 54 + unsigned int nr_entries; 55 + 56 + /* 57 + * If PTI is in use, then the entries array is not mapped while we're 58 + * in user mode. The whole array will be aliased at the address 59 + * given by ldt_slot_va(slot). We use two slots so that we can allocate 60 + * and map, and enable a new LDT without invalidating the mapping 61 + * of an older, still-in-use LDT. 62 + * 63 + * slot will be -1 if this LDT doesn't have an alias mapping. 64 + */ 65 + int slot; 55 66 }; 67 + 68 + /* This is a multiple of PAGE_SIZE. */ 69 + #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) 70 + 71 + static inline void *ldt_slot_va(int slot) 72 + { 73 + #ifdef CONFIG_X86_64 74 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); 75 + #else 76 + BUG(); 77 + #endif 78 + } 56 79 57 80 /* 58 81 * Used for LDT copy/destruction. ··· 87 64 } 88 65 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); 89 66 void destroy_context_ldt(struct mm_struct *mm); 67 + void ldt_arch_exit_mmap(struct mm_struct *mm); 90 68 #else /* CONFIG_MODIFY_LDT_SYSCALL */ 91 69 static inline void init_new_context_ldt(struct mm_struct *mm) { } 92 70 static inline int ldt_dup_context(struct mm_struct *oldmm, ··· 95 71 { 96 72 return 0; 97 73 } 98 - static inline void destroy_context_ldt(struct mm_struct *mm) {} 74 + static inline void destroy_context_ldt(struct mm_struct *mm) { } 75 + static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } 99 76 #endif 100 77 101 78 static inline void load_mm_ldt(struct mm_struct *mm) ··· 121 96 * that we can see. 
122 97 */ 123 98 124 - if (unlikely(ldt)) 125 - set_ldt(ldt->entries, ldt->nr_entries); 126 - else 99 + if (unlikely(ldt)) { 100 + if (static_cpu_has(X86_FEATURE_PTI)) { 101 + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { 102 + /* 103 + * Whoops -- either the new LDT isn't mapped 104 + * (if slot == -1) or is mapped into a bogus 105 + * slot (if slot > 1). 106 + */ 107 + clear_LDT(); 108 + return; 109 + } 110 + 111 + /* 112 + * If page table isolation is enabled, ldt->entries 113 + * will not be mapped in the userspace pagetables. 114 + * Tell the CPU to access the LDT through the alias 115 + * at ldt_slot_va(ldt->slot). 116 + */ 117 + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); 118 + } else { 119 + set_ldt(ldt->entries, ldt->nr_entries); 120 + } 121 + } else { 127 122 clear_LDT(); 123 + } 128 124 #else 129 125 clear_LDT(); 130 126 #endif ··· 240 194 static inline void arch_exit_mmap(struct mm_struct *mm) 241 195 { 242 196 paravirt_arch_exit_mmap(mm); 197 + ldt_arch_exit_mmap(mm); 243 198 } 244 199 245 200 #ifdef CONFIG_X86_64

+11

arch/x86/include/asm/pgalloc.h

··· 30 30 */ 31 31 extern gfp_t __userpte_alloc_gfp; 32 32 33 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 34 + /* 35 + * Instead of one PGD, we acquire two PGDs. Being order-1, it is 36 + * both 8k in size and 8k-aligned. That lets us just flip bit 12 37 + * in a pointer to swap between the two 4k halves. 38 + */ 39 + #define PGD_ALLOCATION_ORDER 1 40 + #else 41 + #define PGD_ALLOCATION_ORDER 0 42 + #endif 43 + 33 44 /* 34 45 * Allocate and free page tables. 35 46 */

+26 -4

arch/x86/include/asm/pgtable.h

··· 28 28 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); 29 29 30 30 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 31 + void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); 31 32 void ptdump_walk_pgd_level_checkwx(void); 32 33 33 34 #ifdef CONFIG_DEBUG_WX ··· 842 841 843 842 static inline int p4d_bad(p4d_t p4d) 844 843 { 845 - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; 844 + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; 845 + 846 + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) 847 + ignore_flags |= _PAGE_NX; 848 + 849 + return (p4d_flags(p4d) & ~ignore_flags) != 0; 846 850 } 847 851 #endif /* CONFIG_PGTABLE_LEVELS > 3 */ 848 852 ··· 881 875 882 876 static inline int pgd_bad(pgd_t pgd) 883 877 { 884 - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; 878 + unsigned long ignore_flags = _PAGE_USER; 879 + 880 + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) 881 + ignore_flags |= _PAGE_NX; 882 + 883 + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; 885 884 } 886 885 887 886 static inline int pgd_none(pgd_t pgd) ··· 915 904 * pgd_offset() returns a (pgd_t *) 916 905 * pgd_index() is used get the offset into the pgd page's array of pgd_t's; 917 906 */ 918 - #define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) 907 + #define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) 908 + /* 909 + * a shortcut to get a pgd_t in a given mm 910 + */ 911 + #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) 919 912 /* 920 913 * a shortcut which implies the use of the kernel's pgd, instead 921 914 * of a process's ··· 1121 1106 */ 1122 1107 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) 1123 1108 { 1124 - memcpy(dst, src, count * sizeof(pgd_t)); 1109 + memcpy(dst, src, count * sizeof(pgd_t)); 1110 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 1111 + if (!static_cpu_has(X86_FEATURE_PTI)) 1112 + return; 1113 + /* Clone the user 
space pgd as well */ 1114 + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), 1115 + count * sizeof(pgd_t)); 1116 + #endif 1125 1117 } 1126 1118 1127 1119 #define PTE_SHIFT ilog2(PTRS_PER_PTE)

+92

arch/x86/include/asm/pgtable_64.h

··· 131 131 #endif 132 132 } 133 133 134 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 135 + /* 136 + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages 137 + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and 138 + * the user one is in the last 4k. To switch between them, you 139 + * just need to flip the 12th bit in their addresses. 140 + */ 141 + #define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT 142 + 143 + /* 144 + * This generates better code than the inline assembly in 145 + * __set_bit(). 146 + */ 147 + static inline void *ptr_set_bit(void *ptr, int bit) 148 + { 149 + unsigned long __ptr = (unsigned long)ptr; 150 + 151 + __ptr |= BIT(bit); 152 + return (void *)__ptr; 153 + } 154 + static inline void *ptr_clear_bit(void *ptr, int bit) 155 + { 156 + unsigned long __ptr = (unsigned long)ptr; 157 + 158 + __ptr &= ~BIT(bit); 159 + return (void *)__ptr; 160 + } 161 + 162 + static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) 163 + { 164 + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); 165 + } 166 + 167 + static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) 168 + { 169 + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); 170 + } 171 + 172 + static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) 173 + { 174 + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); 175 + } 176 + 177 + static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) 178 + { 179 + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); 180 + } 181 + #endif /* CONFIG_PAGE_TABLE_ISOLATION */ 182 + 183 + /* 184 + * Page table pages are page-aligned. The lower half of the top 185 + * level is used for userspace and the top half for the kernel. 186 + * 187 + * Returns true for parts of the PGD that map userspace and 188 + * false for the parts that map the kernel. 
189 + */ 190 + static inline bool pgdp_maps_userspace(void *__ptr) 191 + { 192 + unsigned long ptr = (unsigned long)__ptr; 193 + 194 + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); 195 + } 196 + 197 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 198 + pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); 199 + 200 + /* 201 + * Take a PGD location (pgdp) and a pgd value that needs to be set there. 202 + * Populates the user and returns the resulting PGD that must be set in 203 + * the kernel copy of the page tables. 204 + */ 205 + static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) 206 + { 207 + if (!static_cpu_has(X86_FEATURE_PTI)) 208 + return pgd; 209 + return __pti_set_user_pgd(pgdp, pgd); 210 + } 211 + #else 212 + static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) 213 + { 214 + return pgd; 215 + } 216 + #endif 217 + 134 218 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) 135 219 { 220 + #if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) 221 + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); 222 + #else 136 223 *p4dp = p4d; 224 + #endif 137 225 } 138 226 139 227 static inline void native_p4d_clear(p4d_t *p4d) ··· 235 147 236 148 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) 237 149 { 150 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 151 + *pgdp = pti_set_user_pgd(pgdp, pgd); 152 + #else 238 153 *pgdp = pgd; 154 + #endif 239 155 } 240 156 241 157 static inline void native_pgd_clear(pgd_t *pgd)

+6 -2

arch/x86/include/asm/pgtable_64_types.h

··· 79 79 #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 80 80 81 81 #ifdef CONFIG_X86_5LEVEL 82 - # define VMALLOC_SIZE_TB _AC(16384, UL) 83 - # define __VMALLOC_BASE _AC(0xff92000000000000, UL) 82 + # define VMALLOC_SIZE_TB _AC(12800, UL) 83 + # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) 84 84 # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) 85 + # define LDT_PGD_ENTRY _AC(-112, UL) 86 + # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) 85 87 #else 86 88 # define VMALLOC_SIZE_TB _AC(32, UL) 87 89 # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) 88 90 # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) 91 + # define LDT_PGD_ENTRY _AC(-4, UL) 92 + # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) 89 93 #endif 90 94 91 95 #ifdef CONFIG_RANDOMIZE_MEMORY

+5

arch/x86/include/asm/processor-flags.h

··· 38 38 #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) 39 39 #define CR3_PCID_MASK 0xFFFull 40 40 #define CR3_NOFLUSH BIT_ULL(63) 41 + 42 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 43 + # define X86_CR3_PTI_SWITCH_BIT 11 44 + #endif 45 + 41 46 #else 42 47 /* 43 48 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save

+16 -7

arch/x86/include/asm/processor.h

··· 852 852 853 853 #else 854 854 /* 855 - * User space process size. 47bits minus one guard page. The guard 856 - * page is necessary on Intel CPUs: if a SYSCALL instruction is at 857 - * the highest possible canonical userspace address, then that 858 - * syscall will enter the kernel with a non-canonical return 859 - * address, and SYSRET will explode dangerously. We avoid this 860 - * particular problem by preventing anything from being mapped 861 - * at the maximum canonical address. 855 + * User space process size. This is the first address outside the user range. 856 + * There are a few constraints that determine this: 857 + * 858 + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical 859 + * address, then that syscall will enter the kernel with a 860 + * non-canonical return address, and SYSRET will explode dangerously. 861 + * We avoid this particular problem by preventing anything executable 862 + * from being mapped at the maximum canonical address. 863 + * 864 + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the 865 + * CPUs malfunction if they execute code from the highest canonical page. 866 + * They'll speculate right off the end of the canonical space, and 867 + * bad things happen. This is worked around in the same way as the 868 + * Intel problem. 869 + * 870 + * With page table isolation enabled, we map the LDT in ... [stay tuned] 862 871 */ 863 872 #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 864 873

+14

arch/x86/include/asm/pti.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef _ASM_X86_PTI_H 3 + #define _ASM_X86_PTI_H 4 + #ifndef __ASSEMBLY__ 5 + 6 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 7 + extern void pti_init(void); 8 + extern void pti_check_boottime_disable(void); 9 + #else 10 + static inline void pti_check_boottime_disable(void) { } 11 + #endif 12 + 13 + #endif /* __ASSEMBLY__ */ 14 + #endif /* _ASM_X86_PTI_H */

+171 -31

arch/x86/include/asm/tlbflush.h

··· 10 10 #include <asm/special_insns.h> 11 11 #include <asm/smp.h> 12 12 #include <asm/invpcid.h> 13 + #include <asm/pti.h> 14 + #include <asm/processor-flags.h> 13 15 14 - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 15 - { 16 - /* 17 - * Bump the generation count. This also serves as a full barrier 18 - * that synchronizes with switch_mm(): callers are required to order 19 - * their read of mm_cpumask after their writes to the paging 20 - * structures. 21 - */ 22 - return atomic64_inc_return(&mm->context.tlb_gen); 23 - } 16 + /* 17 + * The x86 feature is called PCID (Process Context IDentifier). It is similar 18 + * to what is traditionally called ASID on the RISC processors. 19 + * 20 + * We don't use the traditional ASID implementation, where each process/mm gets 21 + * its own ASID and flush/restart when we run out of ASID space. 22 + * 23 + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's 24 + * that came by on this CPU, allowing cheaper switch_mm between processes on 25 + * this CPU. 26 + * 27 + * We end up with different spaces for different things. To avoid confusion we 28 + * use different names for each of them: 29 + * 30 + * ASID - [0, TLB_NR_DYN_ASIDS-1] 31 + * the canonical identifier for an mm 32 + * 33 + * kPCID - [1, TLB_NR_DYN_ASIDS] 34 + * the value we write into the PCID part of CR3; corresponds to the 35 + * ASID+1, because PCID 0 is special. 36 + * 37 + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] 38 + * for KPTI each mm has two address spaces and thus needs two 39 + * PCID values, but we can still do with a single ASID denomination 40 + * for each mm. Corresponds to kPCID + 2048. 
41 + * 42 + */ 24 43 25 44 /* There are 12 bits of space for ASIDS in CR3 */ 26 45 #define CR3_HW_ASID_BITS 12 46 + 27 47 /* 28 48 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for 29 49 * user/kernel switches 30 50 */ 31 - #define PTI_CONSUMED_ASID_BITS 0 51 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 52 + # define PTI_CONSUMED_PCID_BITS 1 53 + #else 54 + # define PTI_CONSUMED_PCID_BITS 0 55 + #endif 32 56 33 - #define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) 57 + #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) 58 + 34 59 /* 35 60 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account 36 - * for them being zero-based. Another -1 is because ASID 0 is reserved for 61 + * for them being zero-based. Another -1 is because PCID 0 is reserved for 37 62 * use by non-PCID-aware users. 38 63 */ 39 - #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) 64 + #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) 40 65 66 + /* 67 + * 6 because 6 should be plenty and struct tlb_state will fit in two cache 68 + * lines. 69 + */ 70 + #define TLB_NR_DYN_ASIDS 6 71 + 72 + /* 73 + * Given @asid, compute kPCID 74 + */ 41 75 static inline u16 kern_pcid(u16 asid) 42 76 { 43 77 VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 78 + 79 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 44 80 /* 81 + * Make sure that the dynamic ASID space does not conflict with the 82 + * bit we are using to switch between user and kernel ASIDs. 83 + */ 84 + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); 85 + 86 + /* 87 + * The ASID being passed in here should have respected the 88 + * MAX_ASID_AVAILABLE and thus never have the switch bit set. 89 + */ 90 + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); 91 + #endif 92 + /* 93 + * The dynamically-assigned ASIDs that get passed in are small 94 + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, 95 + * so do not bother to clear it. 
96 + * 45 97 * If PCID is on, ASID-aware code paths put the ASID+1 into the 46 98 * PCID bits. This serves two purposes. It prevents a nasty 47 99 * situation in which PCID-unaware code saves CR3, loads some other ··· 103 51 * CR4.PCIDE off will trigger deterministically. 104 52 */ 105 53 return asid + 1; 54 + } 55 + 56 + /* 57 + * Given @asid, compute uPCID 58 + */ 59 + static inline u16 user_pcid(u16 asid) 60 + { 61 + u16 ret = kern_pcid(asid); 62 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 63 + ret |= 1 << X86_CR3_PTI_SWITCH_BIT; 64 + #endif 65 + return ret; 106 66 } 107 67 108 68 struct pgd_t; ··· 159 95 return !static_cpu_has(X86_FEATURE_PCID); 160 96 } 161 97 162 - /* 163 - * 6 because 6 should be plenty and struct tlb_state will fit in 164 - * two cache lines. 165 - */ 166 - #define TLB_NR_DYN_ASIDS 6 167 - 168 98 struct tlb_context { 169 99 u64 ctx_id; 170 100 u64 tlb_gen; ··· 191 133 * lazy mode. 192 134 */ 193 135 bool is_lazy; 136 + 137 + /* 138 + * If set we changed the page tables in such a way that we 139 + * needed an invalidation of all contexts (aka. PCIDs / ASIDs). 140 + * This tells us to go invalidate all the non-loaded ctxs[] 141 + * on the next context switch. 142 + * 143 + * The current ctx was kept up-to-date as it ran and does not 144 + * need to be invalidated. 145 + */ 146 + bool invalidate_other; 147 + 148 + /* 149 + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate 150 + * the corresponding user PCID needs a flush next time we 151 + * switch to it; see SWITCH_TO_USER_CR3. 152 + */ 153 + unsigned short user_pcid_flush_mask; 194 154 195 155 /* 196 156 * Access to this CR4 shadow and to H/W CR4 is protected by ··· 291 215 } 292 216 293 217 /* 218 + * Mark all other ASIDs as invalid, preserves the current. 219 + */ 220 + static inline void invalidate_other_asid(void) 221 + { 222 + this_cpu_write(cpu_tlbstate.invalidate_other, true); 223 + } 224 + 225 + /* 294 226 * Save some of cr4 feature set we're using (e.g. 
Pentium 4MB 295 227 * enable and PPro Global page enable), so that any CPU's that boot 296 228 * up after us can get the correct flags. This should only be used ··· 318 234 extern void initialize_tlbstate_and_flush(void); 319 235 320 236 /* 237 + * Given an ASID, flush the corresponding user ASID. We can delay this 238 + * until the next time we switch to it. 239 + * 240 + * See SWITCH_TO_USER_CR3. 241 + */ 242 + static inline void invalidate_user_asid(u16 asid) 243 + { 244 + /* There is no user ASID if address space separation is off */ 245 + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) 246 + return; 247 + 248 + /* 249 + * We only have a single ASID if PCID is off and the CR3 250 + * write will have flushed it. 251 + */ 252 + if (!cpu_feature_enabled(X86_FEATURE_PCID)) 253 + return; 254 + 255 + if (!static_cpu_has(X86_FEATURE_PTI)) 256 + return; 257 + 258 + __set_bit(kern_pcid(asid), 259 + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); 260 + } 261 + 262 + /* 321 263 * flush the entire current user mapping 322 264 */ 323 265 static inline void __native_flush_tlb(void) 324 266 { 267 + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); 325 268 /* 326 - * If current->mm == NULL then we borrow a mm which may change during a 327 - * task switch and therefore we must not be preempted while we write CR3 328 - * back: 269 + * If current->mm == NULL then we borrow a mm which may change 270 + * during a task switch and therefore we must not be preempted 271 + * while we write CR3 back: 329 272 */ 330 273 preempt_disable(); 331 274 native_write_cr3(__native_read_cr3()); ··· 370 259 /* 371 260 * Using INVPCID is considerably faster than a pair of writes 372 261 * to CR4 sandwiched inside an IRQ flag save/restore. 262 + * 263 + * Note, this works with CR4.PCIDE=0 or 1. 
373 264 */ 374 265 invpcid_flush_all(); 375 266 return; ··· 398 285 */ 399 286 static inline void __native_flush_tlb_single(unsigned long addr) 400 287 { 288 + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 289 + 401 290 asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); 291 + 292 + if (!static_cpu_has(X86_FEATURE_PTI)) 293 + return; 294 + 295 + /* 296 + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. 297 + * Just use invalidate_user_asid() in case we are called early. 298 + */ 299 + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) 300 + invalidate_user_asid(loaded_mm_asid); 301 + else 302 + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); 402 303 } 403 304 404 305 /* ··· 428 301 */ 429 302 __flush_tlb(); 430 303 } 431 - 432 - /* 433 - * Note: if we somehow had PCID but not PGE, then this wouldn't work -- 434 - * we'd end up flushing kernel translations for the current ASID but 435 - * we might fail to flush kernel translations for other cached ASIDs. 436 - * 437 - * To avoid this issue, we force PCID off if PGE is off. 438 - */ 439 304 } 440 305 441 306 /* ··· 437 318 { 438 319 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); 439 320 __flush_tlb_single(addr); 321 + 322 + if (!static_cpu_has(X86_FEATURE_PTI)) 323 + return; 324 + 325 + /* 326 + * __flush_tlb_single() will have cleared the TLB entry for this ASID, 327 + * but since kernel space is replicated across all, we must also 328 + * invalidate all others. 329 + */ 330 + invalidate_other_asid(); 440 331 } 441 332 442 333 #define TLB_FLUSH_ALL -1UL ··· 506 377 507 378 void native_flush_tlb_others(const struct cpumask *cpumask, 508 379 const struct flush_tlb_info *info); 380 + 381 + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 382 + { 383 + /* 384 + * Bump the generation count. 
This also serves as a full barrier 385 + * that synchronizes with switch_mm(): callers are required to order 386 + * their read of mm_cpumask after their writes to the paging 387 + * structures. 388 + */ 389 + return atomic64_inc_return(&mm->context.tlb_gen); 390 + } 509 391 510 392 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 511 393 struct mm_struct *mm)

+1

arch/x86/include/asm/vsyscall.h

··· 7 7 8 8 #ifdef CONFIG_X86_VSYSCALL_EMULATION 9 9 extern void map_vsyscall(void); 10 + extern void set_vsyscall_pgtable_user_bits(pgd_t *root); 10 11 11 12 /* 12 13 * Called on instruction fetch fault in vsyscall page.

+6 -1

arch/x86/include/uapi/asm/processor-flags.h

··· 78 78 #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) 79 79 #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ 80 80 #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) 81 - #define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ 81 + 82 + #define X86_CR3_PCID_BITS 12 83 + #define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) 84 + 85 + #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ 86 + #define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) 82 87 83 88 /* 84 89 * Intel CPU features in CR4

+4

arch/x86/kernel/asm-offsets.c

··· 17 17 #include <asm/sigframe.h> 18 18 #include <asm/bootparam.h> 19 19 #include <asm/suspend.h> 20 + #include <asm/tlbflush.h> 20 21 21 22 #ifdef CONFIG_XEN 22 23 #include <xen/interface/xen.h> ··· 94 93 95 94 BLANK(); 96 95 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); 96 + 97 + /* TLB state for the entry code */ 98 + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); 97 99 98 100 /* Layout info for cpu_entry_area */ 99 101 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);

+8 -1

arch/x86/kernel/cpu/common.c

··· 922 922 } 923 923 924 924 setup_force_cpu_cap(X86_FEATURE_ALWAYS); 925 + 926 + /* Assume for now that ALL x86 CPUs are insecure */ 927 + setup_force_cpu_bug(X86_BUG_CPU_INSECURE); 928 + 925 929 fpu__init_system(c); 926 930 927 931 #ifdef CONFIG_X86_32 ··· 1364 1360 (entry_SYSCALL_64_trampoline - _entry_trampoline); 1365 1361 1366 1362 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); 1367 - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); 1363 + if (static_cpu_has(X86_FEATURE_PTI)) 1364 + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); 1365 + else 1366 + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); 1368 1367 1369 1368 #ifdef CONFIG_IA32_EMULATION 1370 1369 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

+4 -2

arch/x86/kernel/dumpstack.c

··· 297 297 unsigned long sp; 298 298 #endif 299 299 printk(KERN_DEFAULT 300 - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, 300 + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, 301 301 IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", 302 302 IS_ENABLED(CONFIG_SMP) ? " SMP" : "", 303 303 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", 304 - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); 304 + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", 305 + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? 306 + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); 305 307 306 308 if (notify_die(DIE_OOPS, str, regs, err, 307 309 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

+27 -3

arch/x86/kernel/head_64.S

··· 341 341 .balign PAGE_SIZE; \ 342 342 GLOBAL(name) 343 343 344 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 345 + /* 346 + * Each PGD needs to be 8k long and 8k aligned. We do not 347 + * ever go out to userspace with these, so we do not 348 + * strictly *need* the second page, but this allows us to 349 + * have a single set_pgd() implementation that does not 350 + * need to worry about whether it has 4k or 8k to work 351 + * with. 352 + * 353 + * This ensures PGDs are 8k long: 354 + */ 355 + #define PTI_USER_PGD_FILL 512 356 + /* This ensures they are 8k-aligned: */ 357 + #define NEXT_PGD_PAGE(name) \ 358 + .balign 2 * PAGE_SIZE; \ 359 + GLOBAL(name) 360 + #else 361 + #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) 362 + #define PTI_USER_PGD_FILL 0 363 + #endif 364 + 344 365 /* Automate the creation of 1 to 1 mapping pmd entries */ 345 366 #define PMDS(START, PERM, COUNT) \ 346 367 i = 0 ; \ ··· 371 350 .endr 372 351 373 352 __INITDATA 374 - NEXT_PAGE(early_top_pgt) 353 + NEXT_PGD_PAGE(early_top_pgt) 375 354 .fill 511,8,0 376 355 #ifdef CONFIG_X86_5LEVEL 377 356 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 378 357 #else 379 358 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 380 359 #endif 360 + .fill PTI_USER_PGD_FILL,8,0 381 361 382 362 NEXT_PAGE(early_dynamic_pgts) 383 363 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 ··· 386 364 .data 387 365 388 366 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) 389 - NEXT_PAGE(init_top_pgt) 367 + NEXT_PGD_PAGE(init_top_pgt) 390 368 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 391 369 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 392 370 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 393 371 .org init_top_pgt + PGD_START_KERNEL*8, 0 394 372 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 395 373 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 374 + .fill PTI_USER_PGD_FILL,8,0 396 375 397 376 NEXT_PAGE(level3_ident_pgt) 398 377 .quad level2_ident_pgt 
- __START_KERNEL_map + _KERNPG_TABLE_NOENC ··· 404 381 */ 405 382 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) 406 383 #else 407 - NEXT_PAGE(init_top_pgt) 384 + NEXT_PGD_PAGE(init_top_pgt) 408 385 .fill 512,8,0 386 + .fill PTI_USER_PGD_FILL,8,0 409 387 #endif 410 388 411 389 #ifdef CONFIG_X86_5LEVEL

+141 -3

arch/x86/kernel/ldt.c

··· 24 24 #include <linux/uaccess.h> 25 25 26 26 #include <asm/ldt.h> 27 + #include <asm/tlb.h> 27 28 #include <asm/desc.h> 28 29 #include <asm/mmu_context.h> 29 30 #include <asm/syscalls.h> ··· 52 51 static void flush_ldt(void *__mm) 53 52 { 54 53 struct mm_struct *mm = __mm; 55 - mm_context_t *pc; 56 54 57 55 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) 58 56 return; 59 57 60 - pc = &mm->context; 61 - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); 58 + load_mm_ldt(mm); 59 63 60 refresh_ldt_segments(); 64 61 } ··· 93 94 return NULL; 94 95 } 95 96 97 + /* The new LDT isn't aliased for PTI yet. */ 98 + new_ldt->slot = -1; 99 + 96 100 new_ldt->nr_entries = num_entries; 97 101 return new_ldt; 102 + } 103 + 104 + /* 105 + * If PTI is enabled, this maps the LDT into the kernelmode and 106 + * usermode tables for the given mm. 107 + * 108 + * There is no corresponding unmap function. Even if the LDT is freed, we 109 + * leave the PTEs around until the slot is reused or the mm is destroyed. 110 + * This is harmless: the LDT is always in ordinary memory, and no one will 111 + * access the freed slot. 112 + * 113 + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make 114 + * it useful, and the flush would slow down modify_ldt(). 115 + */ 116 + static int 117 + map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) 118 + { 119 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 120 + bool is_vmalloc, had_top_level_entry; 121 + unsigned long va; 122 + spinlock_t *ptl; 123 + pgd_t *pgd; 124 + int i; 125 + 126 + if (!static_cpu_has(X86_FEATURE_PTI)) 127 + return 0; 128 + 129 + /* 130 + * Any given ldt_struct should have map_ldt_struct() called at most 131 + * once. 132 + */ 133 + WARN_ON(ldt->slot != -1); 134 + 135 + /* 136 + * Did we already have the top level entry allocated? We can't 137 + * use pgd_none() for this because it doesn't do anything on 138 + * 4-level page table kernels. 
139 + */ 140 + pgd = pgd_offset(mm, LDT_BASE_ADDR); 141 + had_top_level_entry = (pgd->pgd != 0); 142 + 143 + is_vmalloc = is_vmalloc_addr(ldt->entries); 144 + 145 + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { 146 + unsigned long offset = i << PAGE_SHIFT; 147 + const void *src = (char *)ldt->entries + offset; 148 + unsigned long pfn; 149 + pte_t pte, *ptep; 150 + 151 + va = (unsigned long)ldt_slot_va(slot) + offset; 152 + pfn = is_vmalloc ? vmalloc_to_pfn(src) : 153 + page_to_pfn(virt_to_page(src)); 154 + /* 155 + * Treat the PTI LDT range as a *userspace* range. 156 + * get_locked_pte() will allocate all needed pagetables 157 + * and account for them in this mm. 158 + */ 159 + ptep = get_locked_pte(mm, va, &ptl); 160 + if (!ptep) 161 + return -ENOMEM; 162 + /* 163 + * Map it RO so the easy to find address is not a primary 164 + * target via some kernel interface which misses a 165 + * permission check. 166 + */ 167 + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); 168 + set_pte_at(mm, va, ptep, pte); 169 + pte_unmap_unlock(ptep, ptl); 170 + } 171 + 172 + if (mm->context.ldt) { 173 + /* 174 + * We already had an LDT. The top-level entry should already 175 + * have been allocated and synchronized with the usermode 176 + * tables. 177 + */ 178 + WARN_ON(!had_top_level_entry); 179 + if (static_cpu_has(X86_FEATURE_PTI)) 180 + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); 181 + } else { 182 + /* 183 + * This is the first time we're mapping an LDT for this process. 184 + * Sync the pgd to the usermode tables. 
185 + */ 186 + WARN_ON(had_top_level_entry); 187 + if (static_cpu_has(X86_FEATURE_PTI)) { 188 + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); 189 + set_pgd(kernel_to_user_pgdp(pgd), *pgd); 190 + } 191 + } 192 + 193 + va = (unsigned long)ldt_slot_va(slot); 194 + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); 195 + 196 + ldt->slot = slot; 197 + #endif 198 + return 0; 199 + } 200 + 201 + static void free_ldt_pgtables(struct mm_struct *mm) 202 + { 203 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 204 + struct mmu_gather tlb; 205 + unsigned long start = LDT_BASE_ADDR; 206 + unsigned long end = start + (1UL << PGDIR_SHIFT); 207 + 208 + if (!static_cpu_has(X86_FEATURE_PTI)) 209 + return; 210 + 211 + tlb_gather_mmu(&tlb, mm, start, end); 212 + free_pgd_range(&tlb, start, end, start, end); 213 + tlb_finish_mmu(&tlb, start, end); 214 + #endif 98 215 } 99 216 100 217 /* After calling this, the LDT is immutable. */ ··· 271 156 new_ldt->nr_entries * LDT_ENTRY_SIZE); 272 157 finalize_ldt_struct(new_ldt); 273 158 159 + retval = map_ldt_struct(mm, new_ldt, 0); 160 + if (retval) { 161 + free_ldt_pgtables(mm); 162 + free_ldt_struct(new_ldt); 163 + goto out_unlock; 164 + } 274 165 mm->context.ldt = new_ldt; 275 166 276 167 out_unlock: ··· 293 172 { 294 173 free_ldt_struct(mm->context.ldt); 295 174 mm->context.ldt = NULL; 175 + } 176 + 177 + void ldt_arch_exit_mmap(struct mm_struct *mm) 178 + { 179 + free_ldt_pgtables(mm); 296 180 } 297 181 298 182 static int read_ldt(void __user *ptr, unsigned long bytecount) ··· 412 286 413 287 new_ldt->entries[ldt_info.entry_number] = ldt; 414 288 finalize_ldt_struct(new_ldt); 289 + 290 + /* 291 + * If we are using PTI, map the new LDT into the userspace pagetables. 292 + * If there is already an LDT, use the other slot so that other CPUs 293 + * will continue to use the old LDT until install_ldt() switches 294 + * them over to the new LDT. 295 + */ 296 + error = map_ldt_struct(mm, new_ldt, old_ldt ? 
!old_ldt->slot : 0); 297 + if (error) { 298 + free_ldt_struct(old_ldt); 299 + goto out_unlock; 300 + } 415 301 416 302 install_ldt(mm, new_ldt); 417 303 free_ldt_struct(old_ldt);

+2 -9

arch/x86/kernel/tls.c

··· 93 93 cpu = get_cpu(); 94 94 95 95 while (n-- > 0) { 96 - if (LDT_empty(info) || LDT_zero(info)) { 96 + if (LDT_empty(info) || LDT_zero(info)) 97 97 memset(desc, 0, sizeof(*desc)); 98 - } else { 98 + else 99 99 fill_ldt(desc, info); 100 - 101 - /* 102 - * Always set the accessed bit so that the CPU 103 - * doesn't try to write to the (read-only) GDT. 104 - */ 105 - desc->type |= 1; 106 - } 107 100 ++info; 108 101 ++desc; 109 102 }

+8

arch/x86/kernel/vmlinux.lds.S

··· 61 61 . = ALIGN(HPAGE_SIZE); \ 62 62 __end_rodata_hpage_align = .; 63 63 64 + #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); 65 + #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); 66 + 64 67 #else 65 68 66 69 #define X64_ALIGN_RODATA_BEGIN 67 70 #define X64_ALIGN_RODATA_END 71 + 72 + #define ALIGN_ENTRY_TEXT_BEGIN 73 + #define ALIGN_ENTRY_TEXT_END 68 74 69 75 #endif 70 76 ··· 108 102 CPUIDLE_TEXT 109 103 LOCK_TEXT 110 104 KPROBES_TEXT 105 + ALIGN_ENTRY_TEXT_BEGIN 111 106 ENTRY_TEXT 112 107 IRQENTRY_TEXT 108 + ALIGN_ENTRY_TEXT_END 113 109 SOFTIRQENTRY_TEXT 114 110 *(.fixup) 115 111 *(.gnu.warning)

+4 -3

arch/x86/mm/Makefile

··· 41 41 obj-$(CONFIG_ACPI_NUMA) += srat.o 42 42 obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 43 43 44 - obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 45 - obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 46 - obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 44 + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o 45 + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 46 + obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 47 + obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o 47 48 48 49 obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o 49 50 obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o

+27

arch/x86/mm/cpu_entry_area.c

··· 38 38 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); 39 39 } 40 40 41 + static void percpu_setup_debug_store(int cpu) 42 + { 43 + #ifdef CONFIG_CPU_SUP_INTEL 44 + int npages; 45 + void *cea; 46 + 47 + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) 48 + return; 49 + 50 + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; 51 + npages = sizeof(struct debug_store) / PAGE_SIZE; 52 + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); 53 + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, 54 + PAGE_KERNEL); 55 + 56 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; 57 + /* 58 + * Force the population of PMDs for not yet allocated per cpu 59 + * memory like debug store buffers. 60 + */ 61 + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; 62 + for (; npages; npages--, cea += PAGE_SIZE) 63 + cea_set_pte(cea, 0, PAGE_NONE); 64 + #endif 65 + } 66 + 41 67 /* Setup the fixmap mappings only once per-processor */ 42 68 static void __init setup_cpu_entry_area(int cpu) 43 69 { ··· 135 109 cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, 136 110 __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); 137 111 #endif 112 + percpu_setup_debug_store(cpu); 138 113 } 139 114 140 115 static __init void setup_cpu_entry_area_ptes(void)

+74 -6

arch/x86/mm/debug_pagetables.c

··· 5 5 6 6 static int ptdump_show(struct seq_file *m, void *v) 7 7 { 8 - ptdump_walk_pgd_level(m, NULL); 8 + ptdump_walk_pgd_level_debugfs(m, NULL, false); 9 9 return 0; 10 10 } 11 11 ··· 22 22 .release = single_release, 23 23 }; 24 24 25 - static struct dentry *pe; 25 + static int ptdump_show_curknl(struct seq_file *m, void *v) 26 + { 27 + if (current->mm->pgd) { 28 + down_read(&current->mm->mmap_sem); 29 + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); 30 + up_read(&current->mm->mmap_sem); 31 + } 32 + return 0; 33 + } 34 + 35 + static int ptdump_open_curknl(struct inode *inode, struct file *filp) 36 + { 37 + return single_open(filp, ptdump_show_curknl, NULL); 38 + } 39 + 40 + static const struct file_operations ptdump_curknl_fops = { 41 + .owner = THIS_MODULE, 42 + .open = ptdump_open_curknl, 43 + .read = seq_read, 44 + .llseek = seq_lseek, 45 + .release = single_release, 46 + }; 47 + 48 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 49 + static struct dentry *pe_curusr; 50 + 51 + static int ptdump_show_curusr(struct seq_file *m, void *v) 52 + { 53 + if (current->mm->pgd) { 54 + down_read(&current->mm->mmap_sem); 55 + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); 56 + up_read(&current->mm->mmap_sem); 57 + } 58 + return 0; 59 + } 60 + 61 + static int ptdump_open_curusr(struct inode *inode, struct file *filp) 62 + { 63 + return single_open(filp, ptdump_show_curusr, NULL); 64 + } 65 + 66 + static const struct file_operations ptdump_curusr_fops = { 67 + .owner = THIS_MODULE, 68 + .open = ptdump_open_curusr, 69 + .read = seq_read, 70 + .llseek = seq_lseek, 71 + .release = single_release, 72 + }; 73 + #endif 74 + 75 + static struct dentry *dir, *pe_knl, *pe_curknl; 26 76 27 77 static int __init pt_dump_debug_init(void) 28 78 { 29 - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, 30 - &ptdump_fops); 31 - if (!pe) 79 + dir = debugfs_create_dir("page_tables", NULL); 80 + if (!dir) 32 81 return -ENOMEM; 33 82 83 + pe_knl = 
debugfs_create_file("kernel", 0400, dir, NULL, 84 + &ptdump_fops); 85 + if (!pe_knl) 86 + goto err; 87 + 88 + pe_curknl = debugfs_create_file("current_kernel", 0400, 89 + dir, NULL, &ptdump_curknl_fops); 90 + if (!pe_curknl) 91 + goto err; 92 + 93 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 94 + pe_curusr = debugfs_create_file("current_user", 0400, 95 + dir, NULL, &ptdump_curusr_fops); 96 + if (!pe_curusr) 97 + goto err; 98 + #endif 34 99 return 0; 100 + err: 101 + debugfs_remove_recursive(dir); 102 + return -ENOMEM; 35 103 } 36 104 37 105 static void __exit pt_dump_debug_exit(void) 38 106 { 39 - debugfs_remove_recursive(pe); 107 + debugfs_remove_recursive(dir); 40 108 } 41 109 42 110 module_init(pt_dump_debug_init);

+38 -5

arch/x86/mm/dump_pagetables.c

··· 52 52 USER_SPACE_NR = 0, 53 53 KERNEL_SPACE_NR, 54 54 LOW_KERNEL_NR, 55 + #if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) 56 + LDT_NR, 57 + #endif 55 58 VMALLOC_START_NR, 56 59 VMEMMAP_START_NR, 57 60 #ifdef CONFIG_KASAN 58 61 KASAN_SHADOW_START_NR, 59 62 KASAN_SHADOW_END_NR, 63 + #endif 64 + #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) 65 + LDT_NR, 60 66 #endif 61 67 CPU_ENTRY_AREA_NR, 62 68 #ifdef CONFIG_X86_ESPFIX64 ··· 87 81 #ifdef CONFIG_KASAN 88 82 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, 89 83 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, 84 + #endif 85 + #ifdef CONFIG_MODIFY_LDT_SYSCALL 86 + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, 90 87 #endif 91 88 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 92 89 #ifdef CONFIG_X86_ESPFIX64 ··· 476 467 } 477 468 478 469 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, 479 - bool checkwx) 470 + bool checkwx, bool dmesg) 480 471 { 481 472 #ifdef CONFIG_X86_64 482 473 pgd_t *start = (pgd_t *) &init_top_pgt; ··· 489 480 490 481 if (pgd) { 491 482 start = pgd; 492 - st.to_dmesg = true; 483 + st.to_dmesg = dmesg; 493 484 } 494 485 495 486 st.check_wx = checkwx; ··· 527 518 528 519 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) 529 520 { 530 - ptdump_walk_pgd_level_core(m, pgd, false); 521 + ptdump_walk_pgd_level_core(m, pgd, false, true); 531 522 } 532 - EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); 523 + 524 + void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) 525 + { 526 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 527 + if (user && static_cpu_has(X86_FEATURE_PTI)) 528 + pgd = kernel_to_user_pgdp(pgd); 529 + #endif 530 + ptdump_walk_pgd_level_core(m, pgd, false, false); 531 + } 532 + EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); 533 + 534 + static void ptdump_walk_user_pgd_level_checkwx(void) 535 + { 536 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 537 + pgd_t *pgd = 
(pgd_t *) &init_top_pgt; 538 + 539 + if (!static_cpu_has(X86_FEATURE_PTI)) 540 + return; 541 + 542 + pr_info("x86/mm: Checking user space page tables\n"); 543 + pgd = kernel_to_user_pgdp(pgd); 544 + ptdump_walk_pgd_level_core(NULL, pgd, true, false); 545 + #endif 546 + } 533 547 534 548 void ptdump_walk_pgd_level_checkwx(void) 535 549 { 536 - ptdump_walk_pgd_level_core(NULL, NULL, true); 550 + ptdump_walk_pgd_level_core(NULL, NULL, true, false); 551 + ptdump_walk_user_pgd_level_checkwx(); 537 552 } 538 553 539 554 static int __init pt_dump_init(void)

+49 -31

arch/x86/mm/init.c

··· 20 20 #include <asm/kaslr.h> 21 21 #include <asm/hypervisor.h> 22 22 #include <asm/cpufeature.h> 23 + #include <asm/pti.h> 23 24 24 25 /* 25 26 * We need to define the tracepoints somewhere, and tlb.c ··· 161 160 162 161 static int page_size_mask; 163 162 163 + static void enable_global_pages(void) 164 + { 165 + if (!static_cpu_has(X86_FEATURE_PTI)) 166 + __supported_pte_mask |= _PAGE_GLOBAL; 167 + } 168 + 164 169 static void __init probe_page_size_mask(void) 165 170 { 166 171 /* ··· 184 177 cr4_set_bits_and_update_boot(X86_CR4_PSE); 185 178 186 179 /* Enable PGE if available */ 180 + __supported_pte_mask &= ~_PAGE_GLOBAL; 187 181 if (boot_cpu_has(X86_FEATURE_PGE)) { 188 182 cr4_set_bits_and_update_boot(X86_CR4_PGE); 189 - __supported_pte_mask |= _PAGE_GLOBAL; 190 - } else 191 - __supported_pte_mask &= ~_PAGE_GLOBAL; 183 + enable_global_pages(); 184 + } 192 185 193 186 /* Enable 1 GB linear kernel mappings if available: */ 194 187 if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { ··· 201 194 202 195 static void setup_pcid(void) 203 196 { 204 - #ifdef CONFIG_X86_64 205 - if (boot_cpu_has(X86_FEATURE_PCID)) { 206 - if (boot_cpu_has(X86_FEATURE_PGE)) { 207 - /* 208 - * This can't be cr4_set_bits_and_update_boot() -- 209 - * the trampoline code can't handle CR4.PCIDE and 210 - * it wouldn't do any good anyway. Despite the name, 211 - * cr4_set_bits_and_update_boot() doesn't actually 212 - * cause the bits in question to remain set all the 213 - * way through the secondary boot asm. 214 - * 215 - * Instead, we brute-force it and set CR4.PCIDE 216 - * manually in start_secondary(). 217 - */ 218 - cr4_set_bits(X86_CR4_PCIDE); 219 - } else { 220 - /* 221 - * flush_tlb_all(), as currently implemented, won't 222 - * work if PCID is on but PGE is not. Since that 223 - * combination doesn't exist on real hardware, there's 224 - * no reason to try to fully support it, but it's 225 - * polite to avoid corrupting data if we're on 226 - * an improperly configured VM. 
227 - */ 228 - setup_clear_cpu_cap(X86_FEATURE_PCID); 229 - } 197 + if (!IS_ENABLED(CONFIG_X86_64)) 198 + return; 199 + 200 + if (!boot_cpu_has(X86_FEATURE_PCID)) 201 + return; 202 + 203 + if (boot_cpu_has(X86_FEATURE_PGE)) { 204 + /* 205 + * This can't be cr4_set_bits_and_update_boot() -- the 206 + * trampoline code can't handle CR4.PCIDE and it wouldn't 207 + * do any good anyway. Despite the name, 208 + * cr4_set_bits_and_update_boot() doesn't actually cause 209 + * the bits in question to remain set all the way through 210 + * the secondary boot asm. 211 + * 212 + * Instead, we brute-force it and set CR4.PCIDE manually in 213 + * start_secondary(). 214 + */ 215 + cr4_set_bits(X86_CR4_PCIDE); 216 + 217 + /* 218 + * INVPCID's single-context modes (2/3) only work if we set 219 + * X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable 220 + * on systems that have X86_CR4_PCIDE clear, or that have 221 + * no INVPCID support at all. 222 + */ 223 + if (boot_cpu_has(X86_FEATURE_INVPCID)) 224 + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); 225 + } else { 226 + /* 227 + * flush_tlb_all(), as currently implemented, won't work if 228 + * PCID is on but PGE is not. Since that combination 229 + * doesn't exist on real hardware, there's no reason to try 230 + * to fully support it, but it's polite to avoid corrupting 231 + * data if we're on an improperly configured VM. 
232 + */ 233 + setup_clear_cpu_cap(X86_FEATURE_PCID); 230 234 } 231 - #endif 232 235 } 233 236 234 237 #ifdef CONFIG_X86_32 ··· 639 622 { 640 623 unsigned long end; 641 624 625 + pti_check_boottime_disable(); 642 626 probe_page_size_mask(); 643 627 setup_pcid(); 644 628 ··· 863 845 free_area_init_nodes(max_zone_pfns); 864 846 } 865 847 866 - DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 848 + __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 867 849 .loaded_mm = &init_mm, 868 850 .next_asid = 1, 869 851 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */

+3 -2

arch/x86/mm/pgtable.c

··· 355 355 kmem_cache_free(pgd_cache, pgd); 356 356 } 357 357 #else 358 + 358 359 static inline pgd_t *_pgd_alloc(void) 359 360 { 360 - return (pgd_t *)__get_free_page(PGALLOC_GFP); 361 + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); 361 362 } 362 363 363 364 static inline void _pgd_free(pgd_t *pgd) 364 365 { 365 - free_page((unsigned long)pgd); 366 + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); 366 367 } 367 368 #endif /* CONFIG_X86_PAE */ 368 369

+387

arch/x86/mm/pti.c

··· 1 + /* 2 + * Copyright(c) 2017 Intel Corporation. All rights reserved. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of version 2 of the GNU General Public License as 6 + * published by the Free Software Foundation. 7 + * 8 + * This program is distributed in the hope that it will be useful, but 9 + * WITHOUT ANY WARRANTY; without even the implied warranty of 10 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 + * General Public License for more details. 12 + * 13 + * This code is based in part on work published here: 14 + * 15 + * https://github.com/IAIK/KAISER 16 + * 17 + * The original work was written and signed off for the Linux 18 + * kernel by: 19 + * 20 + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> 21 + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> 22 + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> 23 + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> 24 + * 25 + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> 26 + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and 27 + * Andy Lutomirski <luto@amacapital.net> 28 + */ 29 + #include <linux/kernel.h> 30 + #include <linux/errno.h> 31 + #include <linux/string.h> 32 + #include <linux/types.h> 33 + #include <linux/bug.h> 34 + #include <linux/init.h> 35 + #include <linux/spinlock.h> 36 + #include <linux/mm.h> 37 + #include <linux/uaccess.h> 38 + 39 + #include <asm/cpufeature.h> 40 + #include <asm/hypervisor.h> 41 + #include <asm/vsyscall.h> 42 + #include <asm/cmdline.h> 43 + #include <asm/pti.h> 44 + #include <asm/pgtable.h> 45 + #include <asm/pgalloc.h> 46 + #include <asm/tlbflush.h> 47 + #include <asm/desc.h> 48 + 49 + #undef pr_fmt 50 + #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt 51 + 52 + /* Backporting helper */ 53 + #ifndef __GFP_NOTRACK 54 + #define __GFP_NOTRACK 0 55 + #endif 56 + 57 + 
static void __init pti_print_if_insecure(const char *reason) 58 + { 59 + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) 60 + pr_info("%s\n", reason); 61 + } 62 + 63 + static void __init pti_print_if_secure(const char *reason) 64 + { 65 + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) 66 + pr_info("%s\n", reason); 67 + } 68 + 69 + void __init pti_check_boottime_disable(void) 70 + { 71 + char arg[5]; 72 + int ret; 73 + 74 + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { 75 + pti_print_if_insecure("disabled on XEN PV."); 76 + return; 77 + } 78 + 79 + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); 80 + if (ret > 0) { 81 + if (ret == 3 && !strncmp(arg, "off", 3)) { 82 + pti_print_if_insecure("disabled on command line."); 83 + return; 84 + } 85 + if (ret == 2 && !strncmp(arg, "on", 2)) { 86 + pti_print_if_secure("force enabled on command line."); 87 + goto enable; 88 + } 89 + if (ret == 4 && !strncmp(arg, "auto", 4)) 90 + goto autosel; 91 + } 92 + 93 + if (cmdline_find_option_bool(boot_command_line, "nopti")) { 94 + pti_print_if_insecure("disabled on command line."); 95 + return; 96 + } 97 + 98 + autosel: 99 + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) 100 + return; 101 + enable: 102 + setup_force_cpu_cap(X86_FEATURE_PTI); 103 + } 104 + 105 + pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) 106 + { 107 + /* 108 + * Changes to the high (kernel) portion of the kernelmode page 109 + * tables are not automatically propagated to the usermode tables. 110 + * 111 + * Users should keep in mind that, unlike the kernelmode tables, 112 + * there is no vmalloc_fault equivalent for the usermode tables. 113 + * Top-level entries added to init_mm's usermode pgd after boot 114 + * will not be automatically propagated to other mms. 
115 + */ 116 + if (!pgdp_maps_userspace(pgdp)) 117 + return pgd; 118 + 119 + /* 120 + * The user page tables get the full PGD, accessible from 121 + * userspace: 122 + */ 123 + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; 124 + 125 + /* 126 + * If this is normal user memory, make it NX in the kernel 127 + * pagetables so that, if we somehow screw up and return to 128 + * usermode with the kernel CR3 loaded, we'll get a page fault 129 + * instead of allowing user code to execute with the wrong CR3. 130 + * 131 + * As exceptions, we don't set NX if: 132 + * - _PAGE_USER is not set. This could be an executable 133 + * EFI runtime mapping or something similar, and the kernel 134 + * may execute from it 135 + * - we don't have NX support 136 + * - we're clearing the PGD (i.e. the new pgd is not present). 137 + */ 138 + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && 139 + (__supported_pte_mask & _PAGE_NX)) 140 + pgd.pgd |= _PAGE_NX; 141 + 142 + /* return the copy of the PGD we want the kernel to use: */ 143 + return pgd; 144 + } 145 + 146 + /* 147 + * Walk the user copy of the page tables (optionally) trying to allocate 148 + * page table pages on the way down. 149 + * 150 + * Returns a pointer to a P4D on success, or NULL on failure. 
151 + */ 152 + static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) 153 + { 154 + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); 155 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 156 + 157 + if (address < PAGE_OFFSET) { 158 + WARN_ONCE(1, "attempt to walk user address\n"); 159 + return NULL; 160 + } 161 + 162 + if (pgd_none(*pgd)) { 163 + unsigned long new_p4d_page = __get_free_page(gfp); 164 + if (!new_p4d_page) 165 + return NULL; 166 + 167 + if (pgd_none(*pgd)) { 168 + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); 169 + new_p4d_page = 0; 170 + } 171 + if (new_p4d_page) 172 + free_page(new_p4d_page); 173 + } 174 + BUILD_BUG_ON(pgd_large(*pgd) != 0); 175 + 176 + return p4d_offset(pgd, address); 177 + } 178 + 179 + /* 180 + * Walk the user copy of the page tables (optionally) trying to allocate 181 + * page table pages on the way down. 182 + * 183 + * Returns a pointer to a PMD on success, or NULL on failure. 184 + */ 185 + static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) 186 + { 187 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 188 + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); 189 + pud_t *pud; 190 + 191 + BUILD_BUG_ON(p4d_large(*p4d) != 0); 192 + if (p4d_none(*p4d)) { 193 + unsigned long new_pud_page = __get_free_page(gfp); 194 + if (!new_pud_page) 195 + return NULL; 196 + 197 + if (p4d_none(*p4d)) { 198 + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); 199 + new_pud_page = 0; 200 + } 201 + if (new_pud_page) 202 + free_page(new_pud_page); 203 + } 204 + 205 + pud = pud_offset(p4d, address); 206 + /* The user page tables do not use large mappings: */ 207 + if (pud_large(*pud)) { 208 + WARN_ON(1); 209 + return NULL; 210 + } 211 + if (pud_none(*pud)) { 212 + unsigned long new_pmd_page = __get_free_page(gfp); 213 + if (!new_pmd_page) 214 + return NULL; 215 + 216 + if (pud_none(*pud)) { 217 + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); 218 + new_pmd_page = 0; 219 + } 220 
+ if (new_pmd_page) 221 + free_page(new_pmd_page); 222 + } 223 + 224 + return pmd_offset(pud, address); 225 + } 226 + 227 + #ifdef CONFIG_X86_VSYSCALL_EMULATION 228 + /* 229 + * Walk the shadow copy of the page tables (optionally) trying to allocate 230 + * page table pages on the way down. Does not support large pages. 231 + * 232 + * Note: this is only used when mapping *new* kernel data into the 233 + * user/shadow page tables. It is never used for userspace data. 234 + * 235 + * Returns a pointer to a PTE on success, or NULL on failure. 236 + */ 237 + static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) 238 + { 239 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 240 + pmd_t *pmd = pti_user_pagetable_walk_pmd(address); 241 + pte_t *pte; 242 + 243 + /* We can't do anything sensible if we hit a large mapping. */ 244 + if (pmd_large(*pmd)) { 245 + WARN_ON(1); 246 + return NULL; 247 + } 248 + 249 + if (pmd_none(*pmd)) { 250 + unsigned long new_pte_page = __get_free_page(gfp); 251 + if (!new_pte_page) 252 + return NULL; 253 + 254 + if (pmd_none(*pmd)) { 255 + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); 256 + new_pte_page = 0; 257 + } 258 + if (new_pte_page) 259 + free_page(new_pte_page); 260 + } 261 + 262 + pte = pte_offset_kernel(pmd, address); 263 + if (pte_flags(*pte) & _PAGE_USER) { 264 + WARN_ONCE(1, "attempt to walk to user pte\n"); 265 + return NULL; 266 + } 267 + return pte; 268 + } 269 + 270 + static void __init pti_setup_vsyscall(void) 271 + { 272 + pte_t *pte, *target_pte; 273 + unsigned int level; 274 + 275 + pte = lookup_address(VSYSCALL_ADDR, &level); 276 + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) 277 + return; 278 + 279 + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); 280 + if (WARN_ON(!target_pte)) 281 + return; 282 + 283 + *target_pte = *pte; 284 + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); 285 + } 286 + #else 287 + static void __init 
pti_setup_vsyscall(void) { } 288 + #endif 289 + 290 + static void __init 291 + pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) 292 + { 293 + unsigned long addr; 294 + 295 + /* 296 + * Clone the populated PMDs which cover start to end. These PMD areas 297 + * can have holes. 298 + */ 299 + for (addr = start; addr < end; addr += PMD_SIZE) { 300 + pmd_t *pmd, *target_pmd; 301 + pgd_t *pgd; 302 + p4d_t *p4d; 303 + pud_t *pud; 304 + 305 + pgd = pgd_offset_k(addr); 306 + if (WARN_ON(pgd_none(*pgd))) 307 + return; 308 + p4d = p4d_offset(pgd, addr); 309 + if (WARN_ON(p4d_none(*p4d))) 310 + return; 311 + pud = pud_offset(p4d, addr); 312 + if (pud_none(*pud)) 313 + continue; 314 + pmd = pmd_offset(pud, addr); 315 + if (pmd_none(*pmd)) 316 + continue; 317 + 318 + target_pmd = pti_user_pagetable_walk_pmd(addr); 319 + if (WARN_ON(!target_pmd)) 320 + return; 321 + 322 + /* 323 + * Copy the PMD. That is, the kernelmode and usermode 324 + * tables will share the last-level page tables of this 325 + * address range 326 + */ 327 + *target_pmd = pmd_clear_flags(*pmd, clear); 328 + } 329 + } 330 + 331 + /* 332 + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a 333 + * next-level entry on 5-level systems). 334 + */ 335 + static void __init pti_clone_p4d(unsigned long addr) 336 + { 337 + p4d_t *kernel_p4d, *user_p4d; 338 + pgd_t *kernel_pgd; 339 + 340 + user_p4d = pti_user_pagetable_walk_p4d(addr); 341 + kernel_pgd = pgd_offset_k(addr); 342 + kernel_p4d = p4d_offset(kernel_pgd, addr); 343 + *user_p4d = *kernel_p4d; 344 + } 345 + 346 + /* 347 + * Clone the CPU_ENTRY_AREA into the user space visible page table. 
348 + */ 349 + static void __init pti_clone_user_shared(void) 350 + { 351 + pti_clone_p4d(CPU_ENTRY_AREA_BASE); 352 + } 353 + 354 + /* 355 + * Clone the ESPFIX P4D into the user space visible page table 356 + */ 357 + static void __init pti_setup_espfix64(void) 358 + { 359 + #ifdef CONFIG_X86_ESPFIX64 360 + pti_clone_p4d(ESPFIX_BASE_ADDR); 361 + #endif 362 + } 363 + 364 + /* 365 + * Clone the populated PMDs of the entry and irqentry text and force it RO. 366 + */ 367 + static void __init pti_clone_entry_text(void) 368 + { 369 + pti_clone_pmds((unsigned long) __entry_text_start, 370 + (unsigned long) __irqentry_text_end, _PAGE_RW); 371 + } 372 + 373 + /* 374 + * Initialize kernel page table isolation 375 + */ 376 + void __init pti_init(void) 377 + { 378 + if (!static_cpu_has(X86_FEATURE_PTI)) 379 + return; 380 + 381 + pr_info("enabled\n"); 382 + 383 + pti_clone_user_shared(); 384 + pti_clone_entry_text(); 385 + pti_setup_espfix64(); 386 + pti_setup_vsyscall(); 387 + }

+56 -2

arch/x86/mm/tlb.c

··· 28 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 29 29 */ 30 30 31 + /* 32 + * We get here when we do something requiring a TLB invalidation 33 + * but could not go invalidate all of the contexts. We do the 34 + * necessary invalidation by clearing out the 'ctx_id' which 35 + * forces a TLB flush when the context is loaded. 36 + */ 37 + void clear_asid_other(void) 38 + { 39 + u16 asid; 40 + 41 + /* 42 + * This is only expected to be set if we have disabled 43 + * kernel _PAGE_GLOBAL pages. 44 + */ 45 + if (!static_cpu_has(X86_FEATURE_PTI)) { 46 + WARN_ON_ONCE(1); 47 + return; 48 + } 49 + 50 + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 51 + /* Do not need to flush the current asid */ 52 + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) 53 + continue; 54 + /* 55 + * Make sure the next time we go to switch to 56 + * this asid, we do a flush: 57 + */ 58 + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); 59 + } 60 + this_cpu_write(cpu_tlbstate.invalidate_other, false); 61 + } 62 + 31 63 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 32 64 33 65 ··· 73 41 *need_flush = true; 74 42 return; 75 43 } 44 + 45 + if (this_cpu_read(cpu_tlbstate.invalidate_other)) 46 + clear_asid_other(); 76 47 77 48 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 78 49 if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != ··· 98 63 this_cpu_write(cpu_tlbstate.next_asid, 1); 99 64 } 100 65 *need_flush = true; 66 + } 67 + 68 + static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) 69 + { 70 + unsigned long new_mm_cr3; 71 + 72 + if (need_flush) { 73 + invalidate_user_asid(new_asid); 74 + new_mm_cr3 = build_cr3(pgdir, new_asid); 75 + } else { 76 + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); 77 + } 78 + 79 + /* 80 + * Caution: many callers of this function expect 81 + * that load_cr3() is serializing and orders TLB 82 + * fills with respect to the mm_cpumask writes. 
83 + */ 84 + write_cr3(new_mm_cr3); 101 85 } 102 86 103 87 void leave_mm(int cpu) ··· 249 195 if (need_flush) { 250 196 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 251 197 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 252 - write_cr3(build_cr3(next->pgd, new_asid)); 198 + load_new_mm_cr3(next->pgd, new_asid, true); 253 199 254 200 /* 255 201 * NB: This gets called via leave_mm() in the idle path ··· 262 208 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 263 209 } else { 264 210 /* The new ASID is already up to date. */ 265 - write_cr3(build_cr3_noflush(next->pgd, new_asid)); 211 + load_new_mm_cr3(next->pgd, new_asid, false); 266 212 267 213 /* See above wrt _rcuidle. */ 268 214 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);

+4 -1

arch/x86/platform/efi/efi_64.c

··· 196 196 * because we want to avoid inserting EFI region mappings (EFI_VA_END 197 197 * to EFI_VA_START) into the standard kernel page tables. Everything 198 198 * else can be shared, see efi_sync_low_kernel_mappings(). 199 + * 200 + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the 201 + * allocation. 199 202 */ 200 203 int __init efi_alloc_page_tables(void) 201 204 { ··· 211 208 return 0; 212 209 213 210 gfp_mask = GFP_KERNEL | __GFP_ZERO; 214 - efi_pgd = (pgd_t *)__get_free_page(gfp_mask); 211 + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); 215 212 if (!efi_pgd) 216 213 return -ENOMEM; 217 214

+11

include/linux/pti.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef _INCLUDE_PTI_H 3 + #define _INCLUDE_PTI_H 4 + 5 + #ifdef CONFIG_PAGE_TABLE_ISOLATION 6 + #include <asm/pti.h> 7 + #else 8 + static inline void pti_init(void) { } 9 + #endif 10 + 11 + #endif

+3

init/main.c

··· 75 75 #include <linux/slab.h> 76 76 #include <linux/perf_event.h> 77 77 #include <linux/ptrace.h> 78 + #include <linux/pti.h> 78 79 #include <linux/blkdev.h> 79 80 #include <linux/elevator.h> 80 81 #include <linux/sched_clock.h> ··· 507 506 ioremap_huge_init(); 508 507 /* Should be run before the first non-init thread is created */ 509 508 init_espfix_bsp(); 509 + /* Should be run after espfix64 is set up. */ 510 + pti_init(); 510 511 } 511 512 512 513 asmlinkage __visible void __init start_kernel(void)

+10

security/Kconfig

··· 54 54 implement socket and networking access controls. 55 55 If you are unsure how to answer this question, answer N. 56 56 57 + config PAGE_TABLE_ISOLATION 58 + bool "Remove the kernel mapping in user mode" 59 + depends on X86_64 && !UML 60 + help 61 + This feature reduces the number of hardware side channels by 62 + ensuring that the majority of kernel addresses are not mapped 63 + into userspace. 64 + 65 + See Documentation/x86/pagetable-isolation.txt for more details. 66 + 57 67 config SECURITY_INFINIBAND 58 68 bool "Infiniband Security Hooks" 59 69 depends on SECURITY && INFINIBAND

+1 -2

tools/testing/selftests/x86/ldt_gdt.c

··· 122 122 * NB: Different Linux versions do different things with the 123 123 * accessed bit in set_thread_area(). 124 124 */ 125 - if (ar != expected_ar && 126 - (ldt || ar != (expected_ar | AR_ACCESSED))) { 125 + if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { 127 126 printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", 128 127 (ldt ? "LDT" : "GDT"), index, ar, expected_ar); 129 128 nerrs++;

Configure Feed

Configure Feed