Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 PTI preparatory patches from Thomas Gleixner:
"Todays Advent calendar window contains twentyfour easy to digest
patches. The original plan was to have twenty three matching the date,
but a late fixup made that moot.

- Move the cpu_entry_area mapping out of the fixmap into a separate
address space. That's necessary because the fixmap becomes too big
with NR_CPUS=8192, and this already caused subtle and hard-to-diagnose
failures.

The topmost patch is fresh from today and cures a brain slip of
that tall grumpy German greybeard, who ignored the intricacies of
32-bit wraparounds.

- Limit the number of CPUs on 32-bit to 64. That's insanely big already,
but at least it's small enough to prevent address space issues with
the cpu_entry_area map, which have been observed and debugged with
the fixmap code.

- A few TLB flush fixes in various places plus documentation which of
the TLB functions should be used for what.

- Rename the SYSENTER stack to CPU_ENTRY_AREA stack as it is used for
more than sysenter now and keeping the name makes backtraces
confusing.

- Prevent LDT inheritance on exec() by moving it to arch_dup_mmap(),
which is only invoked on fork() (a caller-side sketch of the new
return-value contract follows the commit log below).

- Make vsyscall more robust.

- A few fixes and cleanups of the debug_pagetables code: check
PAGE_PRESENT instead of checking the PTE for 0, and clean up the
C89-style initialization of the address hint array, which had already
gone out of sync with the index enums.

- Move the ESPFIX init to a different place to prepare for PTI.

- Several code moves with no functional change to make PTI
integration simpler and header files less convoluted.

- Documentation fixes and clarifications"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit
init: Invoke init_espfix_bsp() from mm_init()
x86/cpu_entry_area: Move it out of the fixmap
x86/cpu_entry_area: Move it to a separate unit
x86/mm: Create asm/invpcid.h
x86/mm: Put MMU to hardware ASID translation in one place
x86/mm: Remove hard-coded ASID limit checks
x86/mm: Move the CR3 construction functions to tlbflush.h
x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what
x86/mm: Remove superfluous barriers
x86/mm: Use __flush_tlb_one() for kernel memory
x86/microcode: Dont abuse the TLB-flush interface
x86/uv: Use the right TLB-flush API
x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack
x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation
x86/mm/64: Improve the memory map documentation
x86/ldt: Prevent LDT inheritance on exec
x86/ldt: Rework locking
arch, mm: Allow arch_dup_mmap() to fail
x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode
...
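
The "arch, mm: Allow arch_dup_mmap() to fail" change in this series turns
arch_dup_mmap() into an int-returning hook, so the generic fork path has to
check and propagate its result. Below is a minimal caller-side sketch of that
contract; it is schematic only (the dup_mmap_example() wrapper and its error
path are illustrative, not the actual kernel/fork.c hunk):

/*
 * Schematic caller of the int-returning arch_dup_mmap() hook. The wrapper
 * name and the surrounding error handling are illustrative; the point is
 * the check-and-propagate pattern this series introduces.
 */
static int dup_mmap_example(struct mm_struct *mm, struct mm_struct *oldmm)
{
	int ret;

	/* ... copy the VMAs from oldmm into mm ... */

	/* On x86 this now calls ldt_dup_context(), which can fail with -ENOMEM. */
	ret = arch_dup_mmap(oldmm, mm);
	if (ret)
		return ret;	/* fork() fails cleanly instead of ignoring the error */

	return 0;
}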

+636 -468
+11 -13
Documentation/x86/x86_64/mm.txt
··· 1 1 2 - <previous description obsolete, deleted> 3 - 4 2 Virtual memory map with 4 level page tables: 5 3 6 4 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm ··· 12 14 ... unused hole ... 13 15 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) 14 16 ... unused hole ... 17 + fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping 15 18 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks 16 19 ... unused hole ... 17 20 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space 18 21 ... unused hole ... 19 22 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 20 - ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) 21 - ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls 23 + ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable) 24 + [fixmap start] - ffffffffff5fffff kernel-internal fixmap range 25 + ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI 22 26 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole 23 27 24 28 Virtual memory map with 5 level page tables: ··· 36 36 ... unused hole ... 37 37 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) 38 38 ... unused hole ... 39 + fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping 39 40 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks 40 41 ... unused hole ... 41 42 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space 42 43 ... unused hole ... 43 44 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 44 - ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space 45 - ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls 45 + ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space 46 + [fixmap start] - ffffffffff5fffff kernel-internal fixmap range 47 + ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI 46 48 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole 47 49 48 50 Architecture defines a 64-bit virtual address. Implementations can support 49 51 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 50 - through to the most-significant implemented bit are set to either all ones 51 - or all zero. This causes hole between user space and kernel addresses. 52 + through to the most-significant implemented bit are sign extended. 53 + This causes hole between user space and kernel addresses if you interpret them 54 + as unsigned. 52 55 53 56 The direct mapping covers all memory in the system up to the highest 54 57 memory address (this means in some cases it can also include PCI memory ··· 60 57 vmalloc space is lazily synchronized into the different PML4/PML5 pages of 61 58 the processes using the page fault handler, with init_top_pgt as 62 59 reference. 63 - 64 - Current X86-64 implementations support up to 46 bits of address space (64 TB), 65 - which is our current limit. This expands into MBZ space in the page tables. 66 60 67 61 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual 68 62 memory window (this size is arbitrary, it can be raised later if needed). ··· 72 72 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all 73 73 physical memory, vmalloc/ioremap space and virtual memory map are randomized. 74 74 Their order is preserved but their base will be offset early at boot time. 75 - 76 - -Andi Kleen, Jul 2004
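
The reworded paragraph in this hunk states that bits 63 down to the
most-significant implemented bit are sign extended, which is what creates the
hole between user-space and kernel addresses when they are interpreted as
unsigned. A small self-contained illustration in plain C (assuming the 48
implemented bits of 4-level paging and arithmetic right shift, as on gcc/clang):

#include <stdint.h>
#include <stdio.h>

/* Assumed: 48 implemented virtual-address bits, i.e. 4-level paging. */
#define VA_BITS 48

/* Sign-extend a VA_BITS-wide address into its canonical 64-bit form. */
static uint64_t canonical(uint64_t va)
{
	unsigned int shift = 64 - VA_BITS;

	return (uint64_t)((int64_t)(va << shift) >> shift);
}

int main(void)
{
	/* Highest user address: bit 47 is clear, the upper bits stay zero. */
	printf("%016llx\n", (unsigned long long)canonical(0x00007fffffffffffULL));
	/* Lowest kernel address: bit 47 is set, the upper bits become ones. */
	printf("%016llx\n", (unsigned long long)canonical(0x0000800000000000ULL));
	return 0;
}

This prints 00007fffffffffff and ffff800000000000, the boundary used throughout
the map above; with 57 implemented bits (5-level paging) the non-canonical hole
shrinks accordingly.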
+3 -2
arch/powerpc/include/asm/mmu_context.h
··· 160 160 #endif 161 161 } 162 162 163 - static inline void arch_dup_mmap(struct mm_struct *oldmm, 164 - struct mm_struct *mm) 163 + static inline int arch_dup_mmap(struct mm_struct *oldmm, 164 + struct mm_struct *mm) 165 165 { 166 + return 0; 166 167 } 167 168 168 169 #ifndef CONFIG_PPC_BOOK3S_64
+2 -1
arch/um/include/asm/mmu_context.h
··· 15 15 /* 16 16 * Needed since we do not use the asm-generic/mm_hooks.h: 17 17 */ 18 - static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 18 + static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 19 19 { 20 20 uml_setup_stubs(mm); 21 + return 0; 21 22 } 22 23 extern void arch_exit_mmap(struct mm_struct *mm); 23 24 static inline void arch_unmap(struct mm_struct *mm,
+3 -2
arch/unicore32/include/asm/mmu_context.h
··· 81 81 } \ 82 82 } while (0) 83 83 84 - static inline void arch_dup_mmap(struct mm_struct *oldmm, 85 - struct mm_struct *mm) 84 + static inline int arch_dup_mmap(struct mm_struct *oldmm, 85 + struct mm_struct *mm) 86 86 { 87 + return 0; 87 88 } 88 89 89 90 static inline void arch_unmap(struct mm_struct *mm,
+2 -1
arch/x86/Kconfig
··· 926 926 config NR_CPUS 927 927 int "Maximum number of CPUs" if SMP && !MAXSMP 928 928 range 2 8 if SMP && X86_32 && !X86_BIGSMP 929 - range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK 929 + range 2 64 if SMP && X86_32 && X86_BIGSMP 930 + range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 930 931 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 931 932 default "1" if !SMP 932 933 default "8192" if MAXSMP
+6 -6
arch/x86/entry/entry_32.S
··· 942 942 943 943 /* Are we currently on the SYSENTER stack? */ 944 944 movl PER_CPU_VAR(cpu_entry_area), %ecx 945 - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx 946 - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ 947 - cmpl $SIZEOF_SYSENTER_stack, %ecx 945 + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx 946 + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ 947 + cmpl $SIZEOF_entry_stack, %ecx 948 948 jb .Ldebug_from_sysenter_stack 949 949 950 950 TRACE_IRQS_OFF ··· 986 986 987 987 /* Are we currently on the SYSENTER stack? */ 988 988 movl PER_CPU_VAR(cpu_entry_area), %ecx 989 - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx 990 - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ 991 - cmpl $SIZEOF_SYSENTER_stack, %ecx 989 + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx 990 + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ 991 + cmpl $SIZEOF_entry_stack, %ecx 992 992 jb .Lnmi_from_sysenter_stack 993 993 994 994 /* Not on SYSENTER stack. */
+2 -2
arch/x86/entry/entry_64.S
··· 158 158 _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) 159 159 160 160 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ 161 - #define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ 162 - SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA 161 + #define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ 162 + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA 163 163 164 164 ENTRY(entry_SYSCALL_64_trampoline) 165 165 UNWIND_HINT_EMPTY
+37 -1
arch/x86/entry/vsyscall/vsyscall_64.c
··· 37 37 #include <asm/unistd.h> 38 38 #include <asm/fixmap.h> 39 39 #include <asm/traps.h> 40 + #include <asm/paravirt.h> 40 41 41 42 #define CREATE_TRACE_POINTS 42 43 #include "vsyscall_trace.h" ··· 138 137 */ 139 138 140 139 WARN_ON_ONCE(address != regs->ip); 140 + 141 + /* This should be unreachable in NATIVE mode. */ 142 + if (WARN_ON(vsyscall_mode == NATIVE)) 143 + return false; 141 144 142 145 if (vsyscall_mode == NONE) { 143 146 warn_bad_vsyscall(KERN_INFO, regs, ··· 334 329 return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; 335 330 } 336 331 332 + /* 333 + * The VSYSCALL page is the only user-accessible page in the kernel address 334 + * range. Normally, the kernel page tables can have _PAGE_USER clear, but 335 + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls 336 + * are enabled. 337 + * 338 + * Some day we may create a "minimal" vsyscall mode in which we emulate 339 + * vsyscalls but leave the page not present. If so, we skip calling 340 + * this. 341 + */ 342 + static void __init set_vsyscall_pgtable_user_bits(void) 343 + { 344 + pgd_t *pgd; 345 + p4d_t *p4d; 346 + pud_t *pud; 347 + pmd_t *pmd; 348 + 349 + pgd = pgd_offset_k(VSYSCALL_ADDR); 350 + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); 351 + p4d = p4d_offset(pgd, VSYSCALL_ADDR); 352 + #if CONFIG_PGTABLE_LEVELS >= 5 353 + p4d->p4d |= _PAGE_USER; 354 + #endif 355 + pud = pud_offset(p4d, VSYSCALL_ADDR); 356 + set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); 357 + pmd = pmd_offset(pud, VSYSCALL_ADDR); 358 + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); 359 + } 360 + 337 361 void __init map_vsyscall(void) 338 362 { 339 363 extern char __vsyscall_page; 340 364 unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); 341 365 342 - if (vsyscall_mode != NONE) 366 + if (vsyscall_mode != NONE) { 343 367 __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, 344 368 vsyscall_mode == NATIVE 345 369 ? PAGE_KERNEL_VSYSCALL 346 370 : PAGE_KERNEL_VVAR); 371 + set_vsyscall_pgtable_user_bits(); 372 + } 347 373 348 374 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != 349 375 (unsigned long)VSYSCALL_ADDR);
+68
arch/x86/include/asm/cpu_entry_area.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #ifndef _ASM_X86_CPU_ENTRY_AREA_H 4 + #define _ASM_X86_CPU_ENTRY_AREA_H 5 + 6 + #include <linux/percpu-defs.h> 7 + #include <asm/processor.h> 8 + 9 + /* 10 + * cpu_entry_area is a percpu region that contains things needed by the CPU 11 + * and early entry/exit code. Real types aren't used for all fields here 12 + * to avoid circular header dependencies. 13 + * 14 + * Every field is a virtual alias of some other allocated backing store. 15 + * There is no direct allocation of a struct cpu_entry_area. 16 + */ 17 + struct cpu_entry_area { 18 + char gdt[PAGE_SIZE]; 19 + 20 + /* 21 + * The GDT is just below entry_stack and thus serves (on x86_64) as 22 + * a a read-only guard page. 23 + */ 24 + struct entry_stack_page entry_stack_page; 25 + 26 + /* 27 + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because 28 + * we need task switches to work, and task switches write to the TSS. 29 + */ 30 + struct tss_struct tss; 31 + 32 + char entry_trampoline[PAGE_SIZE]; 33 + 34 + #ifdef CONFIG_X86_64 35 + /* 36 + * Exception stacks used for IST entries. 37 + * 38 + * In the future, this should have a separate slot for each stack 39 + * with guard pages between them. 40 + */ 41 + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; 42 + #endif 43 + }; 44 + 45 + #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) 46 + #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) 47 + 48 + DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); 49 + 50 + extern void setup_cpu_entry_areas(void); 51 + extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); 52 + 53 + #define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE 54 + #define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) 55 + 56 + #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) 57 + 58 + #define CPU_ENTRY_AREA_MAP_SIZE \ 59 + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) 60 + 61 + extern struct cpu_entry_area *get_cpu_entry_area(int cpu); 62 + 63 + static inline struct entry_stack *cpu_entry_stack(int cpu) 64 + { 65 + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; 66 + } 67 + 68 + #endif
+1
arch/x86/include/asm/desc.h
··· 7 7 #include <asm/mmu.h> 8 8 #include <asm/fixmap.h> 9 9 #include <asm/irq_vectors.h> 10 + #include <asm/cpu_entry_area.h> 10 11 11 12 #include <linux/smp.h> 12 13 #include <linux/percpu.h>
+4 -3
arch/x86/include/asm/espfix.h
··· 2 2 #ifndef _ASM_X86_ESPFIX_H 3 3 #define _ASM_X86_ESPFIX_H 4 4 5 - #ifdef CONFIG_X86_64 5 + #ifdef CONFIG_X86_ESPFIX64 6 6 7 7 #include <asm/percpu.h> 8 8 ··· 11 11 12 12 extern void init_espfix_bsp(void); 13 13 extern void init_espfix_ap(int cpu); 14 - 15 - #endif /* CONFIG_X86_64 */ 14 + #else 15 + static inline void init_espfix_ap(int cpu) { } 16 + #endif 16 17 17 18 #endif /* _ASM_X86_ESPFIX_H */
+1 -70
arch/x86/include/asm/fixmap.h
··· 45 45 #endif 46 46 47 47 /* 48 - * cpu_entry_area is a percpu region in the fixmap that contains things 49 - * needed by the CPU and early entry/exit code. Real types aren't used 50 - * for all fields here to avoid circular header dependencies. 51 - * 52 - * Every field is a virtual alias of some other allocated backing store. 53 - * There is no direct allocation of a struct cpu_entry_area. 54 - */ 55 - struct cpu_entry_area { 56 - char gdt[PAGE_SIZE]; 57 - 58 - /* 59 - * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as 60 - * a a read-only guard page. 61 - */ 62 - struct SYSENTER_stack_page SYSENTER_stack_page; 63 - 64 - /* 65 - * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because 66 - * we need task switches to work, and task switches write to the TSS. 67 - */ 68 - struct tss_struct tss; 69 - 70 - char entry_trampoline[PAGE_SIZE]; 71 - 72 - #ifdef CONFIG_X86_64 73 - /* 74 - * Exception stacks used for IST entries. 75 - * 76 - * In the future, this should have a separate slot for each stack 77 - * with guard pages between them. 78 - */ 79 - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; 80 - #endif 81 - }; 82 - 83 - #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) 84 - 85 - extern void setup_cpu_entry_areas(void); 86 - 87 - /* 88 48 * Here we define all the compile-time 'special' virtual 89 49 * addresses. The point is to have a constant address at 90 50 * compile time, but to set the physical address only ··· 83 123 FIX_IO_APIC_BASE_0, 84 124 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, 85 125 #endif 86 - FIX_RO_IDT, /* Virtual mapping for read-only IDT */ 87 126 #ifdef CONFIG_X86_32 88 127 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ 89 128 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ··· 98 139 #ifdef CONFIG_X86_INTEL_MID 99 140 FIX_LNW_VRTC, 100 141 #endif 101 - /* Fixmap entries to remap the GDTs, one per processor. 
*/ 102 - FIX_CPU_ENTRY_AREA_TOP, 103 - FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, 104 142 105 143 #ifdef CONFIG_ACPI_APEI_GHES 106 144 /* Used for GHES mapping from assorted contexts */ ··· 138 182 extern void reserve_top_address(unsigned long reserve); 139 183 140 184 #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) 141 - #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) 185 + #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) 142 186 143 187 extern int fixmaps_set; 144 188 ··· 185 229 186 230 void __early_set_fixmap(enum fixed_addresses idx, 187 231 phys_addr_t phys, pgprot_t flags); 188 - 189 - static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) 190 - { 191 - BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); 192 - 193 - return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; 194 - } 195 - 196 - #define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ 197 - BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ 198 - __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ 199 - }) 200 - 201 - #define get_cpu_entry_area_index(cpu, field) \ 202 - __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) 203 - 204 - static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) 205 - { 206 - return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); 207 - } 208 - 209 - static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) 210 - { 211 - return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; 212 - } 213 232 214 233 #endif /* !__ASSEMBLY__ */ 215 234 #endif /* _ASM_X86_FIXMAP_H */
+53
arch/x86/include/asm/invpcid.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_INVPCID 3 + #define _ASM_X86_INVPCID 4 + 5 + static inline void __invpcid(unsigned long pcid, unsigned long addr, 6 + unsigned long type) 7 + { 8 + struct { u64 d[2]; } desc = { { pcid, addr } }; 9 + 10 + /* 11 + * The memory clobber is because the whole point is to invalidate 12 + * stale TLB entries and, especially if we're flushing global 13 + * mappings, we don't want the compiler to reorder any subsequent 14 + * memory accesses before the TLB flush. 15 + * 16 + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and 17 + * invpcid (%rcx), %rax in long mode. 18 + */ 19 + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" 20 + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); 21 + } 22 + 23 + #define INVPCID_TYPE_INDIV_ADDR 0 24 + #define INVPCID_TYPE_SINGLE_CTXT 1 25 + #define INVPCID_TYPE_ALL_INCL_GLOBAL 2 26 + #define INVPCID_TYPE_ALL_NON_GLOBAL 3 27 + 28 + /* Flush all mappings for a given pcid and addr, not including globals. */ 29 + static inline void invpcid_flush_one(unsigned long pcid, 30 + unsigned long addr) 31 + { 32 + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); 33 + } 34 + 35 + /* Flush all mappings for a given PCID, not including globals. */ 36 + static inline void invpcid_flush_single_context(unsigned long pcid) 37 + { 38 + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); 39 + } 40 + 41 + /* Flush all mappings, including globals, for all PCIDs. */ 42 + static inline void invpcid_flush_all(void) 43 + { 44 + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); 45 + } 46 + 47 + /* Flush all mappings for all PCIDs except globals. */ 48 + static inline void invpcid_flush_all_nonglobals(void) 49 + { 50 + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 51 + } 52 + 53 + #endif /* _ASM_X86_INVPCID */
+3 -1
arch/x86/include/asm/mmu.h
··· 3 3 #define _ASM_X86_MMU_H 4 4 5 5 #include <linux/spinlock.h> 6 + #include <linux/rwsem.h> 6 7 #include <linux/mutex.h> 7 8 #include <linux/atomic.h> 8 9 ··· 28 27 atomic64_t tlb_gen; 29 28 30 29 #ifdef CONFIG_MODIFY_LDT_SYSCALL 31 - struct ldt_struct *ldt; 30 + struct rw_semaphore ldt_usr_sem; 31 + struct ldt_struct *ldt; 32 32 #endif 33 33 34 34 #ifdef CONFIG_X86_64
+18 -36
arch/x86/include/asm/mmu_context.h
··· 57 57 /* 58 58 * Used for LDT copy/destruction. 59 59 */ 60 - int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); 60 + static inline void init_new_context_ldt(struct mm_struct *mm) 61 + { 62 + mm->context.ldt = NULL; 63 + init_rwsem(&mm->context.ldt_usr_sem); 64 + } 65 + int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); 61 66 void destroy_context_ldt(struct mm_struct *mm); 62 67 #else /* CONFIG_MODIFY_LDT_SYSCALL */ 63 - static inline int init_new_context_ldt(struct task_struct *tsk, 64 - struct mm_struct *mm) 68 + static inline void init_new_context_ldt(struct mm_struct *mm) { } 69 + static inline int ldt_dup_context(struct mm_struct *oldmm, 70 + struct mm_struct *mm) 65 71 { 66 72 return 0; 67 73 } ··· 138 132 static inline int init_new_context(struct task_struct *tsk, 139 133 struct mm_struct *mm) 140 134 { 135 + mutex_init(&mm->context.lock); 136 + 141 137 mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); 142 138 atomic64_set(&mm->context.tlb_gen, 0); 143 139 144 - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 140 + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 145 141 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 146 142 /* pkey 0 is the default and always allocated */ 147 143 mm->context.pkey_allocation_map = 0x1; 148 144 /* -1 means unallocated or invalid */ 149 145 mm->context.execute_only_pkey = -1; 150 146 } 151 - #endif 152 - return init_new_context_ldt(tsk, mm); 147 + #endif 148 + init_new_context_ldt(mm); 149 + return 0; 153 150 } 154 151 static inline void destroy_context(struct mm_struct *mm) 155 152 { ··· 185 176 } while (0) 186 177 #endif 187 178 188 - static inline void arch_dup_mmap(struct mm_struct *oldmm, 189 - struct mm_struct *mm) 179 + static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 190 180 { 191 181 paravirt_arch_dup_mmap(oldmm, mm); 182 + return ldt_dup_context(oldmm, mm); 192 183 } 193 184 194 185 static inline void arch_exit_mmap(struct mm_struct *mm) ··· 291 282 } 292 283 293 284 /* 294 - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID 295 - * bits. This serves two purposes. It prevents a nasty situation in 296 - * which PCID-unaware code saves CR3, loads some other value (with PCID 297 - * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if 298 - * the saved ASID was nonzero. It also means that any bugs involving 299 - * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger 300 - * deterministically. 301 - */ 302 - 303 - static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) 304 - { 305 - if (static_cpu_has(X86_FEATURE_PCID)) { 306 - VM_WARN_ON_ONCE(asid > 4094); 307 - return __sme_pa(mm->pgd) | (asid + 1); 308 - } else { 309 - VM_WARN_ON_ONCE(asid != 0); 310 - return __sme_pa(mm->pgd); 311 - } 312 - } 313 - 314 - static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) 315 - { 316 - VM_WARN_ON_ONCE(asid > 4094); 317 - return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; 318 - } 319 - 320 - /* 321 285 * This can be used from process context to figure out what the value of 322 286 * CR3 is without needing to do a (slow) __read_cr3(). 
323 287 * ··· 299 317 */ 300 318 static inline unsigned long __get_current_cr3_fast(void) 301 319 { 302 - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), 320 + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, 303 321 this_cpu_read(cpu_tlbstate.loaded_mm_asid)); 304 322 305 323 /* For now, be very restrictive about when this can be called. */
+12 -3
arch/x86/include/asm/pgtable_32_types.h
··· 38 38 #define LAST_PKMAP 1024 39 39 #endif 40 40 41 - #define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ 42 - & PMD_MASK) 41 + /* 42 + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c 43 + * to avoid include recursion hell 44 + */ 45 + #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) 46 + 47 + #define CPU_ENTRY_AREA_BASE \ 48 + ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) 49 + 50 + #define PKMAP_BASE \ 51 + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) 43 52 44 53 #ifdef CONFIG_HIGHMEM 45 54 # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) 46 55 #else 47 - # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) 56 + # define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) 48 57 #endif 49 58 50 59 #define MODULES_VADDR VMALLOC_START
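
For scale, the CPU_ENTRY_AREA_PAGES define in this hunk reserves 40 pages of
virtual space per CPU below the fixmap, which is why the Kconfig hunk earlier
caps 32-bit BIGSMP at 64 CPUs. A standalone back-of-the-envelope check
(assuming 4 KiB pages; the CPU counts are just the interesting Kconfig limits):

#include <stdio.h>

#define PAGE_SIZE 4096UL
/* From pgtable_32_types.h above: 40 entry-area pages per CPU. */
#define CPU_ENTRY_AREA_PAGES(nr_cpus) ((nr_cpus) * 40UL)

int main(void)
{
	unsigned long cpus[] = { 64, 8192 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned long bytes = CPU_ENTRY_AREA_PAGES(cpus[i]) * PAGE_SIZE;

		printf("NR_CPUS=%5lu -> %6lu pages = %4lu MiB of virtual space\n",
		       cpus[i], CPU_ENTRY_AREA_PAGES(cpus[i]), bytes >> 20);
	}
	return 0;
}

64 CPUs costs 10 MiB of 32-bit kernel virtual address space; 8192 CPUs would
need 1.25 GiB, which simply does not fit.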
+28 -19
arch/x86/include/asm/pgtable_64_types.h
··· 76 76 #define PGDIR_MASK (~(PGDIR_SIZE - 1)) 77 77 78 78 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ 79 - #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 79 + #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) 80 + 80 81 #ifdef CONFIG_X86_5LEVEL 81 - #define VMALLOC_SIZE_TB _AC(16384, UL) 82 - #define __VMALLOC_BASE _AC(0xff92000000000000, UL) 83 - #define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) 82 + # define VMALLOC_SIZE_TB _AC(16384, UL) 83 + # define __VMALLOC_BASE _AC(0xff92000000000000, UL) 84 + # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) 84 85 #else 85 - #define VMALLOC_SIZE_TB _AC(32, UL) 86 - #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) 87 - #define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) 86 + # define VMALLOC_SIZE_TB _AC(32, UL) 87 + # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) 88 + # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) 88 89 #endif 90 + 89 91 #ifdef CONFIG_RANDOMIZE_MEMORY 90 - #define VMALLOC_START vmalloc_base 91 - #define VMEMMAP_START vmemmap_base 92 + # define VMALLOC_START vmalloc_base 93 + # define VMEMMAP_START vmemmap_base 92 94 #else 93 - #define VMALLOC_START __VMALLOC_BASE 94 - #define VMEMMAP_START __VMEMMAP_BASE 95 + # define VMALLOC_START __VMALLOC_BASE 96 + # define VMEMMAP_START __VMEMMAP_BASE 95 97 #endif /* CONFIG_RANDOMIZE_MEMORY */ 96 - #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) 97 - #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) 98 + 99 + #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) 100 + 101 + #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) 98 102 /* The module sections ends with the start of the fixmap */ 99 - #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) 100 - #define MODULES_LEN (MODULES_END - MODULES_VADDR) 101 - #define ESPFIX_PGD_ENTRY _AC(-2, UL) 102 - #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) 103 - #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) 104 - #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) 103 + #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) 104 + #define MODULES_LEN (MODULES_END - MODULES_VADDR) 105 + 106 + #define ESPFIX_PGD_ENTRY _AC(-2, UL) 107 + #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) 108 + 109 + #define CPU_ENTRY_AREA_PGD _AC(-3, UL) 110 + #define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) 111 + 112 + #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) 113 + #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) 105 114 106 115 #define EARLY_DYNAMIC_PAGE_TABLES 64 107 116
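
The new CPU_ENTRY_AREA_PGD define places the area in top-level slot -3, one
slot below the ESPFIX area. A quick standalone check (P4D_SHIFT is 39 on
x86-64, each slot covering 512 GiB; assumes LP64, i.e. 64-bit unsigned long)
that this lands exactly on the fffffe8000000000 range quoted in the mm.txt
hunk above:

#include <stdio.h>

#define P4D_SHIFT		39	/* each top-level slot covers 512 GiB */
#define CPU_ENTRY_AREA_PGD	(-3UL)
#define ESPFIX_PGD_ENTRY	(-2UL)

int main(void)
{
	unsigned long cea    = CPU_ENTRY_AREA_PGD << P4D_SHIFT;
	unsigned long espfix = ESPFIX_PGD_ENTRY << P4D_SHIFT;

	printf("CPU_ENTRY_AREA_BASE = %016lx\n", cea);	/* fffffe8000000000 */
	printf("cpu_entry_area end  = %016lx\n", cea + (1UL << P4D_SHIFT) - 1);
	printf("ESPFIX_BASE_ADDR    = %016lx\n", espfix);	/* ffffff0000000000 */
	return 0;
}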
+3 -3
arch/x86/include/asm/processor.h
··· 337 337 #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) 338 338 #define INVALID_IO_BITMAP_OFFSET 0x8000 339 339 340 - struct SYSENTER_stack { 340 + struct entry_stack { 341 341 unsigned long words[64]; 342 342 }; 343 343 344 - struct SYSENTER_stack_page { 345 - struct SYSENTER_stack stack; 344 + struct entry_stack_page { 345 + struct entry_stack stack; 346 346 } __aligned(PAGE_SIZE); 347 347 348 348 struct tss_struct {
+2 -2
arch/x86/include/asm/stacktrace.h
··· 16 16 STACK_TYPE_TASK, 17 17 STACK_TYPE_IRQ, 18 18 STACK_TYPE_SOFTIRQ, 19 - STACK_TYPE_SYSENTER, 19 + STACK_TYPE_ENTRY, 20 20 STACK_TYPE_EXCEPTION, 21 21 STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, 22 22 }; ··· 29 29 bool in_task_stack(unsigned long *stack, struct task_struct *task, 30 30 struct stack_info *info); 31 31 32 - bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); 32 + bool in_entry_stack(unsigned long *stack, struct stack_info *info); 33 33 34 34 int get_stack_info(unsigned long *stack, struct task_struct *task, 35 35 struct stack_info *info, unsigned long *visit_mask);
+77 -69
arch/x86/include/asm/tlbflush.h
··· 9 9 #include <asm/cpufeature.h> 10 10 #include <asm/special_insns.h> 11 11 #include <asm/smp.h> 12 - 13 - static inline void __invpcid(unsigned long pcid, unsigned long addr, 14 - unsigned long type) 15 - { 16 - struct { u64 d[2]; } desc = { { pcid, addr } }; 17 - 18 - /* 19 - * The memory clobber is because the whole point is to invalidate 20 - * stale TLB entries and, especially if we're flushing global 21 - * mappings, we don't want the compiler to reorder any subsequent 22 - * memory accesses before the TLB flush. 23 - * 24 - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and 25 - * invpcid (%rcx), %rax in long mode. 26 - */ 27 - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" 28 - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); 29 - } 30 - 31 - #define INVPCID_TYPE_INDIV_ADDR 0 32 - #define INVPCID_TYPE_SINGLE_CTXT 1 33 - #define INVPCID_TYPE_ALL_INCL_GLOBAL 2 34 - #define INVPCID_TYPE_ALL_NON_GLOBAL 3 35 - 36 - /* Flush all mappings for a given pcid and addr, not including globals. */ 37 - static inline void invpcid_flush_one(unsigned long pcid, 38 - unsigned long addr) 39 - { 40 - __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); 41 - } 42 - 43 - /* Flush all mappings for a given PCID, not including globals. */ 44 - static inline void invpcid_flush_single_context(unsigned long pcid) 45 - { 46 - __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); 47 - } 48 - 49 - /* Flush all mappings, including globals, for all PCIDs. */ 50 - static inline void invpcid_flush_all(void) 51 - { 52 - __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); 53 - } 54 - 55 - /* Flush all mappings for all PCIDs except globals. */ 56 - static inline void invpcid_flush_all_nonglobals(void) 57 - { 58 - __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 59 - } 12 + #include <asm/invpcid.h> 60 13 61 14 static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 62 15 { 63 - u64 new_tlb_gen; 64 - 65 16 /* 66 17 * Bump the generation count. This also serves as a full barrier 67 18 * that synchronizes with switch_mm(): callers are required to order 68 19 * their read of mm_cpumask after their writes to the paging 69 20 * structures. 70 21 */ 71 - smp_mb__before_atomic(); 72 - new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); 73 - smp_mb__after_atomic(); 22 + return atomic64_inc_return(&mm->context.tlb_gen); 23 + } 74 24 75 - return new_tlb_gen; 25 + /* There are 12 bits of space for ASIDS in CR3 */ 26 + #define CR3_HW_ASID_BITS 12 27 + /* 28 + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for 29 + * user/kernel switches 30 + */ 31 + #define PTI_CONSUMED_ASID_BITS 0 32 + 33 + #define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) 34 + /* 35 + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account 36 + * for them being zero-based. Another -1 is because ASID 0 is reserved for 37 + * use by non-PCID-aware users. 38 + */ 39 + #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) 40 + 41 + static inline u16 kern_pcid(u16 asid) 42 + { 43 + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 44 + /* 45 + * If PCID is on, ASID-aware code paths put the ASID+1 into the 46 + * PCID bits. This serves two purposes. It prevents a nasty 47 + * situation in which PCID-unaware code saves CR3, loads some other 48 + * value (with PCID == 0), and then restores CR3, thus corrupting 49 + * the TLB for ASID 0 if the saved ASID was nonzero. It also means 50 + * that any bugs involving loading a PCID-enabled CR3 with 51 + * CR4.PCIDE off will trigger deterministically. 
52 + */ 53 + return asid + 1; 54 + } 55 + 56 + struct pgd_t; 57 + static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) 58 + { 59 + if (static_cpu_has(X86_FEATURE_PCID)) { 60 + return __sme_pa(pgd) | kern_pcid(asid); 61 + } else { 62 + VM_WARN_ON_ONCE(asid != 0); 63 + return __sme_pa(pgd); 64 + } 65 + } 66 + 67 + static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) 68 + { 69 + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 70 + VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); 71 + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; 76 72 } 77 73 78 74 #ifdef CONFIG_PARAVIRT ··· 233 237 234 238 extern void initialize_tlbstate_and_flush(void); 235 239 240 + /* 241 + * flush the entire current user mapping 242 + */ 236 243 static inline void __native_flush_tlb(void) 237 244 { 238 245 /* ··· 248 249 preempt_enable(); 249 250 } 250 251 251 - static inline void __native_flush_tlb_global_irq_disabled(void) 252 - { 253 - unsigned long cr4; 254 - 255 - cr4 = this_cpu_read(cpu_tlbstate.cr4); 256 - /* clear PGE */ 257 - native_write_cr4(cr4 & ~X86_CR4_PGE); 258 - /* write old PGE again and flush TLBs */ 259 - native_write_cr4(cr4); 260 - } 261 - 252 + /* 253 + * flush everything 254 + */ 262 255 static inline void __native_flush_tlb_global(void) 263 256 { 264 - unsigned long flags; 257 + unsigned long cr4, flags; 265 258 266 259 if (static_cpu_has(X86_FEATURE_INVPCID)) { 267 260 /* ··· 271 280 */ 272 281 raw_local_irq_save(flags); 273 282 274 - __native_flush_tlb_global_irq_disabled(); 283 + cr4 = this_cpu_read(cpu_tlbstate.cr4); 284 + /* toggle PGE */ 285 + native_write_cr4(cr4 ^ X86_CR4_PGE); 286 + /* write old PGE again and flush TLBs */ 287 + native_write_cr4(cr4); 275 288 276 289 raw_local_irq_restore(flags); 277 290 } 278 291 292 + /* 293 + * flush one page in the user mapping 294 + */ 279 295 static inline void __native_flush_tlb_single(unsigned long addr) 280 296 { 281 297 asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); 282 298 } 283 299 300 + /* 301 + * flush everything 302 + */ 284 303 static inline void __flush_tlb_all(void) 285 304 { 286 - if (boot_cpu_has(X86_FEATURE_PGE)) 305 + if (boot_cpu_has(X86_FEATURE_PGE)) { 287 306 __flush_tlb_global(); 288 - else 307 + } else { 308 + /* 309 + * !PGE -> !PCID (setup_pcid()), thus every flush is total. 310 + */ 289 311 __flush_tlb(); 312 + } 290 313 291 314 /* 292 315 * Note: if we somehow had PCID but not PGE, then this wouldn't work -- ··· 311 306 */ 312 307 } 313 308 309 + /* 310 + * flush one page in the kernel mapping 311 + */ 314 312 static inline void __flush_tlb_one(unsigned long addr) 315 313 { 316 314 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
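
The build_cr3()/kern_pcid() helpers that moved into this header encode ASID+1
into the low PCID bits of CR3 and use bit 63 as the "no flush" hint. A
self-contained model of that encoding (the pgd physical address below is an
assumed example value; the kernel derives the real one via __sme_pa(pgd)):

#include <stdint.h>
#include <stdio.h>

#define CR3_NOFLUSH	(1ULL << 63)	/* "don't flush on load" hint bit */

/*
 * ASID -> hardware PCID: ASID 0 maps to PCID 1, so PCID-unaware code that
 * saves and restores CR3 with PCID == 0 can never clobber a live ASID.
 */
static uint16_t kern_pcid(uint16_t asid)
{
	return asid + 1;
}

static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid)
{
	return pgd_pa | kern_pcid(asid);
}

static uint64_t build_cr3_noflush(uint64_t pgd_pa, uint16_t asid)
{
	return build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x1234000;	/* assumed page-aligned pgd physical address */

	printf("cr3 (asid 0)         = %016llx\n",
	       (unsigned long long)build_cr3(pgd_pa, 0));
	printf("cr3 noflush (asid 5) = %016llx\n",
	       (unsigned long long)build_cr3_noflush(pgd_pa, 5));
	return 0;
}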
+2 -2
arch/x86/kernel/asm-offsets.c
··· 97 97 /* Layout info for cpu_entry_area */ 98 98 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); 99 99 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); 100 - OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); 101 - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); 100 + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); 101 + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); 102 102 }
+1 -1
arch/x86/kernel/asm-offsets_32.c
··· 48 48 49 49 /* Offset from the sysenter stack to tss.sp0 */ 50 50 DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - 51 - offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); 51 + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); 52 52 53 53 #ifdef CONFIG_CC_STACKPROTECTOR 54 54 BLANK();
+3 -97
arch/x86/kernel/cpu/common.c
··· 506 506 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, 507 507 [DEBUG_STACK - 1] = DEBUG_STKSZ 508 508 }; 509 - 510 - static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 511 - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); 512 509 #endif 513 - 514 - static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, 515 - SYSENTER_stack_storage); 516 - 517 - static void __init 518 - set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) 519 - { 520 - for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) 521 - __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); 522 - } 523 - 524 - /* Setup the fixmap mappings only once per-processor */ 525 - static void __init setup_cpu_entry_area(int cpu) 526 - { 527 - #ifdef CONFIG_X86_64 528 - extern char _entry_trampoline[]; 529 - 530 - /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ 531 - pgprot_t gdt_prot = PAGE_KERNEL_RO; 532 - pgprot_t tss_prot = PAGE_KERNEL_RO; 533 - #else 534 - /* 535 - * On native 32-bit systems, the GDT cannot be read-only because 536 - * our double fault handler uses a task gate, and entering through 537 - * a task gate needs to change an available TSS to busy. If the 538 - * GDT is read-only, that will triple fault. The TSS cannot be 539 - * read-only because the CPU writes to it on task switches. 540 - * 541 - * On Xen PV, the GDT must be read-only because the hypervisor 542 - * requires it. 543 - */ 544 - pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 545 - PAGE_KERNEL_RO : PAGE_KERNEL; 546 - pgprot_t tss_prot = PAGE_KERNEL; 547 - #endif 548 - 549 - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); 550 - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), 551 - per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, 552 - PAGE_KERNEL); 553 - 554 - /* 555 - * The Intel SDM says (Volume 3, 7.2.1): 556 - * 557 - * Avoid placing a page boundary in the part of the TSS that the 558 - * processor reads during a task switch (the first 104 bytes). The 559 - * processor may not correctly perform address translations if a 560 - * boundary occurs in this area. During a task switch, the processor 561 - * reads and writes into the first 104 bytes of each TSS (using 562 - * contiguous physical addresses beginning with the physical address 563 - * of the first byte of the TSS). So, after TSS access begins, if 564 - * part of the 104 bytes is not physically contiguous, the processor 565 - * will access incorrect information without generating a page-fault 566 - * exception. 567 - * 568 - * There are also a lot of errata involving the TSS spanning a page 569 - * boundary. Assert that we're not doing that. 
570 - */ 571 - BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ 572 - offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); 573 - BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); 574 - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), 575 - &per_cpu(cpu_tss_rw, cpu), 576 - sizeof(struct tss_struct) / PAGE_SIZE, 577 - tss_prot); 578 - 579 - #ifdef CONFIG_X86_32 580 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); 581 - #endif 582 - 583 - #ifdef CONFIG_X86_64 584 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); 585 - BUILD_BUG_ON(sizeof(exception_stacks) != 586 - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); 587 - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), 588 - &per_cpu(exception_stacks, cpu), 589 - sizeof(exception_stacks) / PAGE_SIZE, 590 - PAGE_KERNEL); 591 - 592 - __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), 593 - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); 594 - #endif 595 - } 596 - 597 - void __init setup_cpu_entry_areas(void) 598 - { 599 - unsigned int cpu; 600 - 601 - for_each_possible_cpu(cpu) 602 - setup_cpu_entry_area(cpu); 603 - } 604 510 605 511 /* Load the original GDT from the per-cpu structure */ 606 512 void load_direct_gdt(int cpu) ··· 1254 1348 1255 1349 tss->x86_tss.ss1 = __KERNEL_CS; 1256 1350 wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); 1257 - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); 1351 + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); 1258 1352 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); 1259 1353 1260 1354 put_cpu(); ··· 1371 1465 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 1372 1466 */ 1373 1467 wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); 1374 - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); 1468 + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); 1375 1469 wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); 1376 1470 #else 1377 1471 wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); ··· 1586 1680 */ 1587 1681 set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); 1588 1682 load_TR_desc(); 1589 - load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); 1683 + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); 1590 1684 1591 1685 load_mm_ldt(&init_mm); 1592 1686
-13
arch/x86/kernel/cpu/microcode/intel.c
··· 565 565 } 566 566 #else 567 567 568 - /* 569 - * Flush global tlb. We only do this in x86_64 where paging has been enabled 570 - * already and PGE should be enabled as well. 571 - */ 572 - static inline void flush_tlb_early(void) 573 - { 574 - __native_flush_tlb_global_irq_disabled(); 575 - } 576 - 577 568 static inline void print_ucode(struct ucode_cpu_info *uci) 578 569 { 579 570 struct microcode_intel *mc; ··· 593 602 if (rev != mc->hdr.rev) 594 603 return -1; 595 604 596 - #ifdef CONFIG_X86_64 597 - /* Flush global tlb. This is precaution. */ 598 - flush_tlb_early(); 599 - #endif 600 605 uci->cpu_sig.rev = rev; 601 606 602 607 if (early)
+6 -5
arch/x86/kernel/dumpstack.c
··· 18 18 #include <linux/nmi.h> 19 19 #include <linux/sysfs.h> 20 20 21 + #include <asm/cpu_entry_area.h> 21 22 #include <asm/stacktrace.h> 22 23 #include <asm/unwind.h> 23 24 ··· 44 43 return true; 45 44 } 46 45 47 - bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) 46 + bool in_entry_stack(unsigned long *stack, struct stack_info *info) 48 47 { 49 - struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); 48 + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); 50 49 51 50 void *begin = ss; 52 51 void *end = ss + 1; ··· 54 53 if ((void *)stack < begin || (void *)stack >= end) 55 54 return false; 56 55 57 - info->type = STACK_TYPE_SYSENTER; 56 + info->type = STACK_TYPE_ENTRY; 58 57 info->begin = begin; 59 58 info->end = end; 60 59 info->next_sp = NULL; ··· 112 111 * - task stack 113 112 * - interrupt stack 114 113 * - HW exception stacks (double fault, nmi, debug, mce) 115 - * - SYSENTER stack 114 + * - entry stack 116 115 * 117 116 * x86-32 can have up to four stacks: 118 117 * - task stack 119 118 * - softirq stack 120 119 * - hardirq stack 121 - * - SYSENTER stack 120 + * - entry stack 122 121 */ 123 122 for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { 124 123 const char *stack_name;
+3 -3
arch/x86/kernel/dumpstack_32.c
··· 26 26 if (type == STACK_TYPE_SOFTIRQ) 27 27 return "SOFTIRQ"; 28 28 29 - if (type == STACK_TYPE_SYSENTER) 30 - return "SYSENTER"; 29 + if (type == STACK_TYPE_ENTRY) 30 + return "ENTRY_TRAMPOLINE"; 31 31 32 32 return NULL; 33 33 } ··· 96 96 if (task != current) 97 97 goto unknown; 98 98 99 - if (in_sysenter_stack(stack, info)) 99 + if (in_entry_stack(stack, info)) 100 100 goto recursion_check; 101 101 102 102 if (in_hardirq_stack(stack, info))
+9 -3
arch/x86/kernel/dumpstack_64.c
··· 37 37 if (type == STACK_TYPE_IRQ) 38 38 return "IRQ"; 39 39 40 - if (type == STACK_TYPE_SYSENTER) 41 - return "SYSENTER"; 40 + if (type == STACK_TYPE_ENTRY) { 41 + /* 42 + * On 64-bit, we have a generic entry stack that we 43 + * use for all the kernel entry points, including 44 + * SYSENTER. 45 + */ 46 + return "ENTRY_TRAMPOLINE"; 47 + } 42 48 43 49 if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) 44 50 return exception_stack_names[type - STACK_TYPE_EXCEPTION]; ··· 124 118 if (in_irq_stack(stack, info)) 125 119 goto recursion_check; 126 120 127 - if (in_sysenter_stack(stack, info)) 121 + if (in_entry_stack(stack, info)) 128 122 goto recursion_check; 129 123 130 124 goto unknown;
+25 -24
arch/x86/kernel/ldt.c
··· 5 5 * Copyright (C) 2002 Andi Kleen 6 6 * 7 7 * This handles calls from both 32bit and 64bit mode. 8 + * 9 + * Lock order: 10 + * contex.ldt_usr_sem 11 + * mmap_sem 12 + * context.lock 8 13 */ 9 14 10 15 #include <linux/errno.h> ··· 47 42 #endif 48 43 } 49 44 50 - /* context.lock is held for us, so we don't need any locking. */ 45 + /* context.lock is held by the task which issued the smp function call */ 51 46 static void flush_ldt(void *__mm) 52 47 { 53 48 struct mm_struct *mm = __mm; ··· 104 99 paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); 105 100 } 106 101 107 - /* context.lock is held */ 108 - static void install_ldt(struct mm_struct *current_mm, 109 - struct ldt_struct *ldt) 102 + static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) 110 103 { 111 - /* Synchronizes with READ_ONCE in load_mm_ldt. */ 112 - smp_store_release(&current_mm->context.ldt, ldt); 104 + mutex_lock(&mm->context.lock); 113 105 114 - /* Activate the LDT for all CPUs using current_mm. */ 115 - on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); 106 + /* Synchronizes with READ_ONCE in load_mm_ldt. */ 107 + smp_store_release(&mm->context.ldt, ldt); 108 + 109 + /* Activate the LDT for all CPUs using currents mm. */ 110 + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); 111 + 112 + mutex_unlock(&mm->context.lock); 116 113 } 117 114 118 115 static void free_ldt_struct(struct ldt_struct *ldt) ··· 131 124 } 132 125 133 126 /* 134 - * we do not have to muck with descriptors here, that is 135 - * done in switch_mm() as needed. 127 + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, 128 + * the new task is not running, so nothing can be installed. 136 129 */ 137 - int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) 130 + int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) 138 131 { 139 132 struct ldt_struct *new_ldt; 140 - struct mm_struct *old_mm; 141 133 int retval = 0; 142 134 143 - mutex_init(&mm->context.lock); 144 - old_mm = current->mm; 145 - if (!old_mm) { 146 - mm->context.ldt = NULL; 135 + if (!old_mm) 147 136 return 0; 148 - } 149 137 150 138 mutex_lock(&old_mm->context.lock); 151 - if (!old_mm->context.ldt) { 152 - mm->context.ldt = NULL; 139 + if (!old_mm->context.ldt) 153 140 goto out_unlock; 154 - } 155 141 156 142 new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); 157 143 if (!new_ldt) { ··· 180 180 unsigned long entries_size; 181 181 int retval; 182 182 183 - mutex_lock(&mm->context.lock); 183 + down_read(&mm->context.ldt_usr_sem); 184 184 185 185 if (!mm->context.ldt) { 186 186 retval = 0; ··· 209 209 retval = bytecount; 210 210 211 211 out_unlock: 212 - mutex_unlock(&mm->context.lock); 212 + up_read(&mm->context.ldt_usr_sem); 213 213 return retval; 214 214 } 215 215 ··· 269 269 ldt.avl = 0; 270 270 } 271 271 272 - mutex_lock(&mm->context.lock); 272 + if (down_write_killable(&mm->context.ldt_usr_sem)) 273 + return -EINTR; 273 274 274 275 old_ldt = mm->context.ldt; 275 276 old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; ··· 292 291 error = 0; 293 292 294 293 out_unlock: 295 - mutex_unlock(&mm->context.lock); 294 + up_write(&mm->context.ldt_usr_sem); 296 295 out: 297 296 return error; 298 297 }
+1 -5
arch/x86/kernel/smpboot.c
··· 932 932 initial_code = (unsigned long)start_secondary; 933 933 initial_stack = idle->thread.sp; 934 934 935 - /* 936 - * Enable the espfix hack for this CPU 937 - */ 938 - #ifdef CONFIG_X86_ESPFIX64 935 + /* Enable the espfix hack for this CPU */ 939 936 init_espfix_ap(cpu); 940 - #endif 941 937 942 938 /* So we see what's up */ 943 939 announce_cpu(cpu, apicid);
+4 -2
arch/x86/kernel/traps.c
··· 51 51 #include <asm/traps.h> 52 52 #include <asm/desc.h> 53 53 #include <asm/fpu/internal.h> 54 + #include <asm/cpu_entry_area.h> 54 55 #include <asm/mce.h> 55 56 #include <asm/fixmap.h> 56 57 #include <asm/mach_traps.h> ··· 952 951 * "sidt" instruction will not leak the location of the kernel, and 953 952 * to defend the IDT against arbitrary memory write vulnerabilities. 954 953 * It will be reloaded in cpu_init() */ 955 - __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); 956 - idt_descr.address = fix_to_virt(FIX_RO_IDT); 954 + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), 955 + PAGE_KERNEL_RO); 956 + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; 957 957 958 958 /* 959 959 * Should be a barrier for any external CPU state:
+1 -1
arch/x86/mm/Makefile
··· 10 10 endif 11 11 12 12 obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 13 - pat.o pgtable.o physaddr.o setup_nx.o tlb.o 13 + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o 14 14 15 15 # Make sure __phys_addr has no stackprotector 16 16 nostackp := $(call cc-option, -fno-stack-protector)
+139
arch/x86/mm/cpu_entry_area.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/spinlock.h> 4 + #include <linux/percpu.h> 5 + 6 + #include <asm/cpu_entry_area.h> 7 + #include <asm/pgtable.h> 8 + #include <asm/fixmap.h> 9 + #include <asm/desc.h> 10 + 11 + static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); 12 + 13 + #ifdef CONFIG_X86_64 14 + static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks 15 + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); 16 + #endif 17 + 18 + struct cpu_entry_area *get_cpu_entry_area(int cpu) 19 + { 20 + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; 21 + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); 22 + 23 + return (struct cpu_entry_area *) va; 24 + } 25 + EXPORT_SYMBOL(get_cpu_entry_area); 26 + 27 + void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) 28 + { 29 + unsigned long va = (unsigned long) cea_vaddr; 30 + 31 + set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); 32 + } 33 + 34 + static void __init 35 + cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) 36 + { 37 + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) 38 + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); 39 + } 40 + 41 + /* Setup the fixmap mappings only once per-processor */ 42 + static void __init setup_cpu_entry_area(int cpu) 43 + { 44 + #ifdef CONFIG_X86_64 45 + extern char _entry_trampoline[]; 46 + 47 + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ 48 + pgprot_t gdt_prot = PAGE_KERNEL_RO; 49 + pgprot_t tss_prot = PAGE_KERNEL_RO; 50 + #else 51 + /* 52 + * On native 32-bit systems, the GDT cannot be read-only because 53 + * our double fault handler uses a task gate, and entering through 54 + * a task gate needs to change an available TSS to busy. If the 55 + * GDT is read-only, that will triple fault. The TSS cannot be 56 + * read-only because the CPU writes to it on task switches. 57 + * 58 + * On Xen PV, the GDT must be read-only because the hypervisor 59 + * requires it. 60 + */ 61 + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 62 + PAGE_KERNEL_RO : PAGE_KERNEL; 63 + pgprot_t tss_prot = PAGE_KERNEL; 64 + #endif 65 + 66 + cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), 67 + gdt_prot); 68 + 69 + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, 70 + per_cpu_ptr(&entry_stack_storage, cpu), 1, 71 + PAGE_KERNEL); 72 + 73 + /* 74 + * The Intel SDM says (Volume 3, 7.2.1): 75 + * 76 + * Avoid placing a page boundary in the part of the TSS that the 77 + * processor reads during a task switch (the first 104 bytes). The 78 + * processor may not correctly perform address translations if a 79 + * boundary occurs in this area. During a task switch, the processor 80 + * reads and writes into the first 104 bytes of each TSS (using 81 + * contiguous physical addresses beginning with the physical address 82 + * of the first byte of the TSS). So, after TSS access begins, if 83 + * part of the 104 bytes is not physically contiguous, the processor 84 + * will access incorrect information without generating a page-fault 85 + * exception. 86 + * 87 + * There are also a lot of errata involving the TSS spanning a page 88 + * boundary. Assert that we're not doing that. 
89 + */ 90 + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ 91 + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); 92 + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); 93 + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, 94 + &per_cpu(cpu_tss_rw, cpu), 95 + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); 96 + 97 + #ifdef CONFIG_X86_32 98 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); 99 + #endif 100 + 101 + #ifdef CONFIG_X86_64 102 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); 103 + BUILD_BUG_ON(sizeof(exception_stacks) != 104 + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); 105 + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, 106 + &per_cpu(exception_stacks, cpu), 107 + sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); 108 + 109 + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, 110 + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); 111 + #endif 112 + } 113 + 114 + static __init void setup_cpu_entry_area_ptes(void) 115 + { 116 + #ifdef CONFIG_X86_32 117 + unsigned long start, end; 118 + 119 + BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); 120 + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); 121 + 122 + start = CPU_ENTRY_AREA_BASE; 123 + end = start + CPU_ENTRY_AREA_MAP_SIZE; 124 + 125 + /* Careful here: start + PMD_SIZE might wrap around */ 126 + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) 127 + populate_extra_pte(start); 128 + #endif 129 + } 130 + 131 + void __init setup_cpu_entry_areas(void) 132 + { 133 + unsigned int cpu; 134 + 135 + setup_cpu_entry_area_ptes(); 136 + 137 + for_each_possible_cpu(cpu) 138 + setup_cpu_entry_area(cpu); 139 + }
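
The "Careful here: start + PMD_SIZE might wrap around" loop in
setup_cpu_entry_area_ptes() above is what the topmost patch of the series
("Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit") addresses. A
self-contained illustration, with assumed addresses, of why the extra
"start >= CPU_ENTRY_AREA_BASE" test matters once the area sits in the topmost
PMD of a 32-bit address space:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE	0x400000u	/* 4 MiB, the non-PAE 32-bit PMD size */

int main(void)
{
	/* Assumed example: an entry area starting in the last PMD below 4 GiB,
	 * as can happen with a small NR_CPUS. */
	uint32_t base = 0xffc00000u;
	uint32_t end  = base + 0x300000u;	/* 0xfff00000, does not wrap */
	uint32_t start;
	int populated = 0;

	/*
	 * Same shape as the loop above: without the "start >= base" test,
	 * start would wrap from 0xffc00000 to 0, 0 < end would still hold,
	 * and the walk would run away from address 0.
	 */
	for (start = base; start < end && start >= base; start += PMD_SIZE) {
		printf("populate PMD at %08x\n", start);
		populated++;
	}

	printf("populated %d PMD(s), next start would have been %08x\n",
	       populated, (uint32_t)(base + PMD_SIZE));
	return 0;
}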
+59 -39
arch/x86/mm/dump_pagetables.c
··· 44 44 unsigned long max_lines; 45 45 }; 46 46 47 - /* indices for address_markers; keep sync'd w/ address_markers below */ 47 + /* Address space markers hints */ 48 + 49 + #ifdef CONFIG_X86_64 50 + 48 51 enum address_markers_idx { 49 52 USER_SPACE_NR = 0, 50 - #ifdef CONFIG_X86_64 51 53 KERNEL_SPACE_NR, 52 54 LOW_KERNEL_NR, 53 55 VMALLOC_START_NR, ··· 58 56 KASAN_SHADOW_START_NR, 59 57 KASAN_SHADOW_END_NR, 60 58 #endif 61 - # ifdef CONFIG_X86_ESPFIX64 59 + CPU_ENTRY_AREA_NR, 60 + #ifdef CONFIG_X86_ESPFIX64 62 61 ESPFIX_START_NR, 63 - # endif 62 + #endif 63 + #ifdef CONFIG_EFI 64 + EFI_END_NR, 65 + #endif 64 66 HIGH_KERNEL_NR, 65 67 MODULES_VADDR_NR, 66 68 MODULES_END_NR, 67 - #else 69 + FIXADDR_START_NR, 70 + END_OF_SPACE_NR, 71 + }; 72 + 73 + static struct addr_marker address_markers[] = { 74 + [USER_SPACE_NR] = { 0, "User Space" }, 75 + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, 76 + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, 77 + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 78 + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, 79 + #ifdef CONFIG_KASAN 80 + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, 81 + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, 82 + #endif 83 + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, 84 + #ifdef CONFIG_X86_ESPFIX64 85 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 86 + #endif 87 + #ifdef CONFIG_EFI 88 + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, 89 + #endif 90 + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, 91 + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, 92 + [MODULES_END_NR] = { MODULES_END, "End Modules" }, 93 + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, 94 + [END_OF_SPACE_NR] = { -1, NULL } 95 + }; 96 + 97 + #else /* CONFIG_X86_64 */ 98 + 99 + enum address_markers_idx { 100 + USER_SPACE_NR = 0, 68 101 KERNEL_SPACE_NR, 69 102 VMALLOC_START_NR, 70 103 VMALLOC_END_NR, 71 - # ifdef CONFIG_HIGHMEM 104 + #ifdef CONFIG_HIGHMEM 72 105 PKMAP_BASE_NR, 73 - # endif 74 - FIXADDR_START_NR, 75 106 #endif 107 + CPU_ENTRY_AREA_NR, 108 + FIXADDR_START_NR, 109 + END_OF_SPACE_NR, 76 110 }; 77 111 78 - /* Address space markers hints */ 79 112 static struct addr_marker address_markers[] = { 80 - { 0, "User Space" }, 81 - #ifdef CONFIG_X86_64 82 - { 0x8000000000000000UL, "Kernel Space" }, 83 - { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, 84 - { 0/* VMALLOC_START */, "vmalloc() Area" }, 85 - { 0/* VMEMMAP_START */, "Vmemmap" }, 86 - #ifdef CONFIG_KASAN 87 - { KASAN_SHADOW_START, "KASAN shadow" }, 88 - { KASAN_SHADOW_END, "KASAN shadow end" }, 113 + [USER_SPACE_NR] = { 0, "User Space" }, 114 + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, 115 + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, 116 + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, 117 + #ifdef CONFIG_HIGHMEM 118 + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, 89 119 #endif 90 - # ifdef CONFIG_X86_ESPFIX64 91 - { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, 92 - # endif 93 - # ifdef CONFIG_EFI 94 - { EFI_VA_END, "EFI Runtime Services" }, 95 - # endif 96 - { __START_KERNEL_map, "High Kernel Mapping" }, 97 - { MODULES_VADDR, "Modules" }, 98 - { MODULES_END, "End Modules" }, 99 - #else 100 - { PAGE_OFFSET, "Kernel Mapping" }, 101 - { 0/* VMALLOC_START */, "vmalloc() Area" }, 102 - { 0/*VMALLOC_END*/, "vmalloc() End" }, 103 - # ifdef CONFIG_HIGHMEM 104 - { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, 105 - # endif 106 - { 0/*FIXADDR_START*/, "Fixmap Area" }, 107 - #endif 
108 - { -1, NULL } /* End of list */ 120 + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, 121 + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, 122 + [END_OF_SPACE_NR] = { -1, NULL } 109 123 }; 124 + 125 + #endif /* !CONFIG_X86_64 */ 110 126 111 127 /* Multipliers for offsets within the PTEs */ 112 128 #define PTE_LEVEL_MULT (PAGE_SIZE) ··· 160 140 static const char * const level_name[] = 161 141 { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; 162 142 163 - if (!pgprot_val(prot)) { 143 + if (!(pr & _PAGE_PRESENT)) { 164 144 /* Not present */ 165 145 pt_dump_cont_printf(m, dmsg, " "); 166 146 } else { ··· 545 525 address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; 546 526 # endif 547 527 address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; 528 + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; 548 529 #endif 549 - 550 530 return 0; 551 531 } 552 532 __initcall(pt_dump_init);
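The dump_pagetables.c hunk above converts address_markers[] to C99 designated initializers keyed by address_markers_idx, so adding or reordering an enum entry can no longer silently shift the rest of the table, and runtime-only addresses are patched in by index from pt_dump_init(). A minimal stand-alone sketch of the same idiom (all names below are illustrative, not kernel code):

#include <stdio.h>

enum marker_idx {
	FIRST_NR = 0,
	SECOND_NR,
	LAST_NR,
	NR_MARKERS,
};

struct marker {
	unsigned long start_address;
	const char *name;
};

static struct marker markers[NR_MARKERS] = {
	[FIRST_NR]  = { 0x1000, "first"  },
	[SECOND_NR] = { 0,      "second" },	/* filled in at runtime */
	[LAST_NR]   = { 0x3000, "last"   },
};

int main(void)
{
	/* Runtime-only values are patched by index, the same way
	 * pt_dump_init() fills in the vmalloc/vmemmap markers. */
	markers[SECOND_NR].start_address = 0x2800;

	for (int i = 0; i < NR_MARKERS; i++)
		printf("%#lx %s\n", markers[i].start_address, markers[i].name);
	return 0;
}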
+6
arch/x86/mm/init_32.c
··· 50 50 #include <asm/setup.h> 51 51 #include <asm/set_memory.h> 52 52 #include <asm/page_types.h> 53 + #include <asm/cpu_entry_area.h> 53 54 #include <asm/init.h> 54 55 55 56 #include "mm_internal.h" ··· 767 766 mem_init_print_info(NULL); 768 767 printk(KERN_INFO "virtual kernel memory layout:\n" 769 768 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 769 + " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" 770 770 #ifdef CONFIG_HIGHMEM 771 771 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 772 772 #endif ··· 778 776 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", 779 777 FIXADDR_START, FIXADDR_TOP, 780 778 (FIXADDR_TOP - FIXADDR_START) >> 10, 779 + 780 + CPU_ENTRY_AREA_BASE, 781 + CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, 782 + CPU_ENTRY_AREA_MAP_SIZE >> 10, 781 783 782 784 #ifdef CONFIG_HIGHMEM 783 785 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+20 -17
arch/x86/mm/kasan_init_64.c
··· 15 15 #include <asm/tlbflush.h> 16 16 #include <asm/sections.h> 17 17 #include <asm/pgtable.h> 18 + #include <asm/cpu_entry_area.h> 18 19 19 20 extern struct range pfn_mapped[E820_MAX_ENTRIES]; 20 21 ··· 323 322 map_range(&pfn_mapped[i]); 324 323 } 325 324 325 + shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; 326 + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); 327 + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, 328 + PAGE_SIZE); 329 + 330 + shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + 331 + CPU_ENTRY_AREA_MAP_SIZE); 332 + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); 333 + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, 334 + PAGE_SIZE); 335 + 326 336 kasan_populate_zero_shadow( 327 337 kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), 328 - kasan_mem_to_shadow((void *)__START_KERNEL_map)); 338 + shadow_cpu_entry_begin); 339 + 340 + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, 341 + (unsigned long)shadow_cpu_entry_end, 0); 342 + 343 + kasan_populate_zero_shadow(shadow_cpu_entry_end, 344 + kasan_mem_to_shadow((void *)__START_KERNEL_map)); 329 345 330 346 kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), 331 347 (unsigned long)kasan_mem_to_shadow(_end), 332 348 early_pfn_to_nid(__pa(_stext))); 333 349 334 - shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); 335 - shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); 336 - shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, 337 - PAGE_SIZE); 338 - 339 - shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); 340 - shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); 341 - shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, 342 - PAGE_SIZE); 343 - 344 350 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 345 - shadow_cpu_entry_begin); 346 - 347 - kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, 348 - (unsigned long)shadow_cpu_entry_end, 0); 349 - 350 - kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); 351 + (void *)KASAN_SHADOW_END); 351 352 352 353 load_cr3(init_top_pgt); 353 354 __flush_tlb_all();
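The kasan_init_64.c hunk moves the cpu_entry_area shadow setup ahead of the zero-shadow population, so the new region gets real shadow pages sandwiched between two zero-shadow ranges. The arithmetic it relies on is KASAN's usual 1/8 scaling plus page rounding; a user-space model of that calculation (the shadow offset and the cpu_entry_area bounds below are illustrative assumptions for a 64-bit build, not values taken from the patch):

#include <stdio.h>

#define PAGE_SIZE			4096UL
#define KASAN_SHADOW_SCALE_SHIFT	3
#define KASAN_SHADOW_OFFSET		0xdffffc0000000000UL	/* assumed */

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

static unsigned long round_down_page(unsigned long x)
{
	return x & ~(PAGE_SIZE - 1);
}

static unsigned long round_up_page(unsigned long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	/* Illustrative stand-ins for CPU_ENTRY_AREA_BASE and _MAP_SIZE */
	unsigned long base = 0xfffffe8000000000UL;
	unsigned long size = 0x8000000000UL;

	unsigned long begin = round_down_page(mem_to_shadow(base));
	unsigned long end   = round_up_page(mem_to_shadow(base + size));

	printf("cpu_entry_area shadow: %#lx - %#lx\n", begin, end);
	return 0;
}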
+1
arch/x86/mm/pgtable_32.c
··· 10 10 #include <linux/pagemap.h> 11 11 #include <linux/spinlock.h> 12 12 13 + #include <asm/cpu_entry_area.h> 13 14 #include <asm/pgtable.h> 14 15 #include <asm/pgalloc.h> 15 16 #include <asm/fixmap.h>
+5 -5
arch/x86/mm/tlb.c
··· 128 128 * isn't free. 129 129 */ 130 130 #ifdef CONFIG_DEBUG_VM 131 - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { 131 + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { 132 132 /* 133 133 * If we were to BUG here, we'd be very likely to kill 134 134 * the system so hard that we don't see the call trace. ··· 195 195 if (need_flush) { 196 196 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 197 197 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 198 - write_cr3(build_cr3(next, new_asid)); 198 + write_cr3(build_cr3(next->pgd, new_asid)); 199 199 200 200 /* 201 201 * NB: This gets called via leave_mm() in the idle path ··· 208 208 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 209 209 } else { 210 210 /* The new ASID is already up to date. */ 211 - write_cr3(build_cr3_noflush(next, new_asid)); 211 + write_cr3(build_cr3_noflush(next->pgd, new_asid)); 212 212 213 213 /* See above wrt _rcuidle. */ 214 214 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); ··· 288 288 !(cr4_read_shadow() & X86_CR4_PCIDE)); 289 289 290 290 /* Force ASID 0 and force a TLB flush. */ 291 - write_cr3(build_cr3(mm, 0)); 291 + write_cr3(build_cr3(mm->pgd, 0)); 292 292 293 293 /* Reinitialize tlbstate. */ 294 294 this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); ··· 551 551 552 552 /* flush range by one by one 'invlpg' */ 553 553 for (addr = f->start; addr < f->end; addr += PAGE_SIZE) 554 - __flush_tlb_single(addr); 554 + __flush_tlb_one(addr); 555 555 } 556 556 557 557 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
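The tlb.c hunks switch the build_cr3()/build_cr3_noflush() callers to pass next->pgd / mm->pgd rather than the mm itself, and make the kernel-range flush loop use __flush_tlb_one(). A conceptual, user-space model of what a CR3 value combines (the bit layout is the architectural one: PGD physical address in the upper bits, PCID in bits 11:0, bit 63 as the "don't flush this PCID" hint; the kernel's actual helpers may differ in detail):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CR3_ADDR_MASK	0xfffffffffffff000ULL
#define CR3_PCID_MASK	0xfffULL
#define CR3_NOFLUSH	(1ULL << 63)

static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid)
{
	assert((pgd_pa & ~CR3_ADDR_MASK) == 0);	/* PGD must be page aligned */
	return pgd_pa | (asid & CR3_PCID_MASK);
}

static uint64_t build_cr3_noflush(uint64_t pgd_pa, uint16_t asid)
{
	/* Same value, plus the hint that this PCID's entries may be kept. */
	return build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x1234000ULL;	/* illustrative physical address */

	printf("cr3 (flush):    %#llx\n", (unsigned long long)build_cr3(pgd_pa, 1));
	printf("cr3 (no flush): %#llx\n", (unsigned long long)build_cr3_noflush(pgd_pa, 1));
	return 0;
}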
+1 -1
arch/x86/platform/uv/tlb_uv.c
··· 299 299 local_flush_tlb(); 300 300 stat->d_alltlb++; 301 301 } else { 302 - __flush_tlb_one(msg->address); 302 + __flush_tlb_single(msg->address); 303 303 stat->d_onetlb++; 304 304 } 305 305 stat->d_requestee++;
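The UV change is the flip side of the tlb.c hunk above: the BAU path handles flush_tlb_others() work, i.e. user-space addresses, so it uses __flush_tlb_single(), while flush_tlb_kernel_range() uses __flush_tlb_one(). Both come down to INVLPG today; a rough kernel-side sketch of why two entry points are still worth keeping apart (conceptual only, not the kernel's tlbflush.h):

/* Conceptual sketch: separate helpers for user-address and kernel-address
 * single-page flushes keep call sites honest, so later work (such as PTI's
 * split page tables) can add extra invalidation to one path without
 * touching the other. */
static inline void flush_one_user_addr(unsigned long addr)
{
	asm volatile ("invlpg (%0)" : : "r" (addr) : "memory");
}

static inline void flush_one_kernel_addr(unsigned long addr)
{
	asm volatile ("invlpg (%0)" : : "r" (addr) : "memory");
}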
-2
arch/x86/xen/mmu_pv.c
··· 2273 2273 2274 2274 switch (idx) { 2275 2275 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: 2276 - case FIX_RO_IDT: 2277 2276 #ifdef CONFIG_X86_32 2278 2277 case FIX_WP_TEST: 2279 2278 # ifdef CONFIG_HIGHMEM ··· 2283 2284 #endif 2284 2285 case FIX_TEXT_POKE0: 2285 2286 case FIX_TEXT_POKE1: 2286 - case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: 2287 2287 /* All local page mappings */ 2288 2288 pte = pfn_pte(phys, prot); 2289 2289 break;
+3 -2
include/asm-generic/mm_hooks.h
··· 7 7 #ifndef _ASM_GENERIC_MM_HOOKS_H 8 8 #define _ASM_GENERIC_MM_HOOKS_H 9 9 10 - static inline void arch_dup_mmap(struct mm_struct *oldmm, 11 - struct mm_struct *mm) 10 + static inline int arch_dup_mmap(struct mm_struct *oldmm, 11 + struct mm_struct *mm) 12 12 { 13 + return 0; 13 14 } 14 15 15 16 static inline void arch_exit_mmap(struct mm_struct *mm)
+5
include/asm-generic/pgtable.h
··· 1025 1025 struct file; 1026 1026 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 1027 1027 unsigned long size, pgprot_t *vma_prot); 1028 + 1029 + #ifndef CONFIG_X86_ESPFIX64 1030 + static inline void init_espfix_bsp(void) { } 1031 + #endif 1032 + 1028 1033 #endif /* !__ASSEMBLY__ */ 1029 1034 1030 1035 #ifndef io_remap_pfn_range
+2 -4
init/main.c
··· 504 504 pgtable_init(); 505 505 vmalloc_init(); 506 506 ioremap_huge_init(); 507 + /* Should be run before the first non-init thread is created */ 508 + init_espfix_bsp(); 507 509 } 508 510 509 511 asmlinkage __visible void __init start_kernel(void) ··· 680 678 #ifdef CONFIG_X86 681 679 if (efi_enabled(EFI_RUNTIME_SERVICES)) 682 680 efi_enter_virtual_mode(); 683 - #endif 684 - #ifdef CONFIG_X86_ESPFIX64 685 - /* Should be run before the first non-init thread is created */ 686 - init_espfix_bsp(); 687 681 #endif 688 682 thread_stack_cache_init(); 689 683 cred_init();
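The pgtable.h and init/main.c hunks work together: a no-op init_espfix_bsp() stub for !CONFIG_X86_ESPFIX64 lets mm_init() call it unconditionally, and the call now runs before the first non-init thread is created. The same stub pattern in isolation (CONFIG_EXAMPLE_FEATURE and example_feature_init() are made-up names for illustration):

#include <stdio.h>

/* Uncomment to model CONFIG_EXAMPLE_FEATURE=y. */
/* #define CONFIG_EXAMPLE_FEATURE 1 */

#ifdef CONFIG_EXAMPLE_FEATURE
static void example_feature_init(void)
{
	puts("feature initialized");
}
#else
/* Empty stub: the call site needs no #ifdef, just as mm_init() can now
 * call init_espfix_bsp() unconditionally. */
static inline void example_feature_init(void) { }
#endif

int main(void)
{
	example_feature_init();
	return 0;
}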
+1 -2
kernel/fork.c
··· 721 721 goto out; 722 722 } 723 723 /* a new mm has just been created */ 724 - arch_dup_mmap(oldmm, mm); 725 - retval = 0; 724 + retval = arch_dup_mmap(oldmm, mm); 726 725 out: 727 726 up_write(&mm->mmap_sem); 728 727 flush_tlb_mm(oldmm);
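With the mm_hooks.h change above, dup_mmap() can now see the arch hook fail, and the fork() caller gets the error instead of a child with silently missing per-arch state. A small user-space model of the flow (types and names are illustrative, not kernel code):

#include <errno.h>
#include <stdio.h>

struct mm { int have_ldt; };

/* Stands in for arch_dup_mmap(): returns 0 or a negative errno. */
static int arch_dup_mmap(const struct mm *oldmm, struct mm *mm)
{
	if (oldmm->have_ldt) {
		/* pretend copying the LDT needed memory we could not get */
		return -ENOMEM;
	}
	return 0;
}

static int dup_mmap(const struct mm *oldmm, struct mm *mm)
{
	/* ... copy VMAs ... */
	return arch_dup_mmap(oldmm, mm);	/* was a void call before */
}

int main(void)
{
	struct mm parent = { .have_ldt = 1 }, child = { 0 };
	int ret = dup_mmap(&parent, &child);

	printf("dup_mmap: %d (%s)\n", ret, ret ? "fork fails" : "ok");
	return 0;
}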
+3 -6
tools/testing/selftests/x86/ldt_gdt.c
··· 627 627 static int finish_exec_test(void) 628 628 { 629 629 /* 630 - * In a sensible world, this would be check_invalid_segment(0, 1); 631 - * For better or for worse, though, the LDT is inherited across exec. 632 - * We can probably change this safely, but for now we test it. 630 + * Older kernel versions did inherit the LDT on exec() which is 631 + * wrong because exec() starts from a clean state. 633 632 */ 634 - check_valid_segment(0, 1, 635 - AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, 636 - 42, true); 633 + check_invalid_segment(0, 1); 637 634 638 635 return nerrs ? 1 : 0; 639 636 }
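For reference, a minimal sketch of how a user-space test can probe descriptor validity, similar in spirit to the selftest's check_invalid_segment() but not its actual implementation: LAR sets ZF only when the selector names a valid descriptor that is accessible at the current privilege level.

#include <stdint.h>
#include <stdio.h>

/* Returns 1 if the descriptor at 'index' (in the LDT when ldt != 0) is
 * valid and accessible at CPL 3, 0 otherwise. */
static int segment_valid(uint16_t index, int ldt)
{
	uint32_t sel = (index << 3) | ((!!ldt) << 2) | 3;	/* RPL 3 */
	uint32_t ar;
	uint8_t ok;

	asm volatile ("lar %[sel], %[ar]\n\t"
		      "setz %[ok]"
		      : [ar] "=r" (ar), [ok] "=q" (ok)
		      : [sel] "r" (sel)
		      : "cc");
	(void)ar;
	return ok;
}

int main(void)
{
	/* After this series an exec()'d child starts with an empty LDT, so
	 * index 0 in the LDT should report invalid, which is what the
	 * check_invalid_segment(0, 1) call above asserts. */
	printf("LDT index 0 valid: %d\n", segment_valid(0, 1));
	return 0;
}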