Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
"Speculation:

- Make the microcode check more robust

- Make the L1TF memory limit depend on the internal cache physical
address space and not on the CPUID advertised physical address
space, which might be significantly smaller. This avoids disabling
the L1TF mitigation on machines which utilize the full physical address space.

- Fix the GDT mapping for EFI calls on 32bit PTI

- Fix the MCE nospec implementation to prevent #GP

Fixes and robustness:

- Use the proper operand order for LSL in the VDSO

- Prevent NMI uaccess race against CR3 switching

- Add a lockdep check to verify that text_mutex is held in
text_poke() functions

- Repair the fallout of giving native_restore_fl() a prototype

- Prevent kernel memory dumps based on usermode RIP

- Wipe KASAN stack shadow before rewinding the stack to prevent false
positives

- Move the ASM GOTO enforcement to the actual build stage to allow
user API header extraction without a compiler

- Fix a section mismatch introduced by the on demand VDSO mapping
change

Miscellaneous:

- Trivial typo, GCC quirk removal and CC_SET/OUT() cleanups"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/pti: Fix section mismatch warning/error
x86/vdso: Fix lsl operand order
x86/mce: Fix set_mce_nospec() to avoid #GP fault
x86/efi: Load fixmap GDT in efi_call_phys_epilog()
x86/nmi: Fix NMI uaccess race against CR3 switching
x86: Allow generating user-space headers without a compiler
x86/dumpstack: Don't dump kernel memory based on usermode RIP
x86/asm: Use CC_SET()/CC_OUT() in __gen_sigismember()
x86/alternatives: Lockdep-enforce text_mutex in text_poke*()
x86/entry/64: Wipe KASAN stack shadow before rewind_stack_do_exit()
x86/irqflags: Mark native_restore_fl extern inline
x86/build: Remove jump label quirk for GCC older than 4.5.2
x86/Kconfig: Fix trivial typo
x86/speculation/l1tf: Increase l1tf memory limit for Nehalem+
x86/spectre: Add missing family 6 check to microcode check

+167 -50
+1 -1
arch/x86/Kconfig
··· 2843 2843 This option, if enabled, marks VGA/VBE/EFI framebuffers as generic 2844 2844 framebuffers so the new generic system-framebuffer drivers can be 2845 2845 used on x86. If the framebuffer is not compatible with the generic 2846 - modes, it is adverticed as fallback platform framebuffer so legacy 2846 + modes, it is advertised as fallback platform framebuffer so legacy 2847 2847 drivers like efifb, vesafb and uvesafb can pick it up. 2848 2848 If this option is not selected, all system framebuffers are always 2849 2849 marked as fallback platform framebuffers as usual.
+7 -16
arch/x86/Makefile
··· 175 175 endif 176 176 endif 177 177 178 - ifndef CC_HAVE_ASM_GOTO 179 - $(error Compiler lacks asm-goto support.) 180 - endif 181 - 182 - # 183 - # Jump labels need '-maccumulate-outgoing-args' for gcc < 4.5.2 to prevent a 184 - # GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46226). There's no way 185 - # to test for this bug at compile-time because the test case needs to execute, 186 - # which is a no-go for cross compilers. So check the GCC version instead. 187 - # 188 - ifdef CONFIG_JUMP_LABEL 189 - ifneq ($(ACCUMULATE_OUTGOING_ARGS), 1) 190 - ACCUMULATE_OUTGOING_ARGS = $(call cc-if-fullversion, -lt, 040502, 1) 191 - endif 192 - endif 193 - 194 178 ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) 195 179 # This compiler flag is not supported by Clang: 196 180 KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) ··· 295 311 PHONY += vdso_install 296 312 vdso_install: 297 313 $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ 314 + 315 + archprepare: checkbin 316 + checkbin: 317 + ifndef CC_HAVE_ASM_GOTO 318 + @echo Compiler lacks asm-goto support. 319 + @exit 1 320 + endif 298 321 299 322 archclean: 300 323 $(Q)rm -rf $(objtree)/arch/i386
+1 -1
arch/x86/events/core.c
··· 2465 2465 2466 2466 perf_callchain_store(entry, regs->ip); 2467 2467 2468 - if (!current->mm) 2468 + if (!nmi_uaccess_okay()) 2469 2469 return; 2470 2470 2471 2471 if (perf_callchain_user32(regs, entry))
+2 -1
arch/x86/include/asm/irqflags.h
··· 33 33 return flags; 34 34 } 35 35 36 - static inline void native_restore_fl(unsigned long flags) 36 + extern inline void native_restore_fl(unsigned long flags); 37 + extern inline void native_restore_fl(unsigned long flags) 37 38 { 38 39 asm volatile("push %0 ; popf" 39 40 : /* no output */
+3 -1
arch/x86/include/asm/processor.h
··· 132 132 /* Index into per_cpu list: */ 133 133 u16 cpu_index; 134 134 u32 microcode; 135 + /* Address space bits used by the cache internally */ 136 + u8 x86_cache_bits; 135 137 unsigned initialized : 1; 136 138 } __randomize_layout; 137 139 ··· 185 183 186 184 static inline unsigned long long l1tf_pfn_limit(void) 187 185 { 188 - return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT); 186 + return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); 189 187 } 190 188 191 189 extern void early_cpu_init(void);
+4 -3
arch/x86/include/asm/signal.h
··· 39 39 40 40 #define __ARCH_HAS_SA_RESTORER 41 41 42 + #include <asm/asm.h> 42 43 #include <uapi/asm/sigcontext.h> 43 44 44 45 #ifdef __i386__ ··· 87 86 88 87 static inline int __gen_sigismember(sigset_t *set, int _sig) 89 88 { 90 - unsigned char ret; 91 - asm("btl %2,%1\n\tsetc %0" 92 - : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); 89 + bool ret; 90 + asm("btl %2,%1" CC_SET(c) 91 + : CC_OUT(c) (ret) : "m"(*set), "Ir"(_sig-1)); 93 92 return ret; 94 93 } 95 94
+1 -1
arch/x86/include/asm/stacktrace.h
··· 111 111 return (unsigned long)frame; 112 112 } 113 113 114 - void show_opcodes(u8 *rip, const char *loglvl); 114 + void show_opcodes(struct pt_regs *regs, const char *loglvl); 115 115 void show_ip(struct pt_regs *regs, const char *loglvl); 116 116 #endif /* _ASM_X86_STACKTRACE_H */
+40
arch/x86/include/asm/tlbflush.h
··· 175 175 * are on. This means that it may not match current->active_mm, 176 176 * which will contain the previous user mm when we're in lazy TLB 177 177 * mode even if we've already switched back to swapper_pg_dir. 178 + * 179 + * During switch_mm_irqs_off(), loaded_mm will be set to 180 + * LOADED_MM_SWITCHING during the brief interrupts-off window 181 + * when CR3 and loaded_mm would otherwise be inconsistent. This 182 + * is for nmi_uaccess_okay()'s benefit. 178 183 */ 179 184 struct mm_struct *loaded_mm; 185 + 186 + #define LOADED_MM_SWITCHING ((struct mm_struct *)1) 187 + 180 188 u16 loaded_mm_asid; 181 189 u16 next_asid; 182 190 /* last user mm's ctx id */ ··· 253 245 struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; 254 246 }; 255 247 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); 248 + 249 + /* 250 + * Blindly accessing user memory from NMI context can be dangerous 251 + * if we're in the middle of switching the current user task or 252 + * switching the loaded mm. It can also be dangerous if we 253 + * interrupted some kernel code that was temporarily using a 254 + * different mm. 255 + */ 256 + static inline bool nmi_uaccess_okay(void) 257 + { 258 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 259 + struct mm_struct *current_mm = current->mm; 260 + 261 + VM_WARN_ON_ONCE(!loaded_mm); 262 + 263 + /* 264 + * The condition we want to check is 265 + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, 266 + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() 267 + * is supposed to be reasonably fast. 268 + * 269 + * Instead, we check the almost equivalent but somewhat conservative 270 + * condition below, and we rely on the fact that switch_mm_irqs_off() 271 + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. 
272 + */ 273 + if (loaded_mm != current_mm) 274 + return false; 275 + 276 + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); 277 + 278 + return true; 279 + } 256 280 257 281 /* Initialize cr4 shadow for this CPU. */ 258 282 static inline void cr4_init_shadow(void)
+1 -1
arch/x86/include/asm/vgtod.h
··· 93 93 * 94 94 * If RDPID is available, use it. 95 95 */ 96 - alternative_io ("lsl %[p],%[seg]", 96 + alternative_io ("lsl %[seg],%[p]", 97 97 ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ 98 98 X86_FEATURE_RDPID, 99 99 [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
+5 -4
arch/x86/kernel/alternative.c
··· 684 684 * It means the size must be writable atomically and the address must be aligned 685 685 * in a way that permits an atomic write. It also makes sure we fit on a single 686 686 * page. 687 - * 688 - * Note: Must be called under text_mutex. 689 687 */ 690 688 void *text_poke(void *addr, const void *opcode, size_t len) 691 689 { ··· 697 699 * pages as they are not yet initialized. 698 700 */ 699 701 BUG_ON(!after_bootmem); 702 + 703 + lockdep_assert_held(&text_mutex); 700 704 701 705 if (!core_kernel_text((unsigned long)addr)) { 702 706 pages[0] = vmalloc_to_page(addr); ··· 782 782 * - replace the first byte (int3) by the first byte of 783 783 * replacing opcode 784 784 * - sync cores 785 - * 786 - * Note: must be called under text_mutex. 787 785 */ 788 786 void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) 789 787 { ··· 790 792 bp_int3_handler = handler; 791 793 bp_int3_addr = (u8 *)addr + sizeof(int3); 792 794 bp_patching_in_progress = true; 795 + 796 + lockdep_assert_held(&text_mutex); 797 + 793 798 /* 794 799 * Corresponding read barrier in int3 notifier for making sure the 795 800 * in_progress and handler are correctly ordered wrt. patching.
+41 -5
arch/x86/kernel/cpu/bugs.c
··· 668 668 enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 669 669 EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); 670 670 671 + /* 672 + * These CPUs all support 44bits physical address space internally in the 673 + * cache but CPUID can report a smaller number of physical address bits. 674 + * 675 + * The L1TF mitigation uses the top most address bit for the inversion of 676 + * non present PTEs. When the installed memory reaches into the top most 677 + * address bit due to memory holes, which has been observed on machines 678 + * which report 36bits physical address bits and have 32G RAM installed, 679 + * then the mitigation range check in l1tf_select_mitigation() triggers. 680 + * This is a false positive because the mitigation is still possible due to 681 + * the fact that the cache uses 44bit internally. Use the cache bits 682 + * instead of the reported physical bits and adjust them on the affected 683 + * machines to 44bit if the reported bits are less than 44. 684 + */ 685 + static void override_cache_bits(struct cpuinfo_x86 *c) 686 + { 687 + if (c->x86 != 6) 688 + return; 689 + 690 + switch (c->x86_model) { 691 + case INTEL_FAM6_NEHALEM: 692 + case INTEL_FAM6_WESTMERE: 693 + case INTEL_FAM6_SANDYBRIDGE: 694 + case INTEL_FAM6_IVYBRIDGE: 695 + case INTEL_FAM6_HASWELL_CORE: 696 + case INTEL_FAM6_HASWELL_ULT: 697 + case INTEL_FAM6_HASWELL_GT3E: 698 + case INTEL_FAM6_BROADWELL_CORE: 699 + case INTEL_FAM6_BROADWELL_GT3E: 700 + case INTEL_FAM6_SKYLAKE_MOBILE: 701 + case INTEL_FAM6_SKYLAKE_DESKTOP: 702 + case INTEL_FAM6_KABYLAKE_MOBILE: 703 + case INTEL_FAM6_KABYLAKE_DESKTOP: 704 + if (c->x86_cache_bits < 44) 705 + c->x86_cache_bits = 44; 706 + break; 707 + } 708 + } 709 + 671 710 static void __init l1tf_select_mitigation(void) 672 711 { 673 712 u64 half_pa; 674 713 675 714 if (!boot_cpu_has_bug(X86_BUG_L1TF)) 676 715 return; 716 + 717 + override_cache_bits(&boot_cpu_data); 677 718 678 719 switch (l1tf_mitigation) { 679 720 case 
L1TF_MITIGATION_OFF: ··· 735 694 return; 736 695 #endif 737 696 738 - /* 739 - * This is extremely unlikely to happen because almost all 740 - * systems have far more MAX_PA/2 than RAM can be fit into 741 - * DIMM slots. 742 - */ 743 697 half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; 744 698 if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { 745 699 pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+1
arch/x86/kernel/cpu/common.c
··· 919 919 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) 920 920 c->x86_phys_bits = 36; 921 921 #endif 922 + c->x86_cache_bits = c->x86_phys_bits; 922 923 } 923 924 924 925 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+3
arch/x86/kernel/cpu/intel.c
··· 150 150 if (cpu_has(c, X86_FEATURE_HYPERVISOR)) 151 151 return false; 152 152 153 + if (c->x86 != 6) 154 + return false; 155 + 153 156 for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { 154 157 if (c->x86_model == spectre_bad_microcodes[i].model && 155 158 c->x86_stepping == spectre_bad_microcodes[i].stepping)
+17 -3
arch/x86/kernel/dumpstack.c
··· 17 17 #include <linux/bug.h> 18 18 #include <linux/nmi.h> 19 19 #include <linux/sysfs.h> 20 + #include <linux/kasan.h> 20 21 21 22 #include <asm/cpu_entry_area.h> 22 23 #include <asm/stacktrace.h> ··· 90 89 * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random 91 90 * guesstimate in attempt to achieve all of the above. 92 91 */ 93 - void show_opcodes(u8 *rip, const char *loglvl) 92 + void show_opcodes(struct pt_regs *regs, const char *loglvl) 94 93 { 95 94 #define PROLOGUE_SIZE 42 96 95 #define EPILOGUE_SIZE 21 97 96 #define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) 98 97 u8 opcodes[OPCODE_BUFSIZE]; 98 + unsigned long prologue = regs->ip - PROLOGUE_SIZE; 99 + bool bad_ip; 99 100 100 - if (probe_kernel_read(opcodes, rip - PROLOGUE_SIZE, OPCODE_BUFSIZE)) { 101 + /* 102 + * Make sure userspace isn't trying to trick us into dumping kernel 103 + * memory by pointing the userspace instruction pointer at it. 104 + */ 105 + bad_ip = user_mode(regs) && 106 + __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX); 107 + 108 + if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue, 109 + OPCODE_BUFSIZE)) { 101 110 printk("%sCode: Bad RIP value.\n", loglvl); 102 111 } else { 103 112 printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" ··· 123 112 #else 124 113 printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); 125 114 #endif 126 - show_opcodes((u8 *)regs->ip, loglvl); 115 + show_opcodes(regs, loglvl); 127 116 } 128 117 129 118 void show_iret_regs(struct pt_regs *regs) ··· 357 346 * We're not going to return, but we might be on an IST stack or 358 347 * have very little stack space left. Rewind the stack and kill 359 348 * the task. 349 + * Before we rewind the stack, we have to tell KASAN that we're going to 350 + * reuse the task stack and that existing poisons are invalid. 360 351 */ 352 + kasan_unpoison_task_stack(current); 361 353 rewind_stack_do_exit(signr); 362 354 } 363 355 NOKPROBE_SYMBOL(oops_end);
+5
arch/x86/lib/usercopy.c
··· 7 7 #include <linux/uaccess.h> 8 8 #include <linux/export.h> 9 9 10 + #include <asm/tlbflush.h> 11 + 10 12 /* 11 13 * We rely on the nested NMI work to allow atomic faults from the NMI path; the 12 14 * nested NMI paths are careful to preserve CR2. ··· 19 17 unsigned long ret; 20 18 21 19 if (__range_not_ok(from, n, TASK_SIZE)) 20 + return n; 21 + 22 + if (!nmi_uaccess_okay()) 22 23 return n; 23 24 24 25 /*
+1 -1
arch/x86/mm/fault.c
··· 837 837 838 838 printk(KERN_CONT "\n"); 839 839 840 - show_opcodes((u8 *)regs->ip, loglvl); 840 + show_opcodes(regs, loglvl); 841 841 } 842 842 843 843 static void
+24 -1
arch/x86/mm/pageattr.c
··· 1420 1420 return 0; 1421 1421 } 1422 1422 1423 + /* 1424 + * Machine check recovery code needs to change cache mode of poisoned 1425 + * pages to UC to avoid speculative access logging another error. But 1426 + * passing the address of the 1:1 mapping to set_memory_uc() is a fine 1427 + * way to encourage a speculative access. So we cheat and flip the top 1428 + * bit of the address. This works fine for the code that updates the 1429 + * page tables. But at the end of the process we need to flush the cache 1430 + * and the non-canonical address causes a #GP fault when used by the 1431 + * CLFLUSH instruction. 1432 + * 1433 + * But in the common case we already have a canonical address. This code 1434 + * will fix the top bit if needed and is a no-op otherwise. 1435 + */ 1436 + static inline unsigned long make_addr_canonical_again(unsigned long addr) 1437 + { 1438 + #ifdef CONFIG_X86_64 1439 + return (long)(addr << 1) >> 1; 1440 + #else 1441 + return addr; 1442 + #endif 1443 + } 1444 + 1445 + 1423 1446 static int change_page_attr_set_clr(unsigned long *addr, int numpages, 1424 1447 pgprot_t mask_set, pgprot_t mask_clr, 1425 1448 int force_split, int in_flag, ··· 1488 1465 * Save address for cache flush. *addr is modified in the call 1489 1466 * to __change_page_attr_set_clr() below. 1490 1467 */ 1491 - baddr = *addr; 1468 + baddr = make_addr_canonical_again(*addr); 1492 1469 } 1493 1470 1494 1471 /* Must avoid aliasing mappings in the highmem code */
+1 -1
arch/x86/mm/pti.c
··· 248 248 * 249 249 * Returns a pointer to a PTE on success, or NULL on failure. 250 250 */ 251 - static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) 251 + static pte_t *pti_user_pagetable_walk_pte(unsigned long address) 252 252 { 253 253 gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); 254 254 pmd_t *pmd;
+7
arch/x86/mm/tlb.c
··· 305 305 306 306 choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); 307 307 308 + /* Let nmi_uaccess_okay() know that we're changing CR3. */ 309 + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); 310 + barrier(); 311 + 308 312 if (need_flush) { 309 313 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 310 314 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); ··· 338 334 */ 339 335 if (next != &init_mm) 340 336 this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); 337 + 338 + /* Make sure we write CR3 before loaded_mm. */ 339 + barrier(); 341 340 342 341 this_cpu_write(cpu_tlbstate.loaded_mm, next); 343 342 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+2 -6
arch/x86/platform/efi/efi_32.c
··· 85 85 86 86 void __init efi_call_phys_epilog(pgd_t *save_pgd) 87 87 { 88 - struct desc_ptr gdt_descr; 89 - 90 - gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0); 91 - gdt_descr.size = GDT_SIZE - 1; 92 - load_gdt(&gdt_descr); 93 - 94 88 load_cr3(save_pgd); 95 89 __flush_tlb_all(); 90 + 91 + load_fixmap_gdt(0); 96 92 } 97 93 98 94 void __init efi_runtime_update_mappings(void)
-4
scripts/Kbuild.include
··· 153 153 # Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) 154 154 cc-ifversion = $(shell [ $(cc-version) $(1) $(2) ] && echo $(3) || echo $(4)) 155 155 156 - # cc-if-fullversion 157 - # Usage: EXTRA_CFLAGS += $(call cc-if-fullversion, -lt, 040502, -O1) 158 - cc-if-fullversion = $(shell [ $(cc-fullversion) $(1) $(2) ] && echo $(3) || echo $(4)) 159 - 160 156 # cc-ldoption 161 157 # Usage: ldflags += $(call cc-ldoption, -Wl$(comma)--hash-style=both) 162 158 cc-ldoption = $(call try-run,\