Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-fred-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 FRED updates from Thomas Gleixner:

- Enable FRED right after init_mem_mapping() because at that point the
early IDT fault handler is replaced by the real fault handler. The
real fault handler retrieves the faulting address from the stack
frame and not from CR2 when the FRED feature is set. But that
obviously only works when FRED is enabled in the CPU as well.

- Set SS to __KERNEL_DS when enabling FRED to prevent a corner case
where ERETS can observe an SS mismatch and raise a #GP.

* tag 'x86-fred-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/entry: Set FRED RSP0 on return to userspace instead of context switch
x86/msr: Switch between WRMSRNS and WRMSR with the alternatives mechanism
x86/entry: Test ti_work for zero before processing individual bits
x86/fred: Set SS to __KERNEL_DS when enabling FRED
x86/fred: Enable FRED right after init_mem_mapping()
x86/fred: Move FRED RSP initialization into a separate function
x86/fred: Parse cmdline param "fred=" in cpu_parse_early_param()

+111 -64
+11 -2
arch/x86/include/asm/entry-common.h
··· 8 8 #include <asm/nospec-branch.h> 9 9 #include <asm/io_bitmap.h> 10 10 #include <asm/fpu/api.h> 11 + #include <asm/fred.h> 11 12 12 13 /* Check that the stack and regs on entry from user mode are sane. */ 13 14 static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) ··· 45 44 } 46 45 #define arch_enter_from_user_mode arch_enter_from_user_mode 47 46 48 - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 49 - unsigned long ti_work) 47 + static inline void arch_exit_work(unsigned long ti_work) 50 48 { 51 49 if (ti_work & _TIF_USER_RETURN_NOTIFY) 52 50 fire_user_return_notifiers(); ··· 56 56 fpregs_assert_state_consistent(); 57 57 if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) 58 58 switch_fpu_return(); 59 + } 60 + 61 + static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, 62 + unsigned long ti_work) 63 + { 64 + if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) 65 + arch_exit_work(ti_work); 66 + 67 + fred_update_rsp0(); 59 68 60 69 #ifdef CONFIG_COMPAT 61 70 /*
+22 -1
arch/x86/include/asm/fred.h
··· 36 36 37 37 #ifdef CONFIG_X86_FRED 38 38 #include <linux/kernel.h> 39 + #include <linux/sched/task_stack.h> 39 40 40 41 #include <asm/ptrace.h> 41 42 ··· 85 84 } 86 85 87 86 void cpu_init_fred_exceptions(void); 87 + void cpu_init_fred_rsps(void); 88 88 void fred_complete_exception_setup(void); 89 89 90 + DECLARE_PER_CPU(unsigned long, fred_rsp0); 91 + 92 + static __always_inline void fred_sync_rsp0(unsigned long rsp0) 93 + { 94 + __this_cpu_write(fred_rsp0, rsp0); 95 + } 96 + 97 + static __always_inline void fred_update_rsp0(void) 98 + { 99 + unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE; 100 + 101 + if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) { 102 + wrmsrns(MSR_IA32_FRED_RSP0, rsp0); 103 + __this_cpu_write(fred_rsp0, rsp0); 104 + } 105 + } 90 106 #else /* CONFIG_X86_FRED */ 91 107 static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } 92 108 static inline void cpu_init_fred_exceptions(void) { } 109 + static inline void cpu_init_fred_rsps(void) { } 93 110 static inline void fred_complete_exception_setup(void) { } 94 - static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } 111 + static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } 112 + static inline void fred_sync_rsp0(unsigned long rsp0) { } 113 + static inline void fred_update_rsp0(void) { } 95 114 #endif /* CONFIG_X86_FRED */ 96 115 #endif /* !__ASSEMBLY__ */ 97 116
+11 -14
arch/x86/include/asm/msr.h
··· 99 99 : : "c" (msr), "a"(low), "d" (high) : "memory"); 100 100 } 101 101 102 - /* 103 - * WRMSRNS behaves exactly like WRMSR with the only difference being 104 - * that it is not a serializing instruction by default. 105 - */ 106 - static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) 107 - { 108 - /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ 109 - asm volatile("1: .byte 0x0f,0x01,0xc6\n" 110 - "2:\n" 111 - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) 112 - : : "c" (msr), "a"(low), "d" (high)); 113 - } 114 - 115 102 #define native_rdmsr(msr, val1, val2) \ 116 103 do { \ 117 104 u64 __val = __rdmsr((msr)); \ ··· 299 312 300 313 #endif /* !CONFIG_PARAVIRT_XXL */ 301 314 315 + /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ 316 + #define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) 317 + 318 + /* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ 302 319 static __always_inline void wrmsrns(u32 msr, u64 val) 303 320 { 304 - __wrmsrns(msr, val, val >> 32); 321 + /* 322 + * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant 323 + * DS prefix to avoid a trailing NOP. 324 + */ 325 + asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) 326 + "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) 327 + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); 305 328 } 306 329 307 330 /*
+2 -1
arch/x86/include/asm/processor.h
··· 582 582 extern void load_direct_gdt(int); 583 583 extern void load_fixmap_gdt(int); 584 584 extern void cpu_init(void); 585 - extern void cpu_init_exception_handling(void); 585 + extern void cpu_init_exception_handling(bool boot_cpu); 586 + extern void cpu_init_replace_early_idt(void); 586 587 extern void cr4_init(void); 587 588 588 589 extern void set_task_blockstep(struct task_struct *task, bool on);
+1 -5
arch/x86/include/asm/switch_to.h
··· 70 70 #ifdef CONFIG_X86_32 71 71 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); 72 72 #else 73 - if (cpu_feature_enabled(X86_FEATURE_FRED)) { 74 - /* WRMSRNS is a baseline feature for FRED. */ 75 - wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); 76 - } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { 73 + if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV)) 77 74 /* Xen PV enters the kernel on the thread stack. */ 78 75 load_sp0(task_top_of_stack(task)); 79 - } 80 76 #endif 81 77 } 82 78
+20 -2
arch/x86/kernel/cpu/common.c
··· 1510 1510 if (cmdline_find_option_bool(boot_command_line, "nousershstk")) 1511 1511 setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); 1512 1512 1513 + /* Minimize the gap between FRED is available and available but disabled. */ 1514 + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); 1515 + if (arglen != 2 || strncmp(arg, "on", 2)) 1516 + setup_clear_cpu_cap(X86_FEATURE_FRED); 1517 + 1513 1518 arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); 1514 1519 if (arglen <= 0) 1515 1520 return; ··· 2176 2171 * Setup everything needed to handle exceptions from the IDT, including the IST 2177 2172 * exceptions which use paranoid_entry(). 2178 2173 */ 2179 - void cpu_init_exception_handling(void) 2174 + void cpu_init_exception_handling(bool boot_cpu) 2180 2175 { 2181 2176 struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); 2182 2177 int cpu = raw_smp_processor_id(); ··· 2195 2190 /* GHCB needs to be setup to handle #VC. */ 2196 2191 setup_ghcb(); 2197 2192 2193 + if (cpu_feature_enabled(X86_FEATURE_FRED)) { 2194 + /* The boot CPU has enabled FRED during early boot */ 2195 + if (!boot_cpu) 2196 + cpu_init_fred_exceptions(); 2197 + 2198 + cpu_init_fred_rsps(); 2199 + } else { 2200 + load_current_idt(); 2201 + } 2202 + } 2203 + 2204 + void __init cpu_init_replace_early_idt(void) 2205 + { 2198 2206 if (cpu_feature_enabled(X86_FEATURE_FRED)) 2199 2207 cpu_init_fred_exceptions(); 2200 2208 else 2201 - load_current_idt(); 2209 + idt_setup_early_pf(); 2202 2210 } 2203 2211 2204 2212 /*
-1
arch/x86/kernel/cpu/cpuid-deps.c
··· 83 83 { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, 84 84 { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, 85 85 { X86_FEATURE_FRED, X86_FEATURE_LKGS }, 86 - { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, 87 86 {} 88 87 }; 89 88
+36 -9
arch/x86/kernel/fred.c
··· 21 21 22 22 #define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) 23 23 24 + DEFINE_PER_CPU(unsigned long, fred_rsp0); 25 + EXPORT_PER_CPU_SYMBOL(fred_rsp0); 26 + 24 27 void cpu_init_fred_exceptions(void) 25 28 { 26 29 /* When FRED is enabled by default, remove this log message */ 27 30 pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); 31 + 32 + /* 33 + * If a kernel event is delivered before a CPU goes to user level for 34 + * the first time, its SS is NULL thus NULL is pushed into the SS field 35 + * of the FRED stack frame. But before ERETS is executed, the CPU may 36 + * context switch to another task and go to user level. Then when the 37 + * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later 38 + * when ERETS is executed to return from the kernel event handler, a #GP 39 + * fault is generated because SS doesn't match the SS saved in the FRED 40 + * stack frame. 41 + * 42 + * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs. 43 + */ 44 + loadsegment(ss, __KERNEL_DS); 28 45 29 46 wrmsrl(MSR_IA32_FRED_CONFIG, 30 47 /* Reserve for CALL emulation */ ··· 49 32 FRED_CONFIG_INT_STKLVL(0) | 50 33 FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); 51 34 35 + wrmsrl(MSR_IA32_FRED_STKLVLS, 0); 36 + wrmsrl(MSR_IA32_FRED_RSP0, 0); 37 + wrmsrl(MSR_IA32_FRED_RSP1, 0); 38 + wrmsrl(MSR_IA32_FRED_RSP2, 0); 39 + wrmsrl(MSR_IA32_FRED_RSP3, 0); 40 + 41 + /* Enable FRED */ 42 + cr4_set_bits(X86_CR4_FRED); 43 + /* Any further IDT use is a bug */ 44 + idt_invalidate(); 45 + 46 + /* Use int $0x80 for 32-bit system calls in FRED mode */ 47 + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); 48 + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); 49 + } 50 + 51 + /* Must be called after setup_cpu_entry_areas() */ 52 + void cpu_init_fred_rsps(void) 53 + { 52 54 /* 53 55 * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* 54 56 * (remember that user space faults are always taken on stack level 0) ··· 83 47 
wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); 84 48 wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); 85 49 wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); 86 - 87 - /* Enable FRED */ 88 - cr4_set_bits(X86_CR4_FRED); 89 - /* Any further IDT use is a bug */ 90 - idt_invalidate(); 91 - 92 - /* Use int $0x80 for 32-bit system calls in FRED mode */ 93 - setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); 94 - setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); 95 50 }
+6 -1
arch/x86/kernel/setup.c
··· 1039 1039 1040 1040 init_mem_mapping(); 1041 1041 1042 - idt_setup_early_pf(); 1042 + /* 1043 + * init_mem_mapping() relies on the early IDT page fault handling. 1044 + * Now either enable FRED or install the real page fault handler 1045 + * for 64-bit in the IDT. 1046 + */ 1047 + cpu_init_replace_early_idt(); 1043 1048 1044 1049 /* 1045 1050 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+1 -1
arch/x86/kernel/smpboot.c
··· 246 246 __flush_tlb_all(); 247 247 } 248 248 249 - cpu_init_exception_handling(); 249 + cpu_init_exception_handling(false); 250 250 251 251 /* 252 252 * Load the microcode before reaching the AP alive synchronization
+1 -27
arch/x86/kernel/traps.c
··· 1451 1451 } 1452 1452 #endif 1453 1453 1454 - /* Do not enable FRED by default yet. */ 1455 - static bool enable_fred __ro_after_init = false; 1456 - 1457 - #ifdef CONFIG_X86_FRED 1458 - static int __init fred_setup(char *str) 1459 - { 1460 - if (!str) 1461 - return -EINVAL; 1462 - 1463 - if (!cpu_feature_enabled(X86_FEATURE_FRED)) 1464 - return 0; 1465 - 1466 - if (!strcmp(str, "on")) 1467 - enable_fred = true; 1468 - else if (!strcmp(str, "off")) 1469 - enable_fred = false; 1470 - else 1471 - pr_warn("invalid FRED option: 'fred=%s'\n", str); 1472 - return 0; 1473 - } 1474 - early_param("fred", fred_setup); 1475 - #endif 1476 - 1477 1454 void __init trap_init(void) 1478 1455 { 1479 - if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) 1480 - setup_clear_cpu_cap(X86_FEATURE_FRED); 1481 - 1482 1456 /* Init cpu_entry_area before IST entries are set up */ 1483 1457 setup_cpu_entry_areas(); 1484 1458 ··· 1460 1486 sev_es_init_vc_handling(); 1461 1487 1462 1488 /* Initialize TSS before setting up traps so ISTs work */ 1463 - cpu_init_exception_handling(); 1489 + cpu_init_exception_handling(true); 1464 1490 1465 1491 /* Setup traps as cpu_init() might #GP */ 1466 1492 if (!cpu_feature_enabled(X86_FEATURE_FRED))