Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

- Fix a couple of SWAPGS fencing issues in the x86 entry code

- Use the proper operand types in __{get,put}_user() to prevent
  truncation in SEV-ES string I/O (see the sketch after this list)

- Make sure the kernel mappings are present in trampoline_pgd in order
to prevent any potential accesses to unmapped memory after switching
to it

- Fix a trivial list corruption in objtool's pv_ops validation

- Disable the clocksource watchdog for the TSC on platforms which claim
  that the TSC is constant-frequency, does not stop in sleep states, the
  CPU has TSC_ADJUST, and the platform has at most two sockets, to
  prevent erroneous marking of the TSC as unstable

- Make sure TSC_ADJUST is always checked, not only when going idle

- Prevent a stack leak by initializing struct _fpx_sw_bytes properly in
the FPU code

- Fix INTEL_FAM6_RAPTORLAKE define naming to adhere to the convention
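
The string I/O bullet comes down to how __{get,put}_user() work: the
access width is taken from the type of the pointer operand, so casting
the target to char __user * made every store one byte wide and silently
truncated word, dword and qword I/O. A minimal user-space sketch of that
type-driven width (the PUT() macro is a hypothetical analogue, not the
kernel implementation):

#include <stdio.h>

/* Like __put_user(), the store width is sizeof(*(ptr)). */
#define PUT(x, ptr)	(*(ptr) = (x))

int main(void)
{
	unsigned int value = 0x11223344;
	union {
		unsigned char b[4];
		unsigned int w;
	} dst = { { 0 } };

	PUT(value, &dst.b[0]);	/* old bug: char-sized store, only one byte lands */
	printf("char store: %02x %02x %02x %02x\n",
	       dst.b[0], dst.b[1], dst.b[2], dst.b[3]);

	dst.w = 0;
	PUT(value, &dst.w);	/* fixed: pointer type matches the operand size */
	printf("u32  store: %02x %02x %02x %02x\n",
	       dst.b[0], dst.b[1], dst.b[2], dst.b[3]);

	return 0;
}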

* tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/xen: Add xenpv_restore_regs_and_return_to_usermode()
x86/entry: Use the correct fence macro after swapgs in kernel CR3
x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()
x86/sev: Fix SEV-ES INS/OUTS instructions for word, dword, and qword
x86/64/mm: Map all kernel memory into trampoline_pgd
objtool: Fix pv_ops noinstr validation
x86/tsc: Disable clocksource watchdog for TSC on qualified platforms
x86/tsc: Add a timer to make sure TSC_adjust is always checked
x86/fpu/signal: Initialize sw_bytes in save_xstate_epilog()
x86/cpu: Drop spurious underscore from RAPTOR_LAKE #define

10 files changed: +160 -44

arch/x86/entry/entry_64.S (+18 -19)
···
	ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
	POP_REGS pop_rdi=0

	/*
···
.Lparanoid_entry_checkgs:
	/* EBX = 1 -> kernel GSBASE active, no restore required */
	movl	$1, %ebx
+
	/*
	 * The kernel-enforced convention is a negative GSBASE indicates
	 * a kernel value. No SWAPGS needed on entry and exit.
···
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
-	jns	.Lparanoid_entry_swapgs
-	ret
-
-.Lparanoid_entry_swapgs:
-	swapgs
-
-	/*
-	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
-	 * unconditional CR3 write, even in the PTI case. So do an lfence
-	 * to prevent GS speculation, regardless of whether PTI is enabled.
-	 */
-	FENCE_SWAPGS_KERNEL_ENTRY
+	js	.Lparanoid_kernel_gsbase

	/* EBX = 0 -> SWAPGS required on exit */
	xorl	%ebx, %ebx
+	swapgs
+.Lparanoid_kernel_gsbase:
+
+	FENCE_SWAPGS_KERNEL_ENTRY
	ret
SYM_CODE_END(paranoid_entry)

···
	pushq	%r12
	ret

-.Lerror_entry_done_lfence:
-	FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
-	ret
-
	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
···
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
-	FENCE_SWAPGS_USER_ENTRY
-	jmp .Lerror_entry_done
+
+	/*
+	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+	 * kernel or user gsbase.
+	 */
+.Lerror_entry_done_lfence:
+	FENCE_SWAPGS_KERNEL_ENTRY
+	ret

.Lbstep_iret:
	/* Fix truncated RIP */

arch/x86/include/asm/intel-family.h (+1 -1)
···
#define INTEL_FAM6_ALDERLAKE		0x97	/* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L		0x9A	/* Golden Cove / Gracemont */

-#define INTEL_FAM6_RAPTOR_LAKE		0xB7
+#define INTEL_FAM6_RAPTORLAKE		0xB7

/* "Small Core" Processors (Atom) */


arch/x86/kernel/fpu/signal.c (+1 -1)
···
			 struct fpstate *fpstate)
{
	struct xregs_state __user *x = buf;
-	struct _fpx_sw_bytes sw_bytes;
+	struct _fpx_sw_bytes sw_bytes = {};
	u32 xfeatures;
	int err;

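The one-line change above matters because save_xstate_epilog() copies
sw_bytes out to user space; any field it never writes would otherwise
carry stale kernel stack bytes. A user-space sketch of the pattern,
using a hypothetical stand-in for struct _fpx_sw_bytes:

#include <stdio.h>

struct sw_bytes_like {			/* hypothetical stand-in */
	unsigned int magic;
	unsigned int extended_size;
	unsigned int reserved[7];	/* never written explicitly */
};

int main(void)
{
	/*
	 * "= {}" zero-fills every member, matching the fix above, so
	 * copying the whole struct out (the kernel does this with
	 * copy_to_user()) cannot leak uninitialized stack contents
	 * through reserved[].
	 */
	struct sw_bytes_like sw = {};

	sw.magic = 0x1;			/* only some fields get set */
	sw.extended_size = sizeof(sw);

	printf("reserved[0] = %u (guaranteed zero)\n", sw.reserved[0]);
	return 0;
}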

arch/x86/kernel/sev.c (+39 -18)
···
				  char *dst, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
-	char __user *target = (char __user *)dst;
-	u64 d8;
-	u32 d4;
-	u16 d2;
-	u8 d1;

	/*
	 * This function uses __put_user() independent of whether kernel or user
···
	 * instructions here would cause infinite nesting.
	 */
	switch (size) {
-	case 1:
+	case 1: {
+		u8 d1;
+		u8 __user *target = (u8 __user *)dst;
+
		memcpy(&d1, buf, 1);
		if (__put_user(d1, target))
			goto fault;
		break;
+	}
-	case 2:
+	case 2: {
+		u16 d2;
+		u16 __user *target = (u16 __user *)dst;
+
		memcpy(&d2, buf, 2);
		if (__put_user(d2, target))
			goto fault;
		break;
+	}
-	case 4:
+	case 4: {
+		u32 d4;
+		u32 __user *target = (u32 __user *)dst;
+
		memcpy(&d4, buf, 4);
		if (__put_user(d4, target))
			goto fault;
		break;
+	}
-	case 8:
+	case 8: {
+		u64 d8;
+		u64 __user *target = (u64 __user *)dst;
+
		memcpy(&d8, buf, 8);
		if (__put_user(d8, target))
			goto fault;
		break;
+	}
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
···
				 char *src, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT;
-	char __user *s = (char __user *)src;
-	u64 d8;
-	u32 d4;
-	u16 d2;
-	u8 d1;

	/*
	 * This function uses __get_user() independent of whether kernel or user
···
	 * instructions here would cause infinite nesting.
	 */
	switch (size) {
-	case 1:
+	case 1: {
+		u8 d1;
+		u8 __user *s = (u8 __user *)src;
+
		if (__get_user(d1, s))
			goto fault;
		memcpy(buf, &d1, 1);
		break;
+	}
-	case 2:
+	case 2: {
+		u16 d2;
+		u16 __user *s = (u16 __user *)src;
+
		if (__get_user(d2, s))
			goto fault;
		memcpy(buf, &d2, 2);
		break;
+	}
-	case 4:
+	case 4: {
+		u32 d4;
+		u32 __user *s = (u32 __user *)src;
+
		if (__get_user(d4, s))
			goto fault;
		memcpy(buf, &d4, 4);
		break;
+	}
-	case 8:
+	case 8: {
+		u64 d8;
+		u64 __user *s = (u64 __user *)src;
		if (__get_user(d8, s))
			goto fault;
		memcpy(buf, &d8, 8);
		break;
+	}
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;

arch/x86/kernel/tsc.c (+24 -4)
···

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

+static void __init tsc_disable_clocksource_watchdog(void)
+{
+	clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+	clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
···
#endif
	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
		tsc_clocksource_reliable = 1;
+
+	/*
+	 * Disable the clocksource watchdog when the system has:
+	 *  - TSC running at constant frequency
+	 *  - TSC which does not stop in C-States
+	 *  - the TSC_ADJUST register which allows to detect even minimal
+	 *    modifications
+	 *  - not more than two sockets. As the number of sockets cannot be
+	 *    evaluated at the early boot stage where this has to be
+	 *    invoked, check the number of online memory nodes as a
+	 *    fallback solution which is an reasonable estimate.
+	 */
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+	    boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+	    nr_online_nodes <= 2)
+		tsc_disable_clocksource_watchdog();
}

/*
···
	if (tsc_unstable)
		goto unreg;

-	if (tsc_clocksource_reliable || no_tsc_watchdog)
-		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

···
	}

	if (tsc_clocksource_reliable || no_tsc_watchdog)
-		clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+		tsc_disable_clocksource_watchdog();

	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
	detect_art();

arch/x86/kernel/tsc_sync.c (+41)
···
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;

/*
 * TSC's on different sockets may be reset asynchronously.
···
		adj->warned = true;
	}
}
+
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL		(HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+	int next_cpu;
+
+	tsc_verify_tsc_adjust(false);
+
+	/* Run the check for all onlined CPUs in turn */
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+
+	tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+	add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+		return 0;
+
+	timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+	tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+	add_timer(&tsc_sync_check_timer);
+
+	return 0;
+}
+late_initcall(start_sync_check_timer);

static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
				   unsigned int cpu, bool bootcpu)
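
The new timer re-arms itself on a different CPU at each expiry, so every
online CPU eventually runs the TSC_ADJUST check even if none of them
ever goes idle. The CPU selection is simply "next online CPU, wrapping
around"; a user-space sketch of that rotation, where next_online() and
first_online() are hypothetical stand-ins for cpumask_next() and
cpumask_first():

#include <stdio.h>

#define NR_CPUS 4

/* stand-in for cpu_online_mask: 1 = online */
static const int cpu_online[NR_CPUS] = { 1, 0, 1, 1 };

/* first online CPU strictly after 'cpu', or NR_CPUS if none */
static int next_online(int cpu)
{
	for (int i = cpu + 1; i < NR_CPUS; i++)
		if (cpu_online[i])
			return i;
	return NR_CPUS;
}

static int first_online(void)
{
	return next_online(-1);
}

int main(void)
{
	int cpu = first_online();

	/* each "timer expiry" moves the check to the next online CPU */
	for (int tick = 0; tick < 6; tick++) {
		printf("tick %d: TSC_ADJUST check on CPU %d\n", tick, cpu);
		cpu = next_online(cpu);
		if (cpu >= NR_CPUS)	/* wrap, as in the patch */
			cpu = first_online();
	}
	return 0;
}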

arch/x86/realmode/init.c (+11 -1)
···
#ifdef CONFIG_X86_64
	u64 *trampoline_pgd;
	u64 efer;
+	int i;
#endif

	base = (unsigned char *)real_mode_header;
···
	trampoline_header->flags = 0;

	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+	/* Map the real mode stub as virtual == physical */
	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
-	trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+	/*
+	 * Include the entirety of the kernel mapping into the trampoline
+	 * PGD. This way, all mappings present in the normal kernel page
+	 * tables are usable while running on trampoline_pgd.
+	 */
+	for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+		trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif

	sme_sev_setup_real_mode(trampoline_header);
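
The loop bounds rely on pgd_index(): with 4-level paging, bits 39 and up
of a virtual address select the PGD entry, and every kernel-space
address falls between pgd_index(__PAGE_OFFSET) and entry 511. A quick
sketch of that arithmetic (4-level paging and the non-KASLR direct-map
base are assumptions here, not taken from the patch):

#include <stdio.h>

#define PGDIR_SHIFT	39	/* 4-level paging: one PGD entry = 512 GiB */
#define PTRS_PER_PGD	512

static unsigned long pgd_index(unsigned long addr)
{
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	/* assumed 4-level, non-KASLR direct-map base */
	unsigned long page_offset = 0xffff888000000000UL;

	/* the patch copies init_top_pgt entries over this whole range */
	printf("copied PGD entries: %lu..%d\n",
	       pgd_index(page_offset), PTRS_PER_PGD - 1);
	return 0;
}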

arch/x86/xen/xen-asm.S (+20)
···

#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>

.pushsection .noinstr.text, "ax"
/*
···
	pushq $0
	jmp hypercall_iret
SYM_CODE_END(xen_iret)
+
+/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+	UNWIND_HINT_REGS
+	POP_REGS
+
+	/* stackleak_erase() can work safely on the kernel stack. */
+	STACKLEAK_ERASE_NOCLOBBER
+
+	addq	$8, %rsp	/* skip regs->orig_ax */
+	jmp	xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)

/*
 * Xen handles syscall callbacks much like ordinary exceptions, which

tools/objtool/elf.c (+1)
···
		return -1;
	}
	memset(sym, 0, sizeof(*sym));
+	INIT_LIST_HEAD(&sym->pv_target);
	sym->alias = sym;

	sym->idx = i;

tools/objtool/objtool.c (+4)
···
	    !strcmp(func->name, "_paravirt_ident_64"))
		return;

+	/* already added this function */
+	if (!list_empty(&func->pv_target))
+		return;
+
	list_add(&func->pv_target, &f->pv_ops[idx].targets);
	f->pv_ops[idx].clean = false;
}
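
The two objtool hunks work together: INIT_LIST_HEAD() in elf.c makes a
fresh symbol's pv_target node point at itself, and the list_empty()
guard here then recognizes "already on a list" before a second
list_add() can corrupt the targets list. A self-contained sketch of that
invariant (a minimal re-implementation of the list_head idiom, not
objtool code):

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;	/* unlinked node points at itself */
}

static bool list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void list_add(struct list_head *node, struct list_head *head)
{
	node->next = head->next;
	node->prev = head;
	head->next->prev = node;
	head->next = node;
}

int main(void)
{
	struct list_head targets, pv_target;

	INIT_LIST_HEAD(&targets);
	INIT_LIST_HEAD(&pv_target);	/* the elf.c fix */

	for (int pass = 0; pass < 2; pass++) {
		if (!list_empty(&pv_target)) {	/* the objtool.c guard */
			printf("pass %d: already added, skipping\n", pass);
			continue;
		}
		list_add(&pv_target, &targets);
		printf("pass %d: added\n", pass);
	}
	return 0;
}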