Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

- Fix a couple of SWAPGS fencing issues in the x86 entry code

- Use the proper operand types in __{get,put}_user() to prevent
  truncation in SEV-ES string I/O (see the sketch after this list)

- Make sure the kernel mappings are present in trampoline_pgd in order
to prevent any potential accesses to unmapped memory after switching
to it

- Fix a trivial list corruption in objtool's pv_ops validation

- Disable the clocksource watchdog for the TSC on platforms which claim
  that the TSC is constant-frequency, does not stop in sleep states, the
  CPU has TSC_ADJUST, and the platform has at most two sockets, to
  prevent erroneous marking of the TSC as unstable

- Make sure TSC_ADJUST is always checked, not only when going idle

- Prevent a stack leak by initializing struct _fpx_sw_bytes properly in
the FPU code

- Fix INTEL_FAM6_RAPTORLAKE define naming to adhere to the convention
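
The string I/O bullet comes down to how __{get,put}_user() work: the
access width is taken from the type of the pointer operand, so casting
the target to char __user * made every store one byte wide and silently
truncated word, dword and qword I/O. A minimal user-space sketch of that
type-driven width (the PUT() macro is a hypothetical analogue, not the
kernel implementation):

#include <stdio.h>

/* Like __put_user(), the store width is sizeof(*(ptr)). */
#define PUT(x, ptr)	(*(ptr) = (x))

int main(void)
{
	unsigned int value = 0x11223344;
	union {
		unsigned char b[4];
		unsigned int w;
	} dst = { { 0 } };

	PUT(value, &dst.b[0]);	/* old bug: char-sized store, only one byte lands */
	printf("char store: %02x %02x %02x %02x\n",
	       dst.b[0], dst.b[1], dst.b[2], dst.b[3]);

	dst.w = 0;
	PUT(value, &dst.w);	/* fixed: pointer type matches the operand size */
	printf("u32  store: %02x %02x %02x %02x\n",
	       dst.b[0], dst.b[1], dst.b[2], dst.b[3]);

	return 0;
}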

* tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/xen: Add xenpv_restore_regs_and_return_to_usermode()
x86/entry: Use the correct fence macro after swapgs in kernel CR3
x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry()
x86/sev: Fix SEV-ES INS/OUTS instructions for word, dword, and qword
x86/64/mm: Map all kernel memory into trampoline_pgd
objtool: Fix pv_ops noinstr validation
x86/tsc: Disable clocksource watchdog for TSC on qualified platforms
x86/tsc: Add a timer to make sure TSC_adjust is always checked
x86/fpu/signal: Initialize sw_bytes in save_xstate_epilog()
x86/cpu: Drop spurious underscore from RAPTOR_LAKE #define

10 files changed: +160 -44

arch/x86/entry/entry_64.S (+18 -19)
···
	ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
	POP_REGS pop_rdi=0

	/*
···
.Lparanoid_entry_checkgs:
	/* EBX = 1 -> kernel GSBASE active, no restore required */
	movl	$1, %ebx
+
	/*
	 * The kernel-enforced convention is a negative GSBASE indicates
	 * a kernel value. No SWAPGS needed on entry and exit.
···
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
-	jns	.Lparanoid_entry_swapgs
-	ret
-
-.Lparanoid_entry_swapgs:
-	swapgs
-
-	/*
-	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
-	 * unconditional CR3 write, even in the PTI case. So do an lfence
-	 * to prevent GS speculation, regardless of whether PTI is enabled.
-	 */
-	FENCE_SWAPGS_KERNEL_ENTRY
+	js	.Lparanoid_kernel_gsbase

	/* EBX = 0 -> SWAPGS required on exit */
	xorl	%ebx, %ebx
+	swapgs
+.Lparanoid_kernel_gsbase:
+
+	FENCE_SWAPGS_KERNEL_ENTRY
	ret
SYM_CODE_END(paranoid_entry)

···
	pushq	%r12
	ret

-.Lerror_entry_done_lfence:
-	FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
-	ret
-
	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
···
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	SWAPGS
-	FENCE_SWAPGS_USER_ENTRY
-	jmp .Lerror_entry_done
+
+	/*
+	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+	 * kernel or user gsbase.
+	 */
+.Lerror_entry_done_lfence:
+	FENCE_SWAPGS_KERNEL_ENTRY
+	ret

.Lbstep_iret:
	/* Fix truncated RIP */

arch/x86/include/asm/intel-family.h (+1 -1)
···
#define INTEL_FAM6_ALDERLAKE		0x97	/* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L		0x9A	/* Golden Cove / Gracemont */

-#define INTEL_FAM6_RAPTOR_LAKE		0xB7
+#define INTEL_FAM6_RAPTORLAKE		0xB7

/* "Small Core" Processors (Atom) */


arch/x86/kernel/fpu/signal.c (+1 -1)
···
			 struct fpstate *fpstate)
{
	struct xregs_state __user *x = buf;
-	struct _fpx_sw_bytes sw_bytes;
+	struct _fpx_sw_bytes sw_bytes = {};
	u32 xfeatures;
	int err;

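The one-line change above matters because save_xstate_epilog() copies
sw_bytes out to user space; any field it never writes would otherwise
carry stale kernel stack bytes. A user-space sketch of the pattern,
using a hypothetical stand-in for struct _fpx_sw_bytes:

#include <stdio.h>

struct sw_bytes_like {			/* hypothetical stand-in */
	unsigned int magic;
	unsigned int extended_size;
	unsigned int reserved[7];	/* never written explicitly */
};

int main(void)
{
	/*
	 * "= {}" zero-fills every member, matching the fix above, so
	 * copying the whole struct out (the kernel does this with
	 * copy_to_user()) cannot leak uninitialized stack contents
	 * through reserved[].
	 */
	struct sw_bytes_like sw = {};

	sw.magic = 0x1;			/* only some fields get set */
	sw.extended_size = sizeof(sw);

	printf("reserved[0] = %u (guaranteed zero)\n", sw.reserved[0]);
	return 0;
}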

arch/x86/kernel/sev.c (+39 -18)
···
				  char *dst, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
-	char __user *target = (char __user *)dst;
-	u64 d8;
-	u32 d4;
-	u16 d2;
-	u8 d1;

	/*
	 * This function uses __put_user() independent of whether kernel or user
···
	 * instructions here would cause infinite nesting.
	 */
	switch (size) {
-	case 1:
+	case 1: {
+		u8 d1;
+		u8 __user *target = (u8 __user *)dst;
+
		memcpy(&d1, buf, 1);
		if (__put_user(d1, target))
			goto fault;
		break;
+	}
-	case 2:
+	case 2: {
+		u16 d2;
+		u16 __user *target = (u16 __user *)dst;
+
		memcpy(&d2, buf, 2);
		if (__put_user(d2, target))
			goto fault;
		break;
+	}
-	case 4:
+	case 4: {
+		u32 d4;
+		u32 __user *target = (u32 __user *)dst;
+
		memcpy(&d4, buf, 4);
		if (__put_user(d4, target))
			goto fault;
		break;
+	}
-	case 8:
+	case 8: {
+		u64 d8;
+		u64 __user *target = (u64 __user *)dst;
+
		memcpy(&d8, buf, 8);
		if (__put_user(d8, target))
			goto fault;
		break;
+	}
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;
···
				 char *src, char *buf, size_t size)
{
	unsigned long error_code = X86_PF_PROT;
-	char __user *s = (char __user *)src;
-	u64 d8;
-	u32 d4;
-	u16 d2;
-	u8 d1;

	/*
	 * This function uses __get_user() independent of whether kernel or user
···
	 * instructions here would cause infinite nesting.
	 */
	switch (size) {
-	case 1:
+	case 1: {
+		u8 d1;
+		u8 __user *s = (u8 __user *)src;
+
		if (__get_user(d1, s))
			goto fault;
		memcpy(buf, &d1, 1);
		break;
+	}
-	case 2:
+	case 2: {
+		u16 d2;
+		u16 __user *s = (u16 __user *)src;
+
		if (__get_user(d2, s))
			goto fault;
		memcpy(buf, &d2, 2);
		break;
+	}
-	case 4:
+	case 4: {
+		u32 d4;
+		u32 __user *s = (u32 __user *)src;
+
		if (__get_user(d4, s))
			goto fault;
		memcpy(buf, &d4, 4);
		break;
+	}
-	case 8:
+	case 8: {
+		u64 d8;
+		u64 __user *s = (u64 __user *)src;
		if (__get_user(d8, s))
			goto fault;
		memcpy(buf, &d8, 8);
		break;
+	}
	default:
		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
		return ES_UNSUPPORTED;

arch/x86/kernel/tsc.c (+24 -4)
···

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

+static void __init tsc_disable_clocksource_watchdog(void)
+{
+	clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+	clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
···
#endif
	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
		tsc_clocksource_reliable = 1;
+
+	/*
+	 * Disable the clocksource watchdog when the system has:
+	 *  - TSC running at constant frequency
+	 *  - TSC which does not stop in C-States
+	 *  - the TSC_ADJUST register which allows to detect even minimal
+	 *    modifications
+	 *  - not more than two sockets. As the number of sockets cannot be
+	 *    evaluated at the early boot stage where this has to be
+	 *    invoked, check the number of online memory nodes as a
+	 *    fallback solution which is an reasonable estimate.
+	 */
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+	    boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+	    boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+	    nr_online_nodes <= 2)
+		tsc_disable_clocksource_watchdog();
}

/*
···
	if (tsc_unstable)
		goto unreg;

-	if (tsc_clocksource_reliable || no_tsc_watchdog)
-		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

···
	}

	if (tsc_clocksource_reliable || no_tsc_watchdog)
-		clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+		tsc_disable_clocksource_watchdog();

	clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
	detect_art();

arch/x86/kernel/tsc_sync.c (+41)
···
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;

/*
 * TSC's on different sockets may be reset asynchronously.
···
		adj->warned = true;
	}
}
+
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL		(HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+	int next_cpu;
+
+	tsc_verify_tsc_adjust(false);
+
+	/* Run the check for all onlined CPUs in turn */
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+
+	tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+	add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+	if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+		return 0;
+
+	timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+	tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+	add_timer(&tsc_sync_check_timer);
+
+	return 0;
+}
+late_initcall(start_sync_check_timer);

static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
				   unsigned int cpu, bool bootcpu)
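
The new timer re-arms itself on a different CPU at each expiry, so every
online CPU eventually runs the TSC_ADJUST check even if none of them
ever goes idle. The CPU selection is simply "next online CPU, wrapping
around"; a user-space sketch of that rotation, where next_online() and
first_online() are hypothetical stand-ins for cpumask_next() and
cpumask_first():

#include <stdio.h>

#define NR_CPUS 4

/* stand-in for cpu_online_mask: 1 = online */
static const int cpu_online[NR_CPUS] = { 1, 0, 1, 1 };

/* first online CPU strictly after 'cpu', or NR_CPUS if none */
static int next_online(int cpu)
{
	for (int i = cpu + 1; i < NR_CPUS; i++)
		if (cpu_online[i])
			return i;
	return NR_CPUS;
}

static int first_online(void)
{
	return next_online(-1);
}

int main(void)
{
	int cpu = first_online();

	/* each "timer expiry" moves the check to the next online CPU */
	for (int tick = 0; tick < 6; tick++) {
		printf("tick %d: TSC_ADJUST check on CPU %d\n", tick, cpu);
		cpu = next_online(cpu);
		if (cpu >= NR_CPUS)	/* wrap, as in the patch */
			cpu = first_online();
	}
	return 0;
}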

arch/x86/realmode/init.c (+11 -1)
···
#ifdef CONFIG_X86_64
	u64 *trampoline_pgd;
	u64 efer;
+	int i;
#endif

	base = (unsigned char *)real_mode_header;
···
	trampoline_header->flags = 0;

	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+	/* Map the real mode stub as virtual == physical */
	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
-	trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+	/*
+	 * Include the entirety of the kernel mapping into the trampoline
+	 * PGD. This way, all mappings present in the normal kernel page
+	 * tables are usable while running on trampoline_pgd.
+	 */
+	for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+		trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif

	sme_sev_setup_real_mode(trampoline_header);
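
The loop bounds rely on pgd_index(): with 4-level paging, bits 39 and up
of a virtual address select the PGD entry, and every kernel-space
address falls between pgd_index(__PAGE_OFFSET) and entry 511. A quick
sketch of that arithmetic (4-level paging and the non-KASLR direct-map
base are assumptions here, not taken from the patch):

#include <stdio.h>

#define PGDIR_SHIFT	39	/* 4-level paging: one PGD entry = 512 GiB */
#define PTRS_PER_PGD	512

static unsigned long pgd_index(unsigned long addr)
{
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	/* assumed 4-level, non-KASLR direct-map base */
	unsigned long page_offset = 0xffff888000000000UL;

	/* the patch copies init_top_pgt entries over this whole range */
	printf("copied PGD entries: %lu..%d\n",
	       pgd_index(page_offset), PTRS_PER_PGD - 1);
	return 0;
}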

arch/x86/xen/xen-asm.S (+20)
···

#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>

.pushsection .noinstr.text, "ax"
/*
···
	pushq $0
	jmp hypercall_iret
SYM_CODE_END(xen_iret)
+
+/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+	UNWIND_HINT_REGS
+	POP_REGS
+
+	/* stackleak_erase() can work safely on the kernel stack. */
+	STACKLEAK_ERASE_NOCLOBBER
+
+	addq	$8, %rsp	/* skip regs->orig_ax */
+	jmp	xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)

/*
 * Xen handles syscall callbacks much like ordinary exceptions, which

tools/objtool/elf.c (+1)
···
		return -1;
	}
	memset(sym, 0, sizeof(*sym));
+	INIT_LIST_HEAD(&sym->pv_target);
	sym->alias = sym;

	sym->idx = i;

tools/objtool/objtool.c (+4)
···
	    !strcmp(func->name, "_paravirt_ident_64"))
		return;

+	/* already added this function */
+	if (!list_empty(&func->pv_target))
+		return;
+
	list_add(&func->pv_target, &f->pv_ops[idx].targets);
	f->pv_ops[idx].clean = false;
}
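
The two objtool hunks work together: INIT_LIST_HEAD() in elf.c makes a
fresh symbol's pv_target node point at itself, and the list_empty()
guard here then recognizes "already on a list" before a second
list_add() can corrupt the targets list. A self-contained sketch of that
invariant (a minimal re-implementation of the list_head idiom, not
objtool code):

#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;	/* unlinked node points at itself */
}

static bool list_empty(const struct list_head *h)
{
	return h->next == h;
}

static void list_add(struct list_head *node, struct list_head *head)
{
	node->next = head->next;
	node->prev = head;
	head->next->prev = node;
	head->next = node;
}

int main(void)
{
	struct list_head targets, pv_target;

	INIT_LIST_HEAD(&targets);
	INIT_LIST_HEAD(&pv_target);	/* the elf.c fix */

	for (int pass = 0; pass < 2; pass++) {
		if (!list_empty(&pv_target)) {	/* the objtool.c guard */
			printf("pass %d: already added, skipping\n", pass);
			continue;
		}
		list_add(&pv_target, &targets);
		printf("pass %d: added\n", pass);
	}
	return 0;
}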