Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86_urgent_for_v6.8_rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:

- Make sure clearing CPU buffers using VERW happens at the latest
possible point in the return-to-userspace path, otherwise memory
accesses after the VERW execution could cause data to land in CPU
buffers again

* tag 'x86_urgent_for_v6.8_rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
KVM/VMX: Move VERW closer to VMentry for MDS mitigation
KVM/VMX: Use BT+JNC, i.e. EFLAGS.CF to select VMRESUME vs. VMLAUNCH
x86/bugs: Use ALTERNATIVE() instead of mds_user_clear static key
x86/entry_32: Add VERW just before userspace transition
x86/entry_64: Add VERW just before userspace transition
x86/bugs: Add asm helpers for executing VERW

+110 -44
+25 -9
Documentation/arch/x86/mds.rst
··· 95 95 96 96 mds_clear_cpu_buffers() 97 97 98 + Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path. 99 + Other than EFLAGS.ZF, this macro doesn't clobber any registers. 100 + 98 101 The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state 99 102 (idle) transitions. 100 103 ··· 141 138 142 139 When transitioning from kernel to user space the CPU buffers are flushed 143 140 on affected CPUs when the mitigation is not disabled on the kernel 144 - command line. The migitation is enabled through the static key 145 - mds_user_clear. 141 + command line. The mitigation is enabled through the feature flag 142 + X86_FEATURE_CLEAR_CPU_BUF. 146 143 147 - The mitigation is invoked in prepare_exit_to_usermode() which covers 148 - all but one of the kernel to user space transitions. The exception 149 - is when we return from a Non Maskable Interrupt (NMI), which is 150 - handled directly in do_nmi(). 144 + The mitigation is invoked just before transitioning to userspace after 145 + user registers are restored. This is done to minimize the window in 146 + which kernel data could be accessed after VERW e.g. via an NMI after 147 + VERW. 151 148 152 - (The reason that NMI is special is that prepare_exit_to_usermode() can 153 - enable IRQs. In NMI context, NMIs are blocked, and we don't want to 154 - enable IRQs with NMIs blocked.) 149 + **Corner case not handled** 150 + Interrupts returning to kernel don't clear CPU buffers since the 151 + exit-to-user path is expected to do that anyways. But, there could be 152 + a case when an NMI is generated in kernel after the exit-to-user path 153 + has cleared the buffers. This case is not handled and NMIs returning to 154 + kernel don't clear CPU buffers because: 155 + 156 + 1. It is rare to get an NMI after VERW, but before returning to userspace. 157 + 2. For an unprivileged user, there is no known way to make that NMI 158 + less rare or target it. 159 + 3. 
It would take a large number of these precisely-timed NMIs to mount 160 + an actual attack. There's presumably not enough bandwidth. 161 + 4. The NMI in question occurs after a VERW, i.e. when user state is 162 + restored and most interesting data is already scrubbed. What's left 163 + is only the data that NMI touches, and that may or may not be of 164 + any interest. 155 165 156 166 157 167 2. C-State transition
+23
arch/x86/entry/entry.S
··· 6 6 #include <linux/export.h> 7 7 #include <linux/linkage.h> 8 8 #include <asm/msr-index.h> 9 + #include <asm/unwind_hints.h> 10 + #include <asm/segment.h> 11 + #include <asm/cache.h> 9 12 10 13 .pushsection .noinstr.text, "ax" 11 14 ··· 23 20 EXPORT_SYMBOL_GPL(entry_ibpb); 24 21 25 22 .popsection 23 + 24 + /* 25 + * Define the VERW operand that is disguised as entry code so that 26 + * it can be referenced with KPTI enabled. This ensures VERW can be 27 + * used late in exit-to-user path after page tables are switched. 28 + */ 29 + .pushsection .entry.text, "ax" 30 + 31 + .align L1_CACHE_BYTES, 0xcc 32 + SYM_CODE_START_NOALIGN(mds_verw_sel) 33 + UNWIND_HINT_UNDEFINED 34 + ANNOTATE_NOENDBR 35 + .word __KERNEL_DS 36 + .align L1_CACHE_BYTES, 0xcc 37 + SYM_CODE_END(mds_verw_sel); 38 + /* For KVM */ 39 + EXPORT_SYMBOL_GPL(mds_verw_sel); 40 + 41 + .popsection 42 +
+3
arch/x86/entry/entry_32.S
··· 885 885 BUG_IF_WRONG_CR3 no_user_check=1 886 886 popfl 887 887 popl %eax 888 + CLEAR_CPU_BUFFERS 888 889 889 890 /* 890 891 * Return back to the vDSO, which will pop ecx and edx. ··· 955 954 956 955 /* Restore user state */ 957 956 RESTORE_REGS pop=4 # skip orig_eax/error_code 957 + CLEAR_CPU_BUFFERS 958 958 .Lirq_return: 959 959 /* 960 960 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization ··· 1148 1146 1149 1147 /* Not on SYSENTER stack. */ 1150 1148 call exc_nmi 1149 + CLEAR_CPU_BUFFERS 1151 1150 jmp .Lnmi_return 1152 1151 1153 1152 .Lnmi_from_sysenter_stack:
+11
arch/x86/entry/entry_64.S
··· 161 161 SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) 162 162 ANNOTATE_NOENDBR 163 163 swapgs 164 + CLEAR_CPU_BUFFERS 164 165 sysretq 165 166 SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) 166 167 ANNOTATE_NOENDBR ··· 574 573 575 574 .Lswapgs_and_iret: 576 575 swapgs 576 + CLEAR_CPU_BUFFERS 577 577 /* Assert that the IRET frame indicates user mode. */ 578 578 testb $3, 8(%rsp) 579 579 jnz .Lnative_iret ··· 724 722 * still read. 725 723 */ 726 724 popq %rax /* Restore user RAX */ 725 + 726 + CLEAR_CPU_BUFFERS 727 727 728 728 /* 729 729 * RSP now points to an ordinary IRET frame, except that the page ··· 1454 1450 movq $0, 5*8(%rsp) /* clear "NMI executing" */ 1455 1451 1456 1452 /* 1453 + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like 1454 + * NMI in kernel after user state is restored. For an unprivileged user 1455 + * these conditions are hard to meet. 1456 + */ 1457 + 1458 + /* 1457 1459 * iretq reads the "iret" frame and exits the NMI stack in a 1458 1460 * single instruction. We are returning to kernel mode, so this 1459 1461 * cannot result in a fault. Similarly, we don't need to worry ··· 1476 1466 UNWIND_HINT_END_OF_STACK 1477 1467 ENDBR 1478 1468 mov $-ENOSYS, %eax 1469 + CLEAR_CPU_BUFFERS 1479 1470 sysretl 1480 1471 SYM_CODE_END(entry_SYSCALL32_ignore) 1481 1472
+1
arch/x86/entry/entry_64_compat.S
··· 270 270 xorl %r9d, %r9d 271 271 xorl %r10d, %r10d 272 272 swapgs 273 + CLEAR_CPU_BUFFERS 273 274 sysretl 274 275 SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) 275 276 ANNOTATE_NOENDBR
+1 -1
arch/x86/include/asm/cpufeatures.h
··· 95 95 #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ 96 96 #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ 97 97 #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ 98 - /* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ 98 + #define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ 99 99 #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ 100 100 #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ 101 101 #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
-1
arch/x86/include/asm/entry-common.h
··· 91 91 92 92 static __always_inline void arch_exit_to_user_mode(void) 93 93 { 94 - mds_user_clear_cpu_buffers(); 95 94 amd_clear_divider(); 96 95 } 97 96 #define arch_exit_to_user_mode arch_exit_to_user_mode
+13 -12
arch/x86/include/asm/nospec-branch.h
··· 315 315 #endif 316 316 .endm 317 317 318 + /* 319 + * Macro to execute VERW instruction that mitigates transient data sampling 320 + * attacks such as MDS. On affected systems a microcode update overloaded VERW 321 + * instruction to also clear the CPU buffers. VERW clobbers EFLAGS.ZF. 322 + * 323 + * Note: Only the memory operand variant of VERW clears the CPU buffers. 324 + */ 325 + .macro CLEAR_CPU_BUFFERS 326 + ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF 327 + .endm 328 + 318 329 #else /* __ASSEMBLY__ */ 319 330 320 331 #define ANNOTATE_RETPOLINE_SAFE \ ··· 540 529 DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); 541 530 DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); 542 531 543 - DECLARE_STATIC_KEY_FALSE(mds_user_clear); 544 532 DECLARE_STATIC_KEY_FALSE(mds_idle_clear); 545 533 546 534 DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); 547 535 548 536 DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); 537 + 538 + extern u16 mds_verw_sel; 549 539 550 540 #include <asm/segment.h> 551 541 ··· 571 559 * "cc" clobber is required because VERW modifies ZF. 572 560 */ 573 561 asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); 574 - } 575 - 576 - /** 577 - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability 578 - * 579 - * Clear CPU buffers if the corresponding static key is enabled 580 - */ 581 - static __always_inline void mds_user_clear_cpu_buffers(void) 582 - { 583 - if (static_branch_likely(&mds_user_clear)) 584 - mds_clear_cpu_buffers(); 585 562 } 586 563 587 564 /**
+6 -9
arch/x86/kernel/cpu/bugs.c
··· 111 111 /* Control unconditional IBPB in switch_mm() */ 112 112 DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); 113 113 114 - /* Control MDS CPU buffer clear before returning to user space */ 115 - DEFINE_STATIC_KEY_FALSE(mds_user_clear); 116 - EXPORT_SYMBOL_GPL(mds_user_clear); 117 114 /* Control MDS CPU buffer clear before idling (halt, mwait) */ 118 115 DEFINE_STATIC_KEY_FALSE(mds_idle_clear); 119 116 EXPORT_SYMBOL_GPL(mds_idle_clear); ··· 249 252 if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) 250 253 mds_mitigation = MDS_MITIGATION_VMWERV; 251 254 252 - static_branch_enable(&mds_user_clear); 255 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 253 256 254 257 if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && 255 258 (mds_nosmt || cpu_mitigations_auto_nosmt())) ··· 353 356 * For guests that can't determine whether the correct microcode is 354 357 * present on host, enable the mitigation for UCODE_NEEDED as well. 355 358 */ 356 - static_branch_enable(&mds_user_clear); 359 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 357 360 358 361 if (taa_nosmt || cpu_mitigations_auto_nosmt()) 359 362 cpu_smt_disable(false); ··· 421 424 */ 422 425 if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && 423 426 boot_cpu_has(X86_FEATURE_RTM))) 424 - static_branch_enable(&mds_user_clear); 427 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 425 428 else 426 429 static_branch_enable(&mmio_stale_data_clear); 427 430 ··· 481 484 if (cpu_mitigations_off()) 482 485 return; 483 486 484 - if (!static_key_enabled(&mds_user_clear)) 487 + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) 485 488 goto out; 486 489 487 490 /* 488 - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data 489 - * mitigation, if necessary. 491 + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO 492 + * Stale Data mitigation, if necessary. 490 493 */ 491 494 if (mds_mitigation == MDS_MITIGATION_OFF && 492 495 boot_cpu_has_bug(X86_BUG_MDS)) {
-3
arch/x86/kernel/nmi.c
··· 563 563 } 564 564 if (this_cpu_dec_return(nmi_state)) 565 565 goto nmi_restart; 566 - 567 - if (user_mode(regs)) 568 - mds_user_clear_cpu_buffers(); 569 566 } 570 567 571 568 #if IS_ENABLED(CONFIG_KVM_INTEL)
+5 -2
arch/x86/kvm/vmx/run_flags.h
··· 2 2 #ifndef __KVM_X86_VMX_RUN_FLAGS_H 3 3 #define __KVM_X86_VMX_RUN_FLAGS_H 4 4 5 - #define VMX_RUN_VMRESUME (1 << 0) 6 - #define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) 5 + #define VMX_RUN_VMRESUME_SHIFT 0 6 + #define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 7 + 8 + #define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) 9 + #define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) 7 10 8 11 #endif /* __KVM_X86_VMX_RUN_FLAGS_H */
+6 -3
arch/x86/kvm/vmx/vmenter.S
··· 139 139 mov (%_ASM_SP), %_ASM_AX 140 140 141 141 /* Check if vmlaunch or vmresume is needed */ 142 - test $VMX_RUN_VMRESUME, %ebx 142 + bt $VMX_RUN_VMRESUME_SHIFT, %ebx 143 143 144 144 /* Load guest registers. Don't clobber flags. */ 145 145 mov VCPU_RCX(%_ASM_AX), %_ASM_CX ··· 161 161 /* Load guest RAX. This kills the @regs pointer! */ 162 162 mov VCPU_RAX(%_ASM_AX), %_ASM_AX 163 163 164 - /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ 165 - jz .Lvmlaunch 164 + /* Clobbers EFLAGS.ZF */ 165 + CLEAR_CPU_BUFFERS 166 + 167 + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ 168 + jnc .Lvmlaunch 166 169 167 170 /* 168 171 * After a successful VMRESUME/VMLAUNCH, control flow "magically"
+16 -4
arch/x86/kvm/vmx/vmx.c
··· 388 388 389 389 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 390 390 { 391 - vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 391 + /* 392 + * Disable VERW's behavior of clearing CPU buffers for the guest if the 393 + * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 394 + * the mitigation. Disabling the clearing behavior provides a 395 + * performance boost for guests that aren't aware that manually clearing 396 + * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 397 + * and VM-Exit. 398 + */ 399 + vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 400 + (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 392 401 !boot_cpu_has_bug(X86_BUG_MDS) && 393 402 !boot_cpu_has_bug(X86_BUG_TAA); 394 403 ··· 7233 7224 7234 7225 guest_state_enter_irqoff(); 7235 7226 7236 - /* L1D Flush includes CPU buffer clear to mitigate MDS */ 7227 + /* 7228 + * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7229 + * mitigation for MDS is done late in VMentry and is still 7230 + * executed in spite of L1D Flush. This is because an extra VERW 7231 + * should not matter much after the big hammer L1D Flush. 7232 + */ 7237 7233 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7238 7234 vmx_l1d_flush(vcpu); 7239 - else if (static_branch_unlikely(&mds_user_clear)) 7240 - mds_clear_cpu_buffers(); 7241 7235 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7242 7236 kvm_arch_has_assigned_device(vcpu->kvm)) 7243 7237 mds_clear_cpu_buffers();