Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-misc-6.19' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.19:

- Fix an async #PF bug where KVM would clear the completion queue when the
guest transitioned in and out of paging mode, e.g. when handling an SMI and
then returning to paged mode via RSM.

- Fix a bug where TDX would effectively corrupt user-return MSR values if the
TDX Module rejects VP.ENTER and thus doesn't clobber host MSRs as expected.

- Leave the user-return notifier used to restore MSRs registered when
disabling virtualization, and instead pin kvm.ko. Restoring host MSRs via
IPI callback is either pointless (clean reboot) or dangerous (forced reboot)
since KVM has no idea what code it's interrupting.

- Use the checked versions of {get,put}_user() instead of the unchecked
__{get,put}_user() variants, as Linus wants to kill the unchecked versions
off, and the checked versions are measurably faster on modern CPUs due to
the unchecked versions containing an LFENCE.

- Fix a long-lurking bug where KVM's lack of catch-up logic for periodic APIC
timers can result in a hard lockup in the host.

- Revert the periodic kvmclock sync logic now that KVM doesn't use a
clocksource that's subject to NTP corrections.

- Clean up KVM's handling of MMIO Stale Data and L1TF, and bury the latter
behind CONFIG_CPU_MITIGATIONS.

- Context switch XCR0, XSS, and PKRU outside of the entry/exit fastpath as
the only reason they were handled in the fastpath was to paper over a bug in
the core #MC code that has long since been fixed.

- Add emulator support for AVX MOV instructions to play nice with emulated
devices whose PCI BARs guest drivers like to access with large multi-byte
instructions.

+702 -496
+5
arch/x86/include/asm/cpufeatures.h
··· 499 499 #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ 500 500 #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ 501 501 #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ 502 + #define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+17) /* 503 + * Clear CPU buffers before VM-Enter if the vCPU 504 + * can access host MMIO (ignored for all intents 505 + * and purposes if CLEAR_CPU_BUF_VM is set). 506 + */ 502 507 503 508 /* 504 509 * BUG word(s)
+2 -2
arch/x86/include/asm/hardirq.h
··· 5 5 #include <linux/threads.h> 6 6 7 7 typedef struct { 8 - #if IS_ENABLED(CONFIG_KVM_INTEL) 8 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 9 9 u8 kvm_cpu_l1tf_flush_l1d; 10 10 #endif 11 11 unsigned int __nmi_count; /* arch dependent */ ··· 68 68 DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); 69 69 #define local_softirq_pending_ref __softirq_pending 70 70 71 - #if IS_ENABLED(CONFIG_KVM_INTEL) 71 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 72 72 /* 73 73 * This function is called from noinstr interrupt contexts 74 74 * and must be inlined to not get instrumentation.
+1 -6
arch/x86/include/asm/kvm_host.h
··· 1055 1055 /* be preempted when it's in kernel-mode(cpl=0) */ 1056 1056 bool preempted_in_kernel; 1057 1057 1058 - /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ 1059 - bool l1tf_flush_l1d; 1060 - 1061 1058 /* Host CPU on which VM-entry was most recently attempted */ 1062 1059 int last_vmentry_cpu; 1063 1060 ··· 1453 1456 bool use_master_clock; 1454 1457 u64 master_kernel_ns; 1455 1458 u64 master_cycle_now; 1456 - struct delayed_work kvmclock_update_work; 1457 - struct delayed_work kvmclock_sync_work; 1458 1459 1459 1460 #ifdef CONFIG_KVM_HYPERV 1460 1461 struct kvm_hv hyperv; ··· 2162 2167 void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); 2163 2168 2164 2169 void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); 2170 + void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); 2165 2171 2166 2172 void kvm_enable_efer_bits(u64); 2167 2173 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); ··· 2374 2378 int kvm_add_user_return_msr(u32 msr); 2375 2379 int kvm_find_user_return_msr(u32 msr); 2376 2380 int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); 2377 - void kvm_user_return_msr_update_cache(unsigned int index, u64 val); 2378 2381 u64 kvm_get_user_return_msr(unsigned int slot); 2379 2382 2380 2383 static inline bool kvm_is_supported_user_return_msr(u32 msr)
+15 -15
arch/x86/include/asm/nospec-branch.h
··· 308 308 * CFLAGS.ZF. 309 309 * Note: Only the memory operand variant of VERW clears the CPU buffers. 310 310 */ 311 - .macro __CLEAR_CPU_BUFFERS feature 312 311 #ifdef CONFIG_X86_64 313 - ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature 312 + #define VERW verw x86_verw_sel(%rip) 314 313 #else 315 - /* 316 - * In 32bit mode, the memory operand must be a %cs reference. The data 317 - * segments may not be usable (vm86 mode), and the stack segment may not 318 - * be flat (ESPFIX32). 319 - */ 320 - ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature 314 + /* 315 + * In 32bit mode, the memory operand must be a %cs reference. The data segments 316 + * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32). 317 + */ 318 + #define VERW verw %cs:x86_verw_sel 321 319 #endif 322 - .endm 323 320 321 + /* 322 + * Provide a stringified VERW macro for simple usage, and a non-stringified 323 + * VERW macro for use in more elaborate sequences, e.g. to encode a conditional 324 + * VERW within an ALTERNATIVE. 325 + */ 326 + #define __CLEAR_CPU_BUFFERS __stringify(VERW) 327 + 328 + /* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. */ 324 329 #define CLEAR_CPU_BUFFERS \ 325 - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF 326 - 327 - #define VM_CLEAR_CPU_BUFFERS \ 328 - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM 330 + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF 329 331 330 332 #ifdef CONFIG_X86_64 331 333 .macro CLEAR_BRANCH_HISTORY ··· 581 579 DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear); 582 580 583 581 DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); 584 - 585 - DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); 586 582 587 583 extern u16 x86_verw_sel; 588 584
+9 -13
arch/x86/kernel/cpu/bugs.c
··· 192 192 */ 193 193 DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); 194 194 195 - /* 196 - * Controls CPU Fill buffer clear before VMenter. This is a subset of 197 - * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only 198 - * mitigation is required. 199 - */ 200 - DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); 201 - EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); 202 - 203 195 #undef pr_fmt 204 196 #define pr_fmt(fmt) "mitigations: " fmt 205 197 ··· 481 489 IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; 482 490 483 491 /* 484 - * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing 485 - * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. 492 + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to 493 + * userspace *and* on entry to KVM guests. 486 494 */ 487 495 static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; 488 496 ··· 528 536 if (mds_mitigation == MDS_MITIGATION_FULL || 529 537 mds_mitigation == MDS_MITIGATION_VMWERV) { 530 538 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 539 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 531 540 if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && 532 541 (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) 533 542 cpu_smt_disable(false); ··· 640 647 * present on host, enable the mitigation for UCODE_NEEDED as well. 
641 648 */ 642 649 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 650 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 643 651 644 652 if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) 645 653 cpu_smt_disable(false); ··· 742 748 */ 743 749 if (verw_clear_cpu_buf_mitigation_selected) { 744 750 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 745 - static_branch_disable(&cpu_buf_vm_clear); 751 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 746 752 } else { 747 - static_branch_enable(&cpu_buf_vm_clear); 753 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO); 748 754 } 749 755 750 756 /* ··· 833 839 834 840 static void __init rfds_apply_mitigation(void) 835 841 { 836 - if (rfds_mitigation == RFDS_MITIGATION_VERW) 842 + if (rfds_mitigation == RFDS_MITIGATION_VERW) { 837 843 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 844 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 845 + } 838 846 } 839 847 840 848 static __init int rfds_parse_cmdline(char *str)
+223 -96
arch/x86/kvm/emulate.c
··· 81 81 */ 82 82 83 83 /* Operand sizes: 8-bit operands or specified/overridden size. */ 84 - #define ByteOp (1<<0) /* 8-bit operands. */ 85 - /* Destination operand type. */ 86 - #define DstShift 1 84 + #define ByteOp (1<<0) /* 8-bit operands. */ 85 + #define DstShift 1 /* Destination operand type at bits 1-5 */ 87 86 #define ImplicitOps (OpImplicit << DstShift) 88 87 #define DstReg (OpReg << DstShift) 89 88 #define DstMem (OpMem << DstShift) ··· 94 95 #define DstDX (OpDX << DstShift) 95 96 #define DstAccLo (OpAccLo << DstShift) 96 97 #define DstMask (OpMask << DstShift) 97 - /* Source operand type. */ 98 - #define SrcShift 6 98 + #define SrcShift 6 /* Source operand type at bits 6-10 */ 99 99 #define SrcNone (OpNone << SrcShift) 100 100 #define SrcReg (OpReg << SrcShift) 101 101 #define SrcMem (OpMem << SrcShift) ··· 117 119 #define SrcAccHi (OpAccHi << SrcShift) 118 120 #define SrcMask (OpMask << SrcShift) 119 121 #define BitOp (1<<11) 120 - #define MemAbs (1<<12) /* Memory operand is absolute displacement */ 122 + #define MemAbs (1<<12) /* Memory operand is absolute displacement */ 121 123 #define String (1<<13) /* String instruction (rep capable) */ 122 124 #define Stack (1<<14) /* Stack instruction (push/pop) */ 123 - #define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ 125 + #define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */ 124 126 #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ 125 127 #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ 126 128 #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ ··· 129 131 #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ 130 132 #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ 131 133 #define Sse (1<<18) /* SSE Vector instruction */ 132 - /* Generic ModRM decode. */ 133 - #define ModRM (1<<19) 134 - /* Destination is only written; never read. 
*/ 135 - #define Mov (1<<20) 136 - /* Misc flags */ 134 + #define ModRM (1<<19) /* Generic ModRM decode. */ 135 + #define Mov (1<<20) /* Destination is only written; never read. */ 137 136 #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ 138 137 #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ 139 138 #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ ··· 138 143 #define Undefined (1<<25) /* No Such Instruction */ 139 144 #define Lock (1<<26) /* lock prefix is allowed for the instruction */ 140 145 #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 141 - #define No64 (1<<28) 146 + #define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */ 142 147 #define PageTable (1 << 29) /* instruction used to write page table */ 143 148 #define NotImpl (1 << 30) /* instruction is not implemented */ 144 - /* Source 2 operand type */ 145 - #define Src2Shift (31) 149 + #define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */ 150 + #define Src2Shift (32) /* Source 2 operand type at bits 32-36 */ 146 151 #define Src2None (OpNone << Src2Shift) 147 152 #define Src2Mem (OpMem << Src2Shift) 148 153 #define Src2CL (OpCL << Src2Shift) ··· 156 161 #define Src2FS (OpFS << Src2Shift) 157 162 #define Src2GS (OpGS << Src2Shift) 158 163 #define Src2Mask (OpMask << Src2Shift) 164 + /* free: 37-39 */ 159 165 #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ 160 - #define AlignMask ((u64)7 << 41) 166 + #define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */ 161 167 #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ 162 168 #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ 163 - #define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ 164 - #define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ 169 + #define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. 
FXSAVE) */ 170 + /* free: 43-44 */ 165 171 #define NoWrite ((u64)1 << 45) /* No writeback */ 166 172 #define SrcWrite ((u64)1 << 46) /* Write back src operand */ 167 173 #define NoMod ((u64)1 << 47) /* Mod field is ignored */ ··· 237 241 X86_TRANSFER_CALL_JMP, 238 242 X86_TRANSFER_RET, 239 243 X86_TRANSFER_TASK_SWITCH, 244 + }; 245 + 246 + enum rex_bits { 247 + REX_B = 1, 248 + REX_X = 2, 249 + REX_R = 4, 250 + REX_W = 8, 240 251 }; 241 252 242 253 static void writeback_registers(struct x86_emulate_ctxt *ctxt) ··· 625 622 626 623 switch (alignment) { 627 624 case Unaligned: 628 - case Avx: 629 625 return 1; 630 626 case Aligned16: 631 627 return 16; ··· 926 924 int byteop) 927 925 { 928 926 void *p; 929 - int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; 927 + int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; 930 928 931 929 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) 932 930 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; ··· 1032 1030 op->val = *(u64 *)op->addr.reg; 1033 1031 break; 1034 1032 } 1033 + op->orig_val = op->val; 1035 1034 } 1036 1035 1037 1036 static int em_fninit(struct x86_emulate_ctxt *ctxt) ··· 1078 1075 return X86EMUL_CONTINUE; 1079 1076 } 1080 1077 1081 - static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1082 - struct operand *op) 1078 + static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, 1079 + struct operand *op, int reg) 1083 1080 { 1084 - unsigned int reg; 1085 - 1086 - if (ctxt->d & ModRM) 1087 - reg = ctxt->modrm_reg; 1088 - else 1089 - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); 1090 - 1091 - if (ctxt->d & Sse) { 1081 + if ((ctxt->d & Avx) && ctxt->op_bytes == 32) { 1082 + op->type = OP_YMM; 1083 + op->bytes = 32; 1084 + op->addr.xmm = reg; 1085 + kvm_read_avx_reg(reg, &op->vec_val2); 1086 + return; 1087 + } 1088 + if (ctxt->d & (Avx|Sse)) { 1092 1089 op->type = OP_XMM; 1093 1090 op->bytes = 16; 1094 1091 op->addr.xmm = reg; ··· 1106 1103 op->type = OP_REG; 1107 
1104 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1108 1105 op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); 1109 - 1110 1106 fetch_register_operand(op); 1111 - op->orig_val = op->val; 1107 + } 1108 + 1109 + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1110 + struct operand *op) 1111 + { 1112 + unsigned int reg; 1113 + 1114 + if (ctxt->d & ModRM) 1115 + reg = ctxt->modrm_reg; 1116 + else 1117 + reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0); 1118 + 1119 + __decode_register_operand(ctxt, op, reg); 1112 1120 } 1113 1121 1114 1122 static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) ··· 1136 1122 int rc = X86EMUL_CONTINUE; 1137 1123 ulong modrm_ea = 0; 1138 1124 1139 - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ 1140 - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ 1141 - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ 1125 + ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0); 1126 + index_reg = (ctxt->rex_bits & REX_X ? 8 : 0); 1127 + base_reg = (ctxt->rex_bits & REX_B ? 8 : 0); 1142 1128 1143 1129 ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; 1144 1130 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ··· 1146 1132 ctxt->modrm_seg = VCPU_SREG_DS; 1147 1133 1148 1134 if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { 1149 - op->type = OP_REG; 1150 - op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; 1151 - op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1152 - ctxt->d & ByteOp); 1153 - if (ctxt->d & Sse) { 1154 - op->type = OP_XMM; 1155 - op->bytes = 16; 1156 - op->addr.xmm = ctxt->modrm_rm; 1157 - kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); 1158 - return rc; 1159 - } 1160 - if (ctxt->d & Mmx) { 1161 - op->type = OP_MM; 1162 - op->bytes = 8; 1163 - op->addr.mm = ctxt->modrm_rm & 7; 1164 - return rc; 1165 - } 1166 - fetch_register_operand(op); 1135 + __decode_register_operand(ctxt, op, ctxt->modrm_rm); 1167 1136 return rc; 1168 1137 } 1169 1138 ··· 1780 1783 op->data, 1781 1784 op->bytes * op->count); 1782 1785 case OP_XMM: 1783 - kvm_write_sse_reg(op->addr.xmm, &op->vec_val); 1786 + if (!(ctxt->d & Avx)) { 1787 + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); 1788 + break; 1789 + } 1790 + /* full YMM write but with high bytes cleared */ 1791 + memset(op->valptr + 16, 0, 16); 1792 + fallthrough; 1793 + case OP_YMM: 1794 + kvm_write_avx_reg(op->addr.xmm, &op->vec_val2); 1784 1795 break; 1785 1796 case OP_MM: 1786 1797 kvm_write_mmx_reg(op->addr.mm, &op->mm_val); ··· 2471 2466 2472 2467 setup_syscalls_segments(&cs, &ss); 2473 2468 2474 - if ((ctxt->rex_prefix & 0x8) != 0x0) 2469 + if (ctxt->rex_bits & REX_W) 2475 2470 usermode = X86EMUL_MODE_PROT64; 2476 2471 else 2477 2472 usermode = X86EMUL_MODE_PROT32; ··· 3963 3958 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3964 3959 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3965 3960 3961 + static const struct opcode ud = I(SrcNone, emulate_ud); 3962 + 3966 3963 static const struct opcode group7_rm0[] = { 3967 3964 N, 3968 3965 I(SrcNone | Priv | EmulateOnUD, em_hypercall), ··· 4121 4114 } }; 4122 4115 4123 4116 static const struct gprefix pfx_0f_6f_0f_7f = { 4124 - I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 4117 + I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov), 4125 4118 }; 4126 4119 4127 4120 static const struct 
instr_dual instr_dual_0f_2b = { ··· 4140 4133 I(Aligned, em_mov), I(Aligned, em_mov), N, N, 4141 4134 }; 4142 4135 4143 - static const struct gprefix pfx_0f_e7 = { 4144 - N, I(Sse, em_mov), N, N, 4136 + static const struct gprefix pfx_0f_e7_0f_38_2a = { 4137 + N, I(Sse | Avx, em_mov), N, N, 4145 4138 }; 4146 4139 4147 4140 static const struct escape escape_d9 = { { ··· 4354 4347 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 4355 4348 N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, 4356 4349 /* 0x10 - 0x1F */ 4357 - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), 4358 - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), 4350 + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11), 4351 + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11), 4359 4352 N, N, N, N, N, N, 4360 4353 D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ 4361 4354 D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, ··· 4371 4364 IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, 4372 4365 check_dr_write), 4373 4366 N, N, N, N, 4374 - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), 4375 - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), 4376 - N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), 4367 + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29), 4368 + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29), 4369 + N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b), 4377 4370 N, N, N, N, 4378 4371 /* 0x30 - 0x3F */ 4379 4372 II(ImplicitOps | Priv, em_wrmsr, wrmsr), ··· 4438 4431 /* 0xD0 - 0xDF */ 4439 4432 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 4440 4433 /* 0xE0 - 0xEF */ 4441 - N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), 4434 + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a), 4442 4435 N, N, N, N, N, N, N, N, 4443 4436 /* 0xF0 - 0xFF */ 4444 4437 N, N, N, N, N, 
N, N, N, N, N, N, N, N, N, N, N ··· 4465 4458 * byte. 4466 4459 */ 4467 4460 static const struct opcode opcode_map_0f_38[256] = { 4468 - /* 0x00 - 0x7f */ 4469 - X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), 4461 + /* 0x00 - 0x1f */ 4462 + X16(N), X16(N), 4463 + /* 0x20 - 0x2f */ 4464 + X8(N), 4465 + X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, 4466 + /* 0x30 - 0x7f */ 4467 + X16(N), X16(N), X16(N), X16(N), X16(N), 4470 4468 /* 0x80 - 0xef */ 4471 4469 X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), 4472 4470 /* 0xf0 - 0xf1 */ ··· 4630 4618 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4631 4619 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); 4632 4620 fetch_register_operand(op); 4633 - op->orig_val = op->val; 4634 4621 break; 4635 4622 case OpAccLo: 4636 4623 op->type = OP_REG; 4637 4624 op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; 4638 4625 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); 4639 4626 fetch_register_operand(op); 4640 - op->orig_val = op->val; 4641 4627 break; 4642 4628 case OpAccHi: 4643 4629 if (ctxt->d & ByteOp) { ··· 4646 4636 op->bytes = ctxt->op_bytes; 4647 4637 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); 4648 4638 fetch_register_operand(op); 4649 - op->orig_val = op->val; 4650 4639 break; 4651 4640 case OpDI: 4652 4641 op->type = OP_MEM; ··· 4764 4755 return rc; 4765 4756 } 4766 4757 4758 + static int x86_decode_avx(struct x86_emulate_ctxt *ctxt, 4759 + u8 vex_1st, u8 vex_2nd, struct opcode *opcode) 4760 + { 4761 + u8 vex_3rd, map, pp, l, v; 4762 + int rc = X86EMUL_CONTINUE; 4763 + 4764 + if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix) 4765 + goto ud; 4766 + 4767 + if (vex_1st == 0xc5) { 4768 + /* Expand RVVVVlpp to VEX3 format */ 4769 + vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */ 4770 + vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */ 4771 + } else { 4772 + vex_3rd = insn_fetch(u8, ctxt); 4773 + } 4774 + 4775 + /* 
vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */ 4776 + vex_2nd ^= 0xE0; /* binary 11100000 */ 4777 + vex_3rd ^= 0x78; /* binary 01111000 */ 4778 + 4779 + ctxt->rex_prefix = REX_PREFIX; 4780 + ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */ 4781 + ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */ 4782 + if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64) 4783 + goto ud; 4784 + 4785 + map = vex_2nd & 0x1f; 4786 + v = (vex_3rd >> 3) & 0xf; 4787 + l = vex_3rd & 0x4; 4788 + pp = vex_3rd & 0x3; 4789 + 4790 + ctxt->b = insn_fetch(u8, ctxt); 4791 + switch (map) { 4792 + case 1: 4793 + ctxt->opcode_len = 2; 4794 + *opcode = twobyte_table[ctxt->b]; 4795 + break; 4796 + case 2: 4797 + ctxt->opcode_len = 3; 4798 + *opcode = opcode_map_0f_38[ctxt->b]; 4799 + break; 4800 + case 3: 4801 + /* no 0f 3a instructions are supported yet */ 4802 + return X86EMUL_UNHANDLEABLE; 4803 + default: 4804 + goto ud; 4805 + } 4806 + 4807 + /* 4808 + * No three operand instructions are supported yet; those that 4809 + * *are* marked with the Avx flag reserve the VVVV flag. 
4810 + */ 4811 + if (v) 4812 + goto ud; 4813 + 4814 + if (l) 4815 + ctxt->op_bytes = 32; 4816 + else 4817 + ctxt->op_bytes = 16; 4818 + 4819 + switch (pp) { 4820 + case 0: break; 4821 + case 1: ctxt->op_prefix = true; break; 4822 + case 2: ctxt->rep_prefix = 0xf3; break; 4823 + case 3: ctxt->rep_prefix = 0xf2; break; 4824 + } 4825 + 4826 + done: 4827 + return rc; 4828 + ud: 4829 + *opcode = ud; 4830 + return rc; 4831 + } 4832 + 4767 4833 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) 4768 4834 { 4769 4835 int rc = X86EMUL_CONTINUE; 4770 4836 int mode = ctxt->mode; 4771 4837 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 4772 - bool op_prefix = false; 4838 + bool vex_prefix = false; 4773 4839 bool has_seg_override = false; 4774 4840 struct opcode opcode; 4775 4841 u16 dummy; ··· 4896 4812 for (;;) { 4897 4813 switch (ctxt->b = insn_fetch(u8, ctxt)) { 4898 4814 case 0x66: /* operand-size override */ 4899 - op_prefix = true; 4815 + ctxt->op_prefix = true; 4900 4816 /* switch between 2/4 bytes */ 4901 4817 ctxt->op_bytes = def_op_bytes ^ 6; 4902 4818 break; ··· 4935 4851 case 0x40 ... 0x4f: /* REX */ 4936 4852 if (mode != X86EMUL_MODE_PROT64) 4937 4853 goto done_prefixes; 4938 - ctxt->rex_prefix = ctxt->b; 4854 + ctxt->rex_prefix = REX_PREFIX; 4855 + ctxt->rex_bits = ctxt->b & 0xf; 4939 4856 continue; 4940 4857 case 0xf0: /* LOCK */ 4941 4858 ctxt->lock_prefix = 1; ··· 4950 4865 } 4951 4866 4952 4867 /* Any legacy prefix after a REX prefix nullifies its effect. */ 4953 - 4954 - ctxt->rex_prefix = 0; 4868 + ctxt->rex_prefix = REX_NONE; 4869 + ctxt->rex_bits = 0; 4955 4870 } 4956 4871 4957 4872 done_prefixes: 4958 4873 4959 4874 /* REX prefix. */ 4960 - if (ctxt->rex_prefix & 8) 4961 - ctxt->op_bytes = 8; /* REX.W */ 4875 + if (ctxt->rex_bits & REX_W) 4876 + ctxt->op_bytes = 8; 4962 4877 4963 4878 /* Opcode byte(s). */ 4964 - opcode = opcode_table[ctxt->b]; 4965 - /* Two-byte opcode? 
*/ 4966 - if (ctxt->b == 0x0f) { 4879 + if (ctxt->b == 0xc4 || ctxt->b == 0xc5) { 4880 + /* VEX or LDS/LES */ 4881 + u8 vex_2nd = insn_fetch(u8, ctxt); 4882 + if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) { 4883 + opcode = opcode_table[ctxt->b]; 4884 + ctxt->modrm = vex_2nd; 4885 + /* the Mod/RM byte has been fetched already! */ 4886 + goto done_modrm; 4887 + } 4888 + 4889 + vex_prefix = true; 4890 + rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode); 4891 + if (rc != X86EMUL_CONTINUE) 4892 + goto done; 4893 + } else if (ctxt->b == 0x0f) { 4894 + /* Two- or three-byte opcode */ 4967 4895 ctxt->opcode_len = 2; 4968 4896 ctxt->b = insn_fetch(u8, ctxt); 4969 4897 opcode = twobyte_table[ctxt->b]; ··· 4987 4889 ctxt->b = insn_fetch(u8, ctxt); 4988 4890 opcode = opcode_map_0f_38[ctxt->b]; 4989 4891 } 4892 + } else { 4893 + /* Opcode byte(s). */ 4894 + opcode = opcode_table[ctxt->b]; 4990 4895 } 4991 - ctxt->d = opcode.flags; 4992 4896 4993 - if (ctxt->d & ModRM) 4897 + if (opcode.flags & ModRM) 4994 4898 ctxt->modrm = insn_fetch(u8, ctxt); 4995 4899 4996 - /* vex-prefix instructions are not implemented */ 4997 - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && 4998 - (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { 4999 - ctxt->d = NotImpl; 5000 - } 5001 - 4900 + done_modrm: 4901 + ctxt->d = opcode.flags; 5002 4902 while (ctxt->d & GroupMask) { 5003 4903 switch (ctxt->d & GroupMask) { 5004 4904 case Group: ··· 5015 4919 opcode = opcode.u.group[goffset]; 5016 4920 break; 5017 4921 case Prefix: 5018 - if (ctxt->rep_prefix && op_prefix) 4922 + if (ctxt->rep_prefix && ctxt->op_prefix) 5019 4923 return EMULATION_FAILED; 5020 - simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; 4924 + simd_prefix = ctxt->op_prefix ? 
0x66 : ctxt->rep_prefix; 5021 4925 switch (simd_prefix) { 5022 4926 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 5023 4927 case 0x66: opcode = opcode.u.gprefix->pfx_66; break; ··· 5061 4965 /* Unrecognised? */ 5062 4966 if (ctxt->d == 0) 5063 4967 return EMULATION_FAILED; 4968 + 4969 + if (unlikely(vex_prefix)) { 4970 + /* 4971 + * Only specifically marked instructions support VEX. Since many 4972 + * instructions support it but are not annotated, return not implemented 4973 + * rather than #UD. 4974 + */ 4975 + if (!(ctxt->d & Avx)) 4976 + return EMULATION_FAILED; 4977 + 4978 + if (!(ctxt->d & AlignMask)) 4979 + ctxt->d |= Unaligned; 4980 + } 5064 4981 5065 4982 ctxt->execute = opcode.u.execute; 5066 4983 ··· 5145 5036 if ((ctxt->d & No16) && ctxt->op_bytes == 2) 5146 5037 ctxt->op_bytes = 4; 5147 5038 5148 - if (ctxt->d & Sse) 5149 - ctxt->op_bytes = 16; 5039 + if (vex_prefix) 5040 + ; 5041 + else if (ctxt->d & Sse) 5042 + ctxt->op_bytes = 16, ctxt->d &= ~Avx; 5150 5043 else if (ctxt->d & Mmx) 5151 5044 ctxt->op_bytes = 8; 5152 5045 } ··· 5248 5137 { 5249 5138 /* Clear fields that are set conditionally but read without a guard. 
*/ 5250 5139 ctxt->rip_relative = false; 5251 - ctxt->rex_prefix = 0; 5140 + ctxt->rex_prefix = REX_NONE; 5141 + ctxt->rex_bits = 0; 5252 5142 ctxt->lock_prefix = 0; 5143 + ctxt->op_prefix = false; 5253 5144 ctxt->rep_prefix = 0; 5254 5145 ctxt->regs_valid = 0; 5255 5146 ctxt->regs_dirty = 0; ··· 5281 5168 } 5282 5169 5283 5170 if (unlikely(ctxt->d & 5284 - (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { 5171 + (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { 5285 5172 if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || 5286 5173 (ctxt->d & Undefined)) { 5287 5174 rc = emulate_ud(ctxt); 5288 5175 goto done; 5289 5176 } 5290 5177 5291 - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) 5292 - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { 5178 + if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) { 5293 5179 rc = emulate_ud(ctxt); 5294 5180 goto done; 5295 5181 } 5296 5182 5297 - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 5183 + if (ctxt->d & Avx) { 5184 + u64 xcr = 0; 5185 + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE) 5186 + || ops->get_xcr(ctxt, 0, &xcr) 5187 + || !(xcr & XFEATURE_MASK_YMM)) { 5188 + rc = emulate_ud(ctxt); 5189 + goto done; 5190 + } 5191 + } else if (ctxt->d & Sse) { 5192 + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) { 5193 + rc = emulate_ud(ctxt); 5194 + goto done; 5195 + } 5196 + } 5197 + 5198 + if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 5298 5199 rc = emulate_nm(ctxt); 5299 5200 goto done; 5300 5201 }
+66
arch/x86/kvm/fpu.h
··· 15 15 #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) 16 16 #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) 17 17 18 + typedef u32 __attribute__((vector_size(32))) avx256_t; 19 + 20 + static inline void _kvm_read_avx_reg(int reg, avx256_t *data) 21 + { 22 + switch (reg) { 23 + case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break; 24 + case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break; 25 + case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break; 26 + case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break; 27 + case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break; 28 + case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break; 29 + case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break; 30 + case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break; 31 + #ifdef CONFIG_X86_64 32 + case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break; 33 + case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break; 34 + case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break; 35 + case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break; 36 + case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break; 37 + case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break; 38 + case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break; 39 + case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break; 40 + #endif 41 + default: BUG(); 42 + } 43 + } 44 + 45 + static inline void _kvm_write_avx_reg(int reg, const avx256_t *data) 46 + { 47 + switch (reg) { 48 + case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break; 49 + case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break; 50 + case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break; 51 + case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break; 52 + case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break; 53 + case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break; 54 + case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break; 55 + case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break; 56 + 
#ifdef CONFIG_X86_64 57 + case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break; 58 + case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break; 59 + case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break; 60 + case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break; 61 + case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break; 62 + case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break; 63 + case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break; 64 + case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break; 65 + #endif 66 + default: BUG(); 67 + } 68 + } 69 + 18 70 static inline void _kvm_read_sse_reg(int reg, sse128_t *data) 19 71 { 20 72 switch (reg) { ··· 159 107 static inline void kvm_fpu_put(void) 160 108 { 161 109 fpregs_unlock(); 110 + } 111 + 112 + static inline void kvm_read_avx_reg(int reg, avx256_t *data) 113 + { 114 + kvm_fpu_get(); 115 + _kvm_read_avx_reg(reg, data); 116 + kvm_fpu_put(); 117 + } 118 + 119 + static inline void kvm_write_avx_reg(int reg, const avx256_t *data) 120 + { 121 + kvm_fpu_get(); 122 + _kvm_write_avx_reg(reg, data); 123 + kvm_fpu_put(); 162 124 } 163 125 164 126 static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+1 -1
arch/x86/kvm/hyperv.c
··· 1568 1568 * only, there can be valuable data in the rest which needs 1569 1569 * to be preserved e.g. on migration. 1570 1570 */ 1571 - if (__put_user(0, (u32 __user *)addr)) 1571 + if (put_user(0, (u32 __user *)addr)) 1572 1572 return 1; 1573 1573 hv_vcpu->hv_vapic = data; 1574 1574 kvm_vcpu_mark_page_dirty(vcpu, gfn);
+16 -4
arch/x86/kvm/kvm_emulate.h
··· 237 237 bool (*is_smm)(struct x86_emulate_ctxt *ctxt); 238 238 int (*leave_smm)(struct x86_emulate_ctxt *ctxt); 239 239 void (*triple_fault)(struct x86_emulate_ctxt *ctxt); 240 + int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); 240 241 int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); 241 242 242 243 gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, ··· 249 248 250 249 /* Type, address-of, and value of an instruction's operand. */ 251 250 struct operand { 252 - enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; 251 + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type; 253 252 unsigned int bytes; 254 253 unsigned int count; 255 254 union { ··· 268 267 union { 269 268 unsigned long val; 270 269 u64 val64; 271 - char valptr[sizeof(sse128_t)]; 270 + char valptr[sizeof(avx256_t)]; 272 271 sse128_t vec_val; 272 + avx256_t vec_val2; 273 273 u64 mm_val; 274 274 void *data; 275 - }; 275 + } __aligned(32); 276 276 }; 277 277 278 278 #define X86_MAX_INSTRUCTION_LENGTH 15 ··· 319 317 #define NR_EMULATOR_GPRS 8 320 318 #endif 321 319 320 + /* 321 + * Distinguish between no prefix, REX, or in the future REX2. 322 + */ 323 + enum rex_type { 324 + REX_NONE, 325 + REX_PREFIX, 326 + }; 327 + 322 328 struct x86_emulate_ctxt { 323 329 void *vcpu; 324 330 const struct x86_emulate_ops *ops; ··· 358 348 u8 opcode_len; 359 349 u8 b; 360 350 u8 intercept; 351 + bool op_prefix; 361 352 u8 op_bytes; 362 353 u8 ad_bytes; 363 354 union { ··· 368 357 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 369 358 370 359 bool rip_relative; 371 - u8 rex_prefix; 360 + enum rex_type rex_prefix; 361 + u8 rex_bits; 372 362 u8 lock_prefix; 373 363 u8 rep_prefix; 374 364 /* bitmaps of registers in _regs[] that can be read */
+31 -13
arch/x86/kvm/lapic.c
··· 2126 2126 2127 2127 static void advance_periodic_target_expiration(struct kvm_lapic *apic) 2128 2128 { 2129 + struct kvm_timer *ktimer = &apic->lapic_timer; 2129 2130 ktime_t now = ktime_get(); 2130 2131 u64 tscl = rdtsc(); 2131 2132 ktime_t delta; 2132 2133 2133 2134 /* 2134 - * Synchronize both deadlines to the same time source or 2135 - * differences in the periods (caused by differences in the 2136 - * underlying clocks or numerical approximation errors) will 2137 - * cause the two to drift apart over time as the errors 2138 - * accumulate. 2135 + * Use kernel time as the time source for both the hrtimer deadline and 2136 + * TSC-based deadline so that they stay synchronized. Computing each 2137 + * deadline independently will cause the two deadlines to drift apart 2138 + * over time as differences in the periods accumulate, e.g. due to 2139 + * differences in the underlying clocks or numerical approximation errors. 2139 2140 */ 2140 - apic->lapic_timer.target_expiration = 2141 - ktime_add_ns(apic->lapic_timer.target_expiration, 2142 - apic->lapic_timer.period); 2143 - delta = ktime_sub(apic->lapic_timer.target_expiration, now); 2144 - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + 2145 - nsec_to_cycles(apic->vcpu, delta); 2141 + ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration, 2142 + ktimer->period); 2143 + 2144 + /* 2145 + * If the new expiration is in the past, e.g. because userspace stopped 2146 + * running the VM for an extended duration, then force the expiration 2147 + * to "now" and don't try to play catch-up with the missed events. KVM 2148 + * will only deliver a single interrupt regardless of how many events 2149 + * are pending, i.e. restarting the timer with an expiration in the 2150 + * past will do nothing more than waste host cycles, and can even lead 2151 + * to a hard lockup in extreme cases. 
2152 + */ 2153 + if (ktime_before(ktimer->target_expiration, now)) 2154 + ktimer->target_expiration = now; 2155 + 2156 + /* 2157 + * Note, ensuring the expiration isn't in the past also prevents delta 2158 + * from going negative, which could cause the TSC deadline to become 2159 + * excessively large due to it being an unsigned value. 2160 + */ 2161 + delta = ktime_sub(ktimer->target_expiration, now); 2162 + ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + 2163 + nsec_to_cycles(apic->vcpu, delta); 2146 2164 } 2147 2165 2148 2166 static void start_sw_period(struct kvm_lapic *apic) ··· 2988 2970 2989 2971 apic_timer_expired(apic, true); 2990 2972 2991 - if (lapic_is_periodic(apic)) { 2973 + if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) { 2992 2974 advance_periodic_target_expiration(apic); 2993 - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 2975 + hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration); 2994 2976 return HRTIMER_RESTART; 2995 2977 } else 2996 2978 return HRTIMER_NORESTART;
-2
arch/x86/kvm/mmu.h
··· 235 235 return -(u32)fault & errcode; 236 236 } 237 237 238 - bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); 239 - 240 238 int kvm_mmu_post_init_vm(struct kvm *kvm); 241 239 void kvm_mmu_pre_destroy_vm(struct kvm *kvm); 242 240
+1 -1
arch/x86/kvm/mmu/mmu.c
··· 4859 4859 */ 4860 4860 BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); 4861 4861 4862 - vcpu->arch.l1tf_flush_l1d = true; 4862 + kvm_request_l1tf_flush_l1d(); 4863 4863 if (!flags) { 4864 4864 trace_kvm_page_fault(vcpu, fault_address, error_code); 4865 4865
+1 -1
arch/x86/kvm/mmu/paging_tmpl.h
··· 402 402 goto error; 403 403 404 404 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 405 - if (unlikely(__get_user(pte, ptep_user))) 405 + if (unlikely(get_user(pte, ptep_user))) 406 406 goto error; 407 407 walker->ptep_user[walker->level - 1] = ptep_user; 408 408
+1 -1
arch/x86/kvm/mmu/spte.c
··· 292 292 mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); 293 293 } 294 294 295 - if (static_branch_unlikely(&cpu_buf_vm_clear) && 295 + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 296 296 !kvm_vcpu_can_access_host_mmio(vcpu) && 297 297 kvm_is_mmio_pfn(pfn, &is_host_mmio)) 298 298 kvm_track_host_mmio_mapping(vcpu);
+10 -17
arch/x86/kvm/svm/svm.c
··· 3442 3442 3443 3443 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3444 3444 { 3445 - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3446 3445 dump_vmcb(vcpu); 3447 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3448 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3449 - vcpu->run->internal.ndata = 2; 3450 - vcpu->run->internal.data[0] = exit_code; 3451 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3446 + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); 3452 3447 return 0; 3453 3448 } 3454 3449 ··· 4246 4251 svm_set_dr6(vcpu, DR6_ACTIVE_LOW); 4247 4252 4248 4253 clgi(); 4249 - kvm_load_guest_xsave_state(vcpu); 4250 4254 4251 4255 /* 4252 4256 * Hardware only context switches DEBUGCTL if LBR virtualization is ··· 4288 4294 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4289 4295 update_debugctlmsr(vcpu->arch.host_debugctl); 4290 4296 4291 - kvm_load_host_xsave_state(vcpu); 4292 4297 stgi(); 4293 4298 4294 4299 /* Any pending NMI will happen here */ ··· 4318 4325 kvm_read_and_reset_apf_flags(); 4319 4326 4320 4327 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4321 - 4322 - /* 4323 - * We need to handle MC intercepts here before the vcpu has a chance to 4324 - * change the physical cpu 4325 - */ 4326 - if (unlikely(svm->vmcb->control.exit_code == 4327 - SVM_EXIT_EXCP_BASE + MC_VECTOR)) 4328 - svm_handle_mce(vcpu); 4329 4328 4330 4329 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4331 4330 ··· 4607 4622 4608 4623 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4609 4624 { 4610 - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) 4625 + switch (to_svm(vcpu)->vmcb->control.exit_code) { 4626 + case SVM_EXIT_EXCP_BASE + MC_VECTOR: 4627 + svm_handle_mce(vcpu); 4628 + break; 4629 + case SVM_EXIT_INTR: 4611 4630 vcpu->arch.at_instruction_boundary = true; 4631 + break; 4632 + default: 4633 + break; 4634 + } 4612 4635 } 4613 4636 4614 4637 static void 
svm_setup_mce(struct kvm_vcpu *vcpu)
+4 -2
arch/x86/kvm/svm/vmenter.S
··· 92 92 jmp 901b 93 93 .endm 94 94 95 + #define SVM_CLEAR_CPU_BUFFERS \ 96 + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM 95 97 96 98 /** 97 99 * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode ··· 172 170 mov VCPU_RDI(%_ASM_DI), %_ASM_DI 173 171 174 172 /* Clobbers EFLAGS.ZF */ 175 - VM_CLEAR_CPU_BUFFERS 173 + SVM_CLEAR_CPU_BUFFERS 176 174 177 175 /* Enter guest mode */ 178 176 3: vmrun %_ASM_AX ··· 341 339 mov KVM_VMCB_pa(%rax), %rax 342 340 343 341 /* Clobbers EFLAGS.ZF */ 344 - VM_CLEAR_CPU_BUFFERS 342 + SVM_CLEAR_CPU_BUFFERS 345 343 346 344 /* Enter guest mode */ 347 345 1: vmrun %rax
+1 -1
arch/x86/kvm/vmx/nested.c
··· 3880 3880 goto vmentry_failed; 3881 3881 3882 3882 /* Hide L1D cache contents from the nested guest. */ 3883 - vmx->vcpu.arch.l1tf_flush_l1d = true; 3883 + kvm_request_l1tf_flush_l1d(); 3884 3884 3885 3885 /* 3886 3886 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
+3 -7
arch/x86/kvm/vmx/run_flags.h
··· 2 2 #ifndef __KVM_X86_VMX_RUN_FLAGS_H 3 3 #define __KVM_X86_VMX_RUN_FLAGS_H 4 4 5 - #define VMX_RUN_VMRESUME_SHIFT 0 6 - #define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 7 - #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2 8 - 9 - #define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) 10 - #define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) 11 - #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT) 5 + #define VMX_RUN_VMRESUME BIT(0) 6 + #define VMX_RUN_SAVE_SPEC_CTRL BIT(1) 7 + #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) 12 8 13 9 #endif /* __KVM_X86_VMX_RUN_FLAGS_H */
+24 -41
arch/x86/kvm/vmx/tdx.c
··· 763 763 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 764 764 } 765 765 766 - /* 767 - * Compared to vmx_prepare_switch_to_guest(), there is not much to do 768 - * as SEAMCALL/SEAMRET calls take care of most of save and restore. 769 - */ 770 - void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 771 - { 772 - struct vcpu_vt *vt = to_vt(vcpu); 773 - 774 - if (vt->guest_state_loaded) 775 - return; 776 - 777 - if (likely(is_64bit_mm(current->mm))) 778 - vt->msr_host_kernel_gs_base = current->thread.gsbase; 779 - else 780 - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 781 - 782 - vt->guest_state_loaded = true; 783 - } 784 - 785 766 struct tdx_uret_msr { 786 767 u32 msr; 787 768 unsigned int slot; ··· 776 795 {.msr = MSR_TSC_AUX,}, 777 796 }; 778 797 779 - static void tdx_user_return_msr_update_cache(void) 798 + void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 780 799 { 800 + struct vcpu_vt *vt = to_vt(vcpu); 781 801 int i; 782 802 803 + if (vt->guest_state_loaded) 804 + return; 805 + 806 + if (likely(is_64bit_mm(current->mm))) 807 + vt->msr_host_kernel_gs_base = current->thread.gsbase; 808 + else 809 + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 810 + 811 + vt->guest_state_loaded = true; 812 + 813 + /* 814 + * Explicitly set user-return MSRs that are clobbered by the TDX-Module 815 + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be 816 + * written by the TDX-Module. Don't rely on the TDX-Module to actually 817 + * clobber the MSRs, as the contract is poorly defined and not upheld. 818 + * E.g. the TDX-Module will synthesize an EPT Violation without doing 819 + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM 820 + * state. 
821 + */ 783 822 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 784 - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 785 - tdx_uret_msrs[i].defval); 823 + kvm_set_user_return_msr(tdx_uret_msrs[i].slot, 824 + tdx_uret_msrs[i].defval, -1ull); 786 825 } 787 826 788 827 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 789 828 { 790 829 struct vcpu_vt *vt = to_vt(vcpu); 791 - struct vcpu_tdx *tdx = to_tdx(vcpu); 792 830 793 831 if (!vt->guest_state_loaded) 794 832 return; 795 833 796 834 ++vcpu->stat.host_state_reload; 797 835 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 798 - 799 - if (tdx->guest_entered) { 800 - tdx_user_return_msr_update_cache(); 801 - tdx->guest_entered = false; 802 - } 803 836 804 837 vt->guest_state_loaded = false; 805 838 } ··· 1054 1059 update_debugctlmsr(vcpu->arch.host_debugctl); 1055 1060 1056 1061 tdx_load_host_xsave_state(vcpu); 1057 - tdx->guest_entered = true; 1058 1062 1059 1063 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1060 1064 ··· 1062 1068 1063 1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1064 1070 return EXIT_FASTPATH_NONE; 1065 - 1066 - if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1067 - kvm_machine_check(); 1068 1071 1069 1072 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1070 1073 ··· 2136 2145 } 2137 2146 2138 2147 unhandled_exit: 2139 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2140 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2141 - vcpu->run->internal.ndata = 2; 2142 - vcpu->run->internal.data[0] = vp_enter_ret; 2143 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2148 + kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret); 2144 2149 return 0; 2145 2150 } 2146 2151 ··· 3434 3447 /* 3435 3448 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3436 3449 * before returning to user space. 
3437 - * 3438 - * this_cpu_ptr(user_return_msrs)->registered isn't checked 3439 - * because the registration is done at vcpu runtime by 3440 - * tdx_user_return_msr_update_cache(). 3441 3450 */ 3442 3451 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3443 3452 if (tdx_uret_msrs[i].slot == -1) {
-1
arch/x86/kvm/vmx/tdx.h
··· 67 67 u64 vp_enter_ret; 68 68 69 69 enum vcpu_tdx_state state; 70 - bool guest_entered; 71 70 72 71 u64 map_gpa_next; 73 72 u64 map_gpa_end;
+18 -11
arch/x86/kvm/vmx/vmenter.S
··· 71 71 * @regs: unsigned long * (to guest registers) 72 72 * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH 73 73 * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl 74 + * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO 74 75 * 75 76 * Returns: 76 77 * 0 on VM-Exit, 1 on VM-Fail ··· 93 92 /* Save @vmx for SPEC_CTRL handling */ 94 93 push %_ASM_ARG1 95 94 96 - /* Save @flags for SPEC_CTRL handling */ 95 + /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */ 97 96 push %_ASM_ARG3 98 97 99 98 /* ··· 101 100 * @regs is needed after VM-Exit to save the guest's register values. 102 101 */ 103 102 push %_ASM_ARG2 104 - 105 - /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ 106 - mov %_ASM_ARG3L, %ebx 107 103 108 104 lea (%_ASM_SP), %_ASM_ARG2 109 105 call vmx_update_host_rsp ··· 135 137 /* Load @regs to RAX. */ 136 138 mov (%_ASM_SP), %_ASM_AX 137 139 138 - /* Check if vmlaunch or vmresume is needed */ 139 - bt $VMX_RUN_VMRESUME_SHIFT, %ebx 140 - 141 140 /* Load guest registers. Don't clobber flags. */ 142 141 mov VCPU_RCX(%_ASM_AX), %_ASM_CX 143 142 mov VCPU_RDX(%_ASM_AX), %_ASM_DX ··· 155 160 /* Load guest RAX. This kills the @regs pointer! */ 156 161 mov VCPU_RAX(%_ASM_AX), %_ASM_AX 157 162 158 - /* Clobbers EFLAGS.ZF */ 159 - CLEAR_CPU_BUFFERS 163 + /* 164 + * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is 165 + * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled, 166 + * check @flags to see if the vCPU has access to host MMIO, and if so, 167 + * do VERW. Else, do nothing (no mitigations needed/enabled). 168 + */ 169 + ALTERNATIVE_2 "", \ 170 + __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \ 171 + jz .Lskip_mmio_verw; \ 172 + VERW; \ 173 + .Lskip_mmio_verw:), \ 174 + X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \ 175 + __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM 160 176 161 - /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. 
*/ 162 - jnc .Lvmlaunch 177 + /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ 178 + testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) 179 + jz .Lvmlaunch 163 180 164 181 /* 165 182 * After a successful VMRESUME/VMLAUNCH, control flow "magically"
+134 -130
arch/x86/kvm/vmx/vmx.c
··· 203 203 204 204 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 205 205 206 + #ifdef CONFIG_CPU_MITIGATIONS 206 207 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 207 208 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 208 209 static DEFINE_MUTEX(vmx_l1d_flush_mutex); ··· 226 225 #define L1D_CACHE_ORDER 4 227 226 static void *vmx_l1d_flush_pages; 228 227 229 - static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 228 + static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 230 229 { 231 230 struct page *page; 232 231 unsigned int i; ··· 303 302 return 0; 304 303 } 305 304 305 + static int vmx_setup_l1d_flush(void) 306 + { 307 + /* 308 + * Hand the parameter mitigation value in which was stored in the pre 309 + * module init parser. If no parameter was given, it will contain 310 + * 'auto' which will be turned into the default 'cond' mitigation mode. 311 + */ 312 + return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); 313 + } 314 + 315 + static void vmx_cleanup_l1d_flush(void) 316 + { 317 + if (vmx_l1d_flush_pages) { 318 + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 319 + vmx_l1d_flush_pages = NULL; 320 + } 321 + /* Restore state so sysfs ignores VMX */ 322 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 323 + } 324 + 306 325 static int vmentry_l1d_flush_parse(const char *s) 307 326 { 308 327 unsigned int i; ··· 360 339 } 361 340 362 341 mutex_lock(&vmx_l1d_flush_mutex); 363 - ret = vmx_setup_l1d_flush(l1tf); 342 + ret = __vmx_setup_l1d_flush(l1tf); 364 343 mutex_unlock(&vmx_l1d_flush_mutex); 365 344 return ret; 366 345 } ··· 372 351 373 352 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 374 353 } 354 + 355 + /* 356 + * Software based L1D cache flush which is used when microcode providing 357 + * the cache control MSR is not loaded. 
358 + * 359 + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 360 + * flush it is required to read in 64 KiB because the replacement algorithm 361 + * is not exactly LRU. This could be sized at runtime via topology 362 + * information but as all relevant affected CPUs have 32KiB L1D cache size 363 + * there is no point in doing so. 364 + */ 365 + static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 366 + { 367 + int size = PAGE_SIZE << L1D_CACHE_ORDER; 368 + 369 + if (!static_branch_unlikely(&vmx_l1d_should_flush)) 370 + return; 371 + 372 + /* 373 + * This code is only executed when the flush mode is 'cond' or 374 + * 'always' 375 + */ 376 + if (static_branch_likely(&vmx_l1d_flush_cond)) { 377 + /* 378 + * Clear the per-cpu flush bit, it gets set again if the vCPU 379 + * is reloaded, i.e. if the vCPU is scheduled out or if KVM 380 + * exits to userspace, or if KVM reaches one of the unsafe 381 + * VMEXIT handlers, e.g. if KVM calls into the emulator, 382 + * or from the interrupt handlers. 
383 + */ 384 + if (!kvm_get_cpu_l1tf_flush_l1d()) 385 + return; 386 + kvm_clear_cpu_l1tf_flush_l1d(); 387 + } 388 + 389 + vcpu->stat.l1d_flush++; 390 + 391 + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 392 + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 393 + return; 394 + } 395 + 396 + asm volatile( 397 + /* First ensure the pages are in the TLB */ 398 + "xorl %%eax, %%eax\n" 399 + ".Lpopulate_tlb:\n\t" 400 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 401 + "addl $4096, %%eax\n\t" 402 + "cmpl %%eax, %[size]\n\t" 403 + "jne .Lpopulate_tlb\n\t" 404 + "xorl %%eax, %%eax\n\t" 405 + "cpuid\n\t" 406 + /* Now fill the cache */ 407 + "xorl %%eax, %%eax\n" 408 + ".Lfill_cache:\n" 409 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 410 + "addl $64, %%eax\n\t" 411 + "cmpl %%eax, %[size]\n\t" 412 + "jne .Lfill_cache\n\t" 413 + "lfence\n" 414 + :: [flush_pages] "r" (vmx_l1d_flush_pages), 415 + [size] "r" (size) 416 + : "eax", "ebx", "ecx", "edx"); 417 + } 418 + 419 + #else /* CONFIG_CPU_MITIGATIONS*/ 420 + static int vmx_setup_l1d_flush(void) 421 + { 422 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; 423 + return 0; 424 + } 425 + static void vmx_cleanup_l1d_flush(void) 426 + { 427 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 428 + } 429 + static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) 430 + { 431 + 432 + } 433 + static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 434 + { 435 + pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); 436 + return 0; 437 + } 438 + static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 439 + { 440 + return sysfs_emit(s, "never\n"); 441 + } 442 + #endif 443 + 444 + static const struct kernel_param_ops vmentry_l1d_flush_ops = { 445 + .set = vmentry_l1d_flush_set, 446 + .get = vmentry_l1d_flush_get, 447 + }; 448 + module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 375 449 376 450 static __always_inline void 
vmx_disable_fb_clear(struct vcpu_vmx *vmx) 377 451 { ··· 519 403 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 520 404 vmx->disable_fb_clear = false; 521 405 } 522 - 523 - static const struct kernel_param_ops vmentry_l1d_flush_ops = { 524 - .set = vmentry_l1d_flush_set, 525 - .get = vmentry_l1d_flush_get, 526 - }; 527 - module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 528 406 529 407 static u32 vmx_segment_access_rights(struct kvm_segment *var); 530 408 ··· 1013 903 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 1014 904 flags |= VMX_RUN_SAVE_SPEC_CTRL; 1015 905 1016 - if (static_branch_unlikely(&cpu_buf_vm_clear) && 906 + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 1017 907 kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) 1018 908 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; 1019 909 ··· 6741 6631 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6742 6632 6743 6633 unexpected_vmexit: 6744 - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6745 - exit_reason.full); 6746 6634 dump_vmcs(vcpu); 6747 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6748 - vcpu->run->internal.suberror = 6749 - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6750 - vcpu->run->internal.ndata = 2; 6751 - vcpu->run->internal.data[0] = exit_reason.full; 6752 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6635 + kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); 6753 6636 return 0; 6754 6637 } 6755 6638 ··· 6762 6659 return 0; 6763 6660 } 6764 6661 return ret; 6765 - } 6766 - 6767 - /* 6768 - * Software based L1D cache flush which is used when microcode providing 6769 - * the cache control MSR is not loaded. 6770 - * 6771 - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6772 - * flush it is required to read in 64 KiB because the replacement algorithm 6773 - * is not exactly LRU. 
This could be sized at runtime via topology 6774 - * information but as all relevant affected CPUs have 32KiB L1D cache size 6775 - * there is no point in doing so. 6776 - */ 6777 - static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6778 - { 6779 - int size = PAGE_SIZE << L1D_CACHE_ORDER; 6780 - 6781 - /* 6782 - * This code is only executed when the flush mode is 'cond' or 6783 - * 'always' 6784 - */ 6785 - if (static_branch_likely(&vmx_l1d_flush_cond)) { 6786 - bool flush_l1d; 6787 - 6788 - /* 6789 - * Clear the per-vcpu flush bit, it gets set again if the vCPU 6790 - * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6791 - * exits to userspace, or if KVM reaches one of the unsafe 6792 - * VMEXIT handlers, e.g. if KVM calls into the emulator. 6793 - */ 6794 - flush_l1d = vcpu->arch.l1tf_flush_l1d; 6795 - vcpu->arch.l1tf_flush_l1d = false; 6796 - 6797 - /* 6798 - * Clear the per-cpu flush bit, it gets set again from 6799 - * the interrupt handlers. 6800 - */ 6801 - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6802 - kvm_clear_cpu_l1tf_flush_l1d(); 6803 - 6804 - if (!flush_l1d) 6805 - return; 6806 - } 6807 - 6808 - vcpu->stat.l1d_flush++; 6809 - 6810 - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6811 - native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6812 - return; 6813 - } 6814 - 6815 - asm volatile( 6816 - /* First ensure the pages are in the TLB */ 6817 - "xorl %%eax, %%eax\n" 6818 - ".Lpopulate_tlb:\n\t" 6819 - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6820 - "addl $4096, %%eax\n\t" 6821 - "cmpl %%eax, %[size]\n\t" 6822 - "jne .Lpopulate_tlb\n\t" 6823 - "xorl %%eax, %%eax\n\t" 6824 - "cpuid\n\t" 6825 - /* Now fill the cache */ 6826 - "xorl %%eax, %%eax\n" 6827 - ".Lfill_cache:\n" 6828 - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6829 - "addl $64, %%eax\n\t" 6830 - "cmpl %%eax, %[size]\n\t" 6831 - "jne .Lfill_cache\n\t" 6832 - "lfence\n" 6833 - :: [flush_pages] "r" (vmx_l1d_flush_pages), 6834 - [size] "r" (size) 6835 - : "eax", 
"ebx", "ecx", "edx"); 6836 6662 } 6837 6663 6838 6664 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) ··· 7082 7050 if (to_vt(vcpu)->emulation_required) 7083 7051 return; 7084 7052 7085 - if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7053 + switch (vmx_get_exit_reason(vcpu).basic) { 7054 + case EXIT_REASON_EXTERNAL_INTERRUPT: 7086 7055 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7087 - else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) 7056 + break; 7057 + case EXIT_REASON_EXCEPTION_NMI: 7088 7058 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7059 + break; 7060 + case EXIT_REASON_MCE_DURING_VMENTRY: 7061 + kvm_machine_check(); 7062 + break; 7063 + default: 7064 + break; 7065 + } 7089 7066 } 7090 7067 7091 7068 /* ··· 7369 7328 7370 7329 guest_state_enter_irqoff(); 7371 7330 7372 - /* 7373 - * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7374 - * mitigation for MDS is done late in VMentry and is still 7375 - * executed in spite of L1D Flush. This is because an extra VERW 7376 - * should not matter much after the big hammer L1D Flush. 7377 - * 7378 - * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, 7379 - * and is affected by MMIO Stale Data. In such cases mitigation in only 7380 - * needed against an MMIO capable guest. 
7381 - */ 7382 - if (static_branch_unlikely(&vmx_l1d_should_flush)) 7383 - vmx_l1d_flush(vcpu); 7384 - else if (static_branch_unlikely(&cpu_buf_vm_clear) && 7385 - (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) 7386 - x86_clear_cpu_buffers(); 7331 + vmx_l1d_flush(vcpu); 7387 7332 7388 7333 vmx_disable_fb_clear(vmx); 7389 7334 ··· 7481 7454 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7482 7455 vmx_set_interrupt_shadow(vcpu, 0); 7483 7456 7484 - kvm_load_guest_xsave_state(vcpu); 7485 - 7486 7457 pt_guest_enter(vmx); 7487 7458 7488 7459 atomic_switch_perf_msrs(vmx); ··· 7524 7499 7525 7500 pt_guest_exit(vmx); 7526 7501 7527 - kvm_load_host_xsave_state(vcpu); 7528 - 7529 7502 if (is_guest_mode(vcpu)) { 7530 7503 /* 7531 7504 * Track VMLAUNCH/VMRESUME that have made past guest state ··· 7538 7515 7539 7516 if (unlikely(vmx->fail)) 7540 7517 return EXIT_FASTPATH_NONE; 7541 - 7542 - if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7543 - kvm_machine_check(); 7544 7518 7545 7519 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7546 7520 ··· 8699 8679 return r; 8700 8680 } 8701 8681 8702 - static void vmx_cleanup_l1d_flush(void) 8703 - { 8704 - if (vmx_l1d_flush_pages) { 8705 - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8706 - vmx_l1d_flush_pages = NULL; 8707 - } 8708 - /* Restore state so sysfs ignores VMX */ 8709 - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8710 - } 8711 - 8712 8682 void vmx_exit(void) 8713 8683 { 8714 8684 allow_smaller_maxphyaddr = false; ··· 8734 8724 if (r) 8735 8725 return r; 8736 8726 8737 - /* 8738 - * Must be called after common x86 init so enable_ept is properly set 8739 - * up. Hand the parameter mitigation value in which was stored in 8740 - * the pre module init parser. If no parameter was given, it will 8741 - * contain 'auto' which will be turned into the default 'cond' 8742 - * mitigation mode. 
8743 - */ 8744 - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8727 + /* Must be called after common x86 init so enable_ept is set up. */ 8728 + r = vmx_setup_l1d_flush(); 8745 8729 if (r) 8746 8730 goto err_l1d_flush; 8747 8731
+122 -129
arch/x86/kvm/x86.c
··· 159 159 unsigned int min_timer_period_us = 200; 160 160 module_param(min_timer_period_us, uint, 0644); 161 161 162 - static bool __read_mostly kvmclock_periodic_sync = true; 163 - module_param(kvmclock_periodic_sync, bool, 0444); 164 - 165 162 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 166 163 static u32 __read_mostly tsc_tolerance_ppm = 250; 167 164 module_param(tsc_tolerance_ppm, uint, 0644); ··· 209 212 u32 __read_mostly kvm_nr_uret_msrs; 210 213 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); 211 214 static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; 212 - static struct kvm_user_return_msrs __percpu *user_return_msrs; 215 + static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); 213 216 214 217 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ 215 218 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ ··· 572 575 vcpu->arch.apf.gfns[i] = ~0; 573 576 } 574 577 578 + static void kvm_destroy_user_return_msrs(void) 579 + { 580 + int cpu; 581 + 582 + for_each_possible_cpu(cpu) 583 + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); 584 + 585 + kvm_nr_uret_msrs = 0; 586 + } 587 + 575 588 static void kvm_on_user_return(struct user_return_notifier *urn) 576 589 { 577 590 unsigned slot; 578 591 struct kvm_user_return_msrs *msrs 579 592 = container_of(urn, struct kvm_user_return_msrs, urn); 580 593 struct kvm_user_return_msr_values *values; 581 - unsigned long flags; 582 594 583 - /* 584 - * Disabling irqs at this point since the following code could be 585 - * interrupted and executed through kvm_arch_disable_virtualization_cpu() 586 - */ 587 - local_irq_save(flags); 588 - if (msrs->registered) { 589 - msrs->registered = false; 590 - user_return_notifier_unregister(urn); 591 - } 592 - local_irq_restore(flags); 595 + msrs->registered = false; 596 + user_return_notifier_unregister(urn); 597 + 593 598 for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { 594 599 values = 
&msrs->values[slot]; 595 600 if (values->host != values->curr) { ··· 642 643 643 644 static void kvm_user_return_msr_cpu_online(void) 644 645 { 645 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 646 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 646 647 u64 value; 647 648 int i; 648 649 ··· 664 665 665 666 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) 666 667 { 667 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 668 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 668 669 int err; 669 670 670 671 value = (value & mask) | (msrs->values[slot].host & ~mask); ··· 680 681 } 681 682 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); 682 683 683 - void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) 684 - { 685 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 686 - 687 - msrs->values[slot].curr = value; 688 - kvm_user_return_register_notifier(msrs); 689 - } 690 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache); 691 - 692 684 u64 kvm_get_user_return_msr(unsigned int slot) 693 685 { 694 - return this_cpu_ptr(user_return_msrs)->values[slot].curr; 686 + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; 695 687 } 696 688 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); 697 689 698 690 static void drop_user_return_notifiers(void) 699 691 { 700 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 692 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 701 693 702 694 if (msrs->registered) 703 695 kvm_on_user_return(&msrs->urn); ··· 1035 1045 } 1036 1046 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); 1037 1047 1048 + static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 1049 + { 1050 + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 1051 + 1052 + return (vcpu->arch.apf.msr_en_val & mask) == mask; 1053 + } 1054 + 1038 1055 static inline u64 
pdptr_rsvd_bits(struct kvm_vcpu *vcpu) 1039 1056 { 1040 1057 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); ··· 1134 1137 } 1135 1138 1136 1139 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 1137 - kvm_clear_async_pf_completion_queue(vcpu); 1138 - kvm_async_pf_hash_reset(vcpu); 1139 - 1140 1140 /* 1141 1141 * Clearing CR0.PG is defined to flush the TLB from the guest's 1142 1142 * perspective. 1143 1143 */ 1144 1144 if (!(cr0 & X86_CR0_PG)) 1145 1145 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1146 + /* 1147 + * Check for async #PF completion events when enabling paging, 1148 + * as the vCPU may have previously encountered async #PFs (it's 1149 + * entirely legal for the guest to toggle paging on/off without 1150 + * waiting for the async #PF queue to drain). 1151 + */ 1152 + else if (kvm_pv_async_pf_enabled(vcpu)) 1153 + kvm_make_request(KVM_REQ_APF_READY, vcpu); 1146 1154 } 1147 1155 1148 1156 if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) ··· 1205 1203 } 1206 1204 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); 1207 1205 1208 - void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) 1206 + static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) 1209 1207 { 1210 1208 if (vcpu->arch.guest_state_protected) 1211 1209 return; 1212 1210 1213 - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { 1211 + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) 1212 + return; 1214 1213 1215 - if (vcpu->arch.xcr0 != kvm_host.xcr0) 1216 - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 1214 + if (vcpu->arch.xcr0 != kvm_host.xcr0) 1215 + xsetbv(XCR_XFEATURE_ENABLED_MASK, 1216 + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); 1217 1217 1218 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1219 - vcpu->arch.ia32_xss != kvm_host.xss) 1220 - wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); 1221 - } 1218 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1219 + vcpu->arch.ia32_xss != kvm_host.xss) 1220 + wrmsrq(MSR_IA32_XSS, load_guest ? 
vcpu->arch.ia32_xss : kvm_host.xss); 1221 + } 1222 + 1223 + static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) 1224 + { 1225 + if (vcpu->arch.guest_state_protected) 1226 + return; 1222 1227 1223 1228 if (cpu_feature_enabled(X86_FEATURE_PKU) && 1224 1229 vcpu->arch.pkru != vcpu->arch.host_pkru && ··· 1233 1224 kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) 1234 1225 wrpkru(vcpu->arch.pkru); 1235 1226 } 1236 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state); 1237 1227 1238 - void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) 1228 + static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) 1239 1229 { 1240 1230 if (vcpu->arch.guest_state_protected) 1241 1231 return; ··· 1246 1238 if (vcpu->arch.pkru != vcpu->arch.host_pkru) 1247 1239 wrpkru(vcpu->arch.host_pkru); 1248 1240 } 1249 - 1250 - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { 1251 - 1252 - if (vcpu->arch.xcr0 != kvm_host.xcr0) 1253 - xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); 1254 - 1255 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1256 - vcpu->arch.ia32_xss != kvm_host.xss) 1257 - wrmsrq(MSR_IA32_XSS, kvm_host.xss); 1258 - } 1259 - 1260 1241 } 1261 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); 1262 1242 1263 1243 #ifdef CONFIG_X86_64 1264 1244 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) ··· 3501 3505 /* 3502 3506 * kvmclock updates which are isolated to a given vcpu, such as 3503 3507 * vcpu->cpu migration, should not allow system_timestamp from 3504 - * the rest of the vcpus to remain static. Otherwise ntp frequency 3505 - * correction applies to one vcpu's system_timestamp but not 3506 - * the others. 3508 + * the rest of the vcpus to remain static. 3507 3509 * 3508 3510 * So in those cases, request a kvmclock update for all vcpus. 3509 - * We need to rate-limit these requests though, as they can 3510 - * considerably slow guests that have a large number of vcpus. 
3511 - * The time for a remote vcpu to update its kvmclock is bound 3512 - * by the delay we use to rate-limit the updates. 3511 + * The worst case for a remote vcpu to update its kvmclock 3512 + * is then bounded by maximum nohz sleep latency. 3513 3513 */ 3514 - 3515 - #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) 3516 - 3517 - static void kvmclock_update_fn(struct work_struct *work) 3514 + static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 3518 3515 { 3519 3516 unsigned long i; 3520 - struct delayed_work *dwork = to_delayed_work(work); 3521 - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 3522 - kvmclock_update_work); 3523 - struct kvm *kvm = container_of(ka, struct kvm, arch); 3524 3517 struct kvm_vcpu *vcpu; 3518 + struct kvm *kvm = v->kvm; 3525 3519 3526 3520 kvm_for_each_vcpu(i, vcpu, kvm) { 3527 3521 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 3528 3522 kvm_vcpu_kick(vcpu); 3529 3523 } 3530 - } 3531 - 3532 - static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 3533 - { 3534 - struct kvm *kvm = v->kvm; 3535 - 3536 - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 3537 - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 3538 - KVMCLOCK_UPDATE_DELAY); 3539 - } 3540 - 3541 - #define KVMCLOCK_SYNC_PERIOD (300 * HZ) 3542 - 3543 - static void kvmclock_sync_fn(struct work_struct *work) 3544 - { 3545 - struct delayed_work *dwork = to_delayed_work(work); 3546 - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 3547 - kvmclock_sync_work); 3548 - struct kvm *kvm = container_of(ka, struct kvm, arch); 3549 - 3550 - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 3551 - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 3552 - KVMCLOCK_SYNC_PERIOD); 3553 3524 } 3554 3525 3555 3526 /* These helpers are safe iff @msr is known to be an MCx bank MSR. 
*/ ··· 3611 3648 return 1; 3612 3649 } 3613 3650 return 0; 3614 - } 3615 - 3616 - static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 3617 - { 3618 - u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 3619 - 3620 - return (vcpu->arch.apf.msr_en_val & mask) == mask; 3621 3651 } 3622 3652 3623 3653 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) ··· 4138 4182 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4139 4183 return 1; 4140 4184 if (data & 0x1) { 4141 - vcpu->arch.apf.pageready_pending = false; 4185 + /* 4186 + * Pairs with the smp_mb__after_atomic() in 4187 + * kvm_arch_async_page_present_queued(). 4188 + */ 4189 + smp_store_mb(vcpu->arch.apf.pageready_pending, false); 4190 + 4142 4191 kvm_check_async_pf_completion(vcpu); 4143 4192 } 4144 4193 break; ··· 5149 5188 { 5150 5189 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 5151 5190 5152 - vcpu->arch.l1tf_flush_l1d = true; 5191 + kvm_request_l1tf_flush_l1d(); 5153 5192 5154 5193 if (vcpu->scheduled_out && pmu->version && pmu->event_count) { 5155 5194 pmu->need_cleanup = true; ··· 7959 7998 unsigned int bytes, struct x86_exception *exception) 7960 7999 { 7961 8000 /* kvm_write_guest_virt_system can pull in tons of pages. 
*/ 7962 - vcpu->arch.l1tf_flush_l1d = true; 8001 + kvm_request_l1tf_flush_l1d(); 7963 8002 7964 8003 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 7965 8004 PFERR_WRITE_MASK, exception); ··· 8803 8842 kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); 8804 8843 } 8805 8844 8845 + static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) 8846 + { 8847 + if (index != XCR_XFEATURE_ENABLED_MASK) 8848 + return 1; 8849 + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; 8850 + return 0; 8851 + } 8852 + 8806 8853 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) 8807 8854 { 8808 8855 return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); ··· 8883 8914 .is_smm = emulator_is_smm, 8884 8915 .leave_smm = emulator_leave_smm, 8885 8916 .triple_fault = emulator_triple_fault, 8917 + .get_xcr = emulator_get_xcr, 8886 8918 .set_xcr = emulator_set_xcr, 8887 8919 .get_untagged_addr = emulator_get_untagged_addr, 8888 8920 .is_canonical_addr = emulator_is_canonical_addr, ··· 9078 9108 run->internal.ndata = ndata; 9079 9109 } 9080 9110 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); 9111 + 9112 + void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) 9113 + { 9114 + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); 9115 + 9116 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 9117 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 9118 + vcpu->run->internal.ndata = 2; 9119 + vcpu->run->internal.data[0] = exit_reason; 9120 + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 9121 + } 9122 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); 9081 9123 9082 9124 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) 9083 9125 { ··· 9376 9394 return handle_emulation_failure(vcpu, emulation_type); 9377 9395 } 9378 9396 9379 - vcpu->arch.l1tf_flush_l1d = true; 9397 + kvm_request_l1tf_flush_l1d(); 
9380 9398 9381 9399 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 9382 9400 kvm_clear_exception_queue(vcpu); ··· 10013 10031 return -ENOMEM; 10014 10032 } 10015 10033 10016 - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); 10017 - if (!user_return_msrs) { 10018 - pr_err("failed to allocate percpu kvm_user_return_msrs\n"); 10019 - r = -ENOMEM; 10020 - goto out_free_x86_emulator_cache; 10021 - } 10022 - kvm_nr_uret_msrs = 0; 10023 - 10024 10034 r = kvm_mmu_vendor_module_init(); 10025 10035 if (r) 10026 - goto out_free_percpu; 10036 + goto out_free_x86_emulator_cache; 10027 10037 10028 10038 kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); 10029 10039 kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; ··· 10039 10065 10040 10066 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) 10041 10067 rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); 10068 + 10069 + WARN_ON_ONCE(kvm_nr_uret_msrs); 10042 10070 10043 10071 r = ops->hardware_setup(); 10044 10072 if (r != 0) ··· 10114 10138 kvm_x86_ops.enable_virtualization_cpu = NULL; 10115 10139 kvm_x86_call(hardware_unsetup)(); 10116 10140 out_mmu_exit: 10141 + kvm_destroy_user_return_msrs(); 10117 10142 kvm_mmu_vendor_module_exit(); 10118 - out_free_percpu: 10119 - free_percpu(user_return_msrs); 10120 10143 out_free_x86_emulator_cache: 10121 10144 kmem_cache_destroy(x86_emulator_cache); 10122 10145 return r; ··· 10143 10168 cancel_work_sync(&pvclock_gtod_work); 10144 10169 #endif 10145 10170 kvm_x86_call(hardware_unsetup)(); 10171 + kvm_destroy_user_return_msrs(); 10146 10172 kvm_mmu_vendor_module_exit(); 10147 - free_percpu(user_return_msrs); 10148 10173 kmem_cache_destroy(x86_emulator_cache); 10149 10174 #ifdef CONFIG_KVM_XEN 10150 10175 static_key_deferred_flush(&kvm_xen_enabled); ··· 11266 11291 if (vcpu->arch.guest_fpu.xfd_err) 11267 11292 wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 11268 11293 11294 + kvm_load_xfeatures(vcpu, true); 11295 + 11269 11296 if 
(unlikely(vcpu->arch.switch_db_regs && 11270 11297 !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { 11271 11298 set_debugreg(DR7_FIXED_1, 7); ··· 11296 11319 11297 11320 guest_timing_enter_irqoff(); 11298 11321 11322 + /* 11323 + * Swap PKRU with hardware breakpoints disabled to minimize the number 11324 + * of flows where non-KVM code can run with guest state loaded. 11325 + */ 11326 + kvm_load_guest_pkru(vcpu); 11327 + 11299 11328 for (;;) { 11300 11329 /* 11301 11330 * Assert that vCPU vs. VM APICv state is consistent. An APICv ··· 11329 11346 /* Note, VM-Exits that go down the "slow" path are accounted below. */ 11330 11347 ++vcpu->stat.exits; 11331 11348 } 11349 + 11350 + kvm_load_host_pkru(vcpu); 11332 11351 11333 11352 /* 11334 11353 * Do this here before restoring debug registers on the host. And ··· 11361 11376 11362 11377 vcpu->mode = OUTSIDE_GUEST_MODE; 11363 11378 smp_wmb(); 11379 + 11380 + kvm_load_xfeatures(vcpu, false); 11364 11381 11365 11382 /* 11366 11383 * Sync xfd before calling handle_exit_irqoff() which may ··· 12721 12734 12722 12735 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 12723 12736 { 12724 - struct kvm *kvm = vcpu->kvm; 12725 - 12726 12737 if (mutex_lock_killable(&vcpu->mutex)) 12727 12738 return; 12728 12739 vcpu_load(vcpu); ··· 12731 12746 vcpu->arch.msr_kvm_poll_control = 1; 12732 12747 12733 12748 mutex_unlock(&vcpu->mutex); 12734 - 12735 - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) 12736 - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 12737 - KVMCLOCK_SYNC_PERIOD); 12738 12749 } 12739 12750 12740 12751 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) ··· 13069 13088 void kvm_arch_disable_virtualization_cpu(void) 13070 13089 { 13071 13090 kvm_x86_call(disable_virtualization_cpu)(); 13072 - drop_user_return_notifiers(); 13091 + 13092 + /* 13093 + * Leave the user-return notifiers as-is when disabling virtualization 13094 + * for reboot, i.e. 
when disabling via IPI function call, and instead 13095 + * pin kvm.ko (if it's a module) to defend against use-after-free (in 13096 + * the *very* unlikely scenario module unload is racing with reboot). 13097 + * On a forced reboot, tasks aren't frozen before shutdown, and so KVM 13098 + * could be actively modifying user-return MSR state when the IPI to 13099 + * disable virtualization arrives. Handle the extreme edge case here 13100 + * instead of trying to account for it in the normal flows. 13101 + */ 13102 + if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) 13103 + drop_user_return_notifiers(); 13104 + else 13105 + __module_get(THIS_MODULE); 13073 13106 } 13074 13107 13075 13108 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) ··· 13154 13159 spin_lock_init(&kvm->arch.hv_root_tdp_lock); 13155 13160 kvm->arch.hv_root_tdp = INVALID_PAGE; 13156 13161 #endif 13157 - 13158 - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 13159 - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 13160 13162 13161 13163 kvm_apicv_init(kvm); 13162 13164 kvm_hv_init_vm(kvm); ··· 13261 13269 * is unsafe, i.e. will lead to use-after-free. The PIT also needs to 13262 13270 * be stopped before IRQ routing is freed. 
13263 13271 */ 13264 - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 13265 - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 13266 - 13267 13272 #ifdef CONFIG_KVM_IOAPIC 13268 13273 kvm_free_pit(kvm); 13269 13274 #endif ··· 13877 13888 if ((work->wakeup_all || work->notpresent_injected) && 13878 13889 kvm_pv_async_pf_enabled(vcpu) && 13879 13890 !apf_put_user_ready(vcpu, work->arch.token)) { 13880 - vcpu->arch.apf.pageready_pending = true; 13891 + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); 13881 13892 kvm_apic_set_irq(vcpu, &irq, NULL); 13882 13893 } 13883 13894 ··· 13888 13899 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) 13889 13900 { 13890 13901 kvm_make_request(KVM_REQ_APF_READY, vcpu); 13891 - if (!vcpu->arch.apf.pageready_pending) 13902 + 13903 + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ 13904 + smp_mb__after_atomic(); 13905 + 13906 + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) 13892 13907 kvm_vcpu_kick(vcpu); 13893 13908 } 13894 13909
+14 -2
arch/x86/kvm/x86.h
··· 420 420 return !(kvm->arch.disabled_quirks & quirk); 421 421 } 422 422 423 + static __always_inline void kvm_request_l1tf_flush_l1d(void) 424 + { 425 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 426 + /* 427 + * Use a raw write to set the per-CPU flag, as KVM will ensure a flush 428 + * even if preemption is currently enabled. If the current vCPU task 429 + * is migrated to a different CPU (or userspace runs the vCPU on a 430 + * different task) before the next VM-Entry, then kvm_arch_vcpu_load() 431 + * will request a flush on the new CPU. 432 + */ 433 + raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); 434 + #endif 435 + } 436 + 423 437 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 424 438 425 439 u64 get_kvmclock_ns(struct kvm *kvm); ··· 636 622 #endif 637 623 } 638 624 639 - void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); 640 - void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); 641 625 int kvm_spec_ctrl_test_value(u64 value); 642 626 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, 643 627 struct x86_exception *e);