Merge tag 'kvm-x86-misc-6.19' of https://github.com/kvm-x86/linux into HEAD

+5

arch/x86/include/asm/cpufeatures.h

··· 499 499 #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ 500 500 #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ 501 501 #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ 502 + #define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+17) /* 503 + * Clear CPU buffers before VM-Enter if the vCPU 504 + * can access host MMIO (ignored for all intents 505 + * and purposes if CLEAR_CPU_BUF_VM is set). 506 + */ 502 507 503 508 /* 504 509 * BUG word(s)

+2 -2

arch/x86/include/asm/hardirq.h

··· 5 5 #include <linux/threads.h> 6 6 7 7 typedef struct { 8 - #if IS_ENABLED(CONFIG_KVM_INTEL) 8 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 9 9 u8 kvm_cpu_l1tf_flush_l1d; 10 10 #endif 11 11 unsigned int __nmi_count; /* arch dependent */ ··· 68 68 DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); 69 69 #define local_softirq_pending_ref __softirq_pending 70 70 71 - #if IS_ENABLED(CONFIG_KVM_INTEL) 71 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 72 72 /* 73 73 * This function is called from noinstr interrupt contexts 74 74 * and must be inlined to not get instrumentation.

+1 -6

arch/x86/include/asm/kvm_host.h

··· 1055 1055 /* be preempted when it's in kernel-mode(cpl=0) */ 1056 1056 bool preempted_in_kernel; 1057 1057 1058 - /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ 1059 - bool l1tf_flush_l1d; 1060 - 1061 1058 /* Host CPU on which VM-entry was most recently attempted */ 1062 1059 int last_vmentry_cpu; 1063 1060 ··· 1453 1456 bool use_master_clock; 1454 1457 u64 master_kernel_ns; 1455 1458 u64 master_cycle_now; 1456 - struct delayed_work kvmclock_update_work; 1457 - struct delayed_work kvmclock_sync_work; 1458 1459 1459 1460 #ifdef CONFIG_KVM_HYPERV 1460 1461 struct kvm_hv hyperv; ··· 2162 2167 void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); 2163 2168 2164 2169 void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); 2170 + void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); 2165 2171 2166 2172 void kvm_enable_efer_bits(u64); 2167 2173 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); ··· 2374 2378 int kvm_add_user_return_msr(u32 msr); 2375 2379 int kvm_find_user_return_msr(u32 msr); 2376 2380 int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); 2377 - void kvm_user_return_msr_update_cache(unsigned int index, u64 val); 2378 2381 u64 kvm_get_user_return_msr(unsigned int slot); 2379 2382 2380 2383 static inline bool kvm_is_supported_user_return_msr(u32 msr)

+15 -15

arch/x86/include/asm/nospec-branch.h

··· 308 308 * CFLAGS.ZF. 309 309 * Note: Only the memory operand variant of VERW clears the CPU buffers. 310 310 */ 311 - .macro __CLEAR_CPU_BUFFERS feature 312 311 #ifdef CONFIG_X86_64 313 - ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature 312 + #define VERW verw x86_verw_sel(%rip) 314 313 #else 315 - /* 316 - * In 32bit mode, the memory operand must be a %cs reference. The data 317 - * segments may not be usable (vm86 mode), and the stack segment may not 318 - * be flat (ESPFIX32). 319 - */ 320 - ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature 314 + /* 315 + * In 32bit mode, the memory operand must be a %cs reference. The data segments 316 + * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32). 317 + */ 318 + #define VERW verw %cs:x86_verw_sel 321 319 #endif 322 - .endm 323 320 321 + /* 322 + * Provide a stringified VERW macro for simple usage, and a non-stringified 323 + * VERW macro for use in more elaborate sequences, e.g. to encode a conditional 324 + * VERW within an ALTERNATIVE. 325 + */ 326 + #define __CLEAR_CPU_BUFFERS __stringify(VERW) 327 + 328 + /* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. */ 324 329 #define CLEAR_CPU_BUFFERS \ 325 - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF 326 - 327 - #define VM_CLEAR_CPU_BUFFERS \ 328 - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM 330 + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF 329 331 330 332 #ifdef CONFIG_X86_64 331 333 .macro CLEAR_BRANCH_HISTORY ··· 581 579 DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear); 582 580 583 581 DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); 584 - 585 - DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); 586 582 587 583 extern u16 x86_verw_sel; 588 584

+9 -13

arch/x86/kernel/cpu/bugs.c

··· 192 192 */ 193 193 DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); 194 194 195 - /* 196 - * Controls CPU Fill buffer clear before VMenter. This is a subset of 197 - * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only 198 - * mitigation is required. 199 - */ 200 - DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); 201 - EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); 202 - 203 195 #undef pr_fmt 204 196 #define pr_fmt(fmt) "mitigations: " fmt 205 197 ··· 481 489 IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; 482 490 483 491 /* 484 - * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing 485 - * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. 492 + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to 493 + * userspace *and* on entry to KVM guests. 486 494 */ 487 495 static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; 488 496 ··· 528 536 if (mds_mitigation == MDS_MITIGATION_FULL || 529 537 mds_mitigation == MDS_MITIGATION_VMWERV) { 530 538 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 539 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 531 540 if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && 532 541 (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) 533 542 cpu_smt_disable(false); ··· 640 647 * present on host, enable the mitigation for UCODE_NEEDED as well. 
641 648 */ 642 649 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 650 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 643 651 644 652 if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) 645 653 cpu_smt_disable(false); ··· 742 748 */ 743 749 if (verw_clear_cpu_buf_mitigation_selected) { 744 750 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 745 - static_branch_disable(&cpu_buf_vm_clear); 751 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 746 752 } else { 747 - static_branch_enable(&cpu_buf_vm_clear); 753 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO); 748 754 } 749 755 750 756 /* ··· 833 839 834 840 static void __init rfds_apply_mitigation(void) 835 841 { 836 - if (rfds_mitigation == RFDS_MITIGATION_VERW) 842 + if (rfds_mitigation == RFDS_MITIGATION_VERW) { 837 843 setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); 844 + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); 845 + } 838 846 } 839 847 840 848 static __init int rfds_parse_cmdline(char *str)

+223 -96

arch/x86/kvm/emulate.c

··· 81 81 */ 82 82 83 83 /* Operand sizes: 8-bit operands or specified/overridden size. */ 84 - #define ByteOp (1<<0) /* 8-bit operands. */ 85 - /* Destination operand type. */ 86 - #define DstShift 1 84 + #define ByteOp (1<<0) /* 8-bit operands. */ 85 + #define DstShift 1 /* Destination operand type at bits 1-5 */ 87 86 #define ImplicitOps (OpImplicit << DstShift) 88 87 #define DstReg (OpReg << DstShift) 89 88 #define DstMem (OpMem << DstShift) ··· 94 95 #define DstDX (OpDX << DstShift) 95 96 #define DstAccLo (OpAccLo << DstShift) 96 97 #define DstMask (OpMask << DstShift) 97 - /* Source operand type. */ 98 - #define SrcShift 6 98 + #define SrcShift 6 /* Source operand type at bits 6-10 */ 99 99 #define SrcNone (OpNone << SrcShift) 100 100 #define SrcReg (OpReg << SrcShift) 101 101 #define SrcMem (OpMem << SrcShift) ··· 117 119 #define SrcAccHi (OpAccHi << SrcShift) 118 120 #define SrcMask (OpMask << SrcShift) 119 121 #define BitOp (1<<11) 120 - #define MemAbs (1<<12) /* Memory operand is absolute displacement */ 122 + #define MemAbs (1<<12) /* Memory operand is absolute displacement */ 121 123 #define String (1<<13) /* String instruction (rep capable) */ 122 124 #define Stack (1<<14) /* Stack instruction (push/pop) */ 123 - #define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ 125 + #define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */ 124 126 #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ 125 127 #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ 126 128 #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ ··· 129 131 #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ 130 132 #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ 131 133 #define Sse (1<<18) /* SSE Vector instruction */ 132 - /* Generic ModRM decode. */ 133 - #define ModRM (1<<19) 134 - /* Destination is only written; never read. 
*/ 135 - #define Mov (1<<20) 136 - /* Misc flags */ 134 + #define ModRM (1<<19) /* Generic ModRM decode. */ 135 + #define Mov (1<<20) /* Destination is only written; never read. */ 137 136 #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ 138 137 #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ 139 138 #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ ··· 138 143 #define Undefined (1<<25) /* No Such Instruction */ 139 144 #define Lock (1<<26) /* lock prefix is allowed for the instruction */ 140 145 #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 141 - #define No64 (1<<28) 146 + #define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */ 142 147 #define PageTable (1 << 29) /* instruction used to write page table */ 143 148 #define NotImpl (1 << 30) /* instruction is not implemented */ 144 - /* Source 2 operand type */ 145 - #define Src2Shift (31) 149 + #define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */ 150 + #define Src2Shift (32) /* Source 2 operand type at bits 32-36 */ 146 151 #define Src2None (OpNone << Src2Shift) 147 152 #define Src2Mem (OpMem << Src2Shift) 148 153 #define Src2CL (OpCL << Src2Shift) ··· 156 161 #define Src2FS (OpFS << Src2Shift) 157 162 #define Src2GS (OpGS << Src2Shift) 158 163 #define Src2Mask (OpMask << Src2Shift) 164 + /* free: 37-39 */ 159 165 #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ 160 - #define AlignMask ((u64)7 << 41) 166 + #define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */ 161 167 #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ 162 168 #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ 163 - #define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ 164 - #define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ 169 + #define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. 
FXSAVE) */ 170 + /* free: 43-44 */ 165 171 #define NoWrite ((u64)1 << 45) /* No writeback */ 166 172 #define SrcWrite ((u64)1 << 46) /* Write back src operand */ 167 173 #define NoMod ((u64)1 << 47) /* Mod field is ignored */ ··· 237 241 X86_TRANSFER_CALL_JMP, 238 242 X86_TRANSFER_RET, 239 243 X86_TRANSFER_TASK_SWITCH, 244 + }; 245 + 246 + enum rex_bits { 247 + REX_B = 1, 248 + REX_X = 2, 249 + REX_R = 4, 250 + REX_W = 8, 240 251 }; 241 252 242 253 static void writeback_registers(struct x86_emulate_ctxt *ctxt) ··· 625 622 626 623 switch (alignment) { 627 624 case Unaligned: 628 - case Avx: 629 625 return 1; 630 626 case Aligned16: 631 627 return 16; ··· 926 924 int byteop) 927 925 { 928 926 void *p; 929 - int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; 927 + int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; 930 928 931 929 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) 932 930 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; ··· 1032 1030 op->val = *(u64 *)op->addr.reg; 1033 1031 break; 1034 1032 } 1033 + op->orig_val = op->val; 1035 1034 } 1036 1035 1037 1036 static int em_fninit(struct x86_emulate_ctxt *ctxt) ··· 1078 1075 return X86EMUL_CONTINUE; 1079 1076 } 1080 1077 1081 - static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1082 - struct operand *op) 1078 + static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, 1079 + struct operand *op, int reg) 1083 1080 { 1084 - unsigned int reg; 1085 - 1086 - if (ctxt->d & ModRM) 1087 - reg = ctxt->modrm_reg; 1088 - else 1089 - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); 1090 - 1091 - if (ctxt->d & Sse) { 1081 + if ((ctxt->d & Avx) && ctxt->op_bytes == 32) { 1082 + op->type = OP_YMM; 1083 + op->bytes = 32; 1084 + op->addr.xmm = reg; 1085 + kvm_read_avx_reg(reg, &op->vec_val2); 1086 + return; 1087 + } 1088 + if (ctxt->d & (Avx|Sse)) { 1092 1089 op->type = OP_XMM; 1093 1090 op->bytes = 16; 1094 1091 op->addr.xmm = reg; ··· 1106 1103 op->type = OP_REG; 1107 
1104 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1108 1105 op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); 1109 - 1110 1106 fetch_register_operand(op); 1111 - op->orig_val = op->val; 1107 + } 1108 + 1109 + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1110 + struct operand *op) 1111 + { 1112 + unsigned int reg; 1113 + 1114 + if (ctxt->d & ModRM) 1115 + reg = ctxt->modrm_reg; 1116 + else 1117 + reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0); 1118 + 1119 + __decode_register_operand(ctxt, op, reg); 1112 1120 } 1113 1121 1114 1122 static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) ··· 1136 1122 int rc = X86EMUL_CONTINUE; 1137 1123 ulong modrm_ea = 0; 1138 1124 1139 - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ 1140 - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ 1141 - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ 1125 + ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0); 1126 + index_reg = (ctxt->rex_bits & REX_X ? 8 : 0); 1127 + base_reg = (ctxt->rex_bits & REX_B ? 8 : 0); 1142 1128 1143 1129 ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; 1144 1130 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ··· 1146 1132 ctxt->modrm_seg = VCPU_SREG_DS; 1147 1133 1148 1134 if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { 1149 - op->type = OP_REG; 1150 - op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; 1151 - op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1152 - ctxt->d & ByteOp); 1153 - if (ctxt->d & Sse) { 1154 - op->type = OP_XMM; 1155 - op->bytes = 16; 1156 - op->addr.xmm = ctxt->modrm_rm; 1157 - kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); 1158 - return rc; 1159 - } 1160 - if (ctxt->d & Mmx) { 1161 - op->type = OP_MM; 1162 - op->bytes = 8; 1163 - op->addr.mm = ctxt->modrm_rm & 7; 1164 - return rc; 1165 - } 1166 - fetch_register_operand(op); 1135 + __decode_register_operand(ctxt, op, ctxt->modrm_rm); 1167 1136 return rc; 1168 1137 } 1169 1138 ··· 1780 1783 op->data, 1781 1784 op->bytes * op->count); 1782 1785 case OP_XMM: 1783 - kvm_write_sse_reg(op->addr.xmm, &op->vec_val); 1786 + if (!(ctxt->d & Avx)) { 1787 + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); 1788 + break; 1789 + } 1790 + /* full YMM write but with high bytes cleared */ 1791 + memset(op->valptr + 16, 0, 16); 1792 + fallthrough; 1793 + case OP_YMM: 1794 + kvm_write_avx_reg(op->addr.xmm, &op->vec_val2); 1784 1795 break; 1785 1796 case OP_MM: 1786 1797 kvm_write_mmx_reg(op->addr.mm, &op->mm_val); ··· 2471 2466 2472 2467 setup_syscalls_segments(&cs, &ss); 2473 2468 2474 - if ((ctxt->rex_prefix & 0x8) != 0x0) 2469 + if (ctxt->rex_bits & REX_W) 2475 2470 usermode = X86EMUL_MODE_PROT64; 2476 2471 else 2477 2472 usermode = X86EMUL_MODE_PROT32; ··· 3963 3958 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3964 3959 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3965 3960 3961 + static const struct opcode ud = I(SrcNone, emulate_ud); 3962 + 3966 3963 static const struct opcode group7_rm0[] = { 3967 3964 N, 3968 3965 I(SrcNone | Priv | EmulateOnUD, em_hypercall), ··· 4121 4114 } }; 4122 4115 4123 4116 static const struct gprefix pfx_0f_6f_0f_7f = { 4124 - I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 4117 + I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov), 4125 4118 }; 4126 4119 4127 4120 static const struct 
instr_dual instr_dual_0f_2b = { ··· 4140 4133 I(Aligned, em_mov), I(Aligned, em_mov), N, N, 4141 4134 }; 4142 4135 4143 - static const struct gprefix pfx_0f_e7 = { 4144 - N, I(Sse, em_mov), N, N, 4136 + static const struct gprefix pfx_0f_e7_0f_38_2a = { 4137 + N, I(Sse | Avx, em_mov), N, N, 4145 4138 }; 4146 4139 4147 4140 static const struct escape escape_d9 = { { ··· 4354 4347 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 4355 4348 N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, 4356 4349 /* 0x10 - 0x1F */ 4357 - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), 4358 - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), 4350 + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11), 4351 + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11), 4359 4352 N, N, N, N, N, N, 4360 4353 D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ 4361 4354 D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, ··· 4371 4364 IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, 4372 4365 check_dr_write), 4373 4366 N, N, N, N, 4374 - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), 4375 - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), 4376 - N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), 4367 + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29), 4368 + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29), 4369 + N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b), 4377 4370 N, N, N, N, 4378 4371 /* 0x30 - 0x3F */ 4379 4372 II(ImplicitOps | Priv, em_wrmsr, wrmsr), ··· 4438 4431 /* 0xD0 - 0xDF */ 4439 4432 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 4440 4433 /* 0xE0 - 0xEF */ 4441 - N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), 4434 + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a), 4442 4435 N, N, N, N, N, N, N, N, 4443 4436 /* 0xF0 - 0xFF */ 4444 4437 N, N, N, N, N, 
N, N, N, N, N, N, N, N, N, N, N ··· 4465 4458 * byte. 4466 4459 */ 4467 4460 static const struct opcode opcode_map_0f_38[256] = { 4468 - /* 0x00 - 0x7f */ 4469 - X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), 4461 + /* 0x00 - 0x1f */ 4462 + X16(N), X16(N), 4463 + /* 0x20 - 0x2f */ 4464 + X8(N), 4465 + X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, 4466 + /* 0x30 - 0x7f */ 4467 + X16(N), X16(N), X16(N), X16(N), X16(N), 4470 4468 /* 0x80 - 0xef */ 4471 4469 X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), 4472 4470 /* 0xf0 - 0xf1 */ ··· 4630 4618 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4631 4619 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); 4632 4620 fetch_register_operand(op); 4633 - op->orig_val = op->val; 4634 4621 break; 4635 4622 case OpAccLo: 4636 4623 op->type = OP_REG; 4637 4624 op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; 4638 4625 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); 4639 4626 fetch_register_operand(op); 4640 - op->orig_val = op->val; 4641 4627 break; 4642 4628 case OpAccHi: 4643 4629 if (ctxt->d & ByteOp) { ··· 4646 4636 op->bytes = ctxt->op_bytes; 4647 4637 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); 4648 4638 fetch_register_operand(op); 4649 - op->orig_val = op->val; 4650 4639 break; 4651 4640 case OpDI: 4652 4641 op->type = OP_MEM; ··· 4764 4755 return rc; 4765 4756 } 4766 4757 4758 + static int x86_decode_avx(struct x86_emulate_ctxt *ctxt, 4759 + u8 vex_1st, u8 vex_2nd, struct opcode *opcode) 4760 + { 4761 + u8 vex_3rd, map, pp, l, v; 4762 + int rc = X86EMUL_CONTINUE; 4763 + 4764 + if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix) 4765 + goto ud; 4766 + 4767 + if (vex_1st == 0xc5) { 4768 + /* Expand RVVVVlpp to VEX3 format */ 4769 + vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */ 4770 + vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */ 4771 + } else { 4772 + vex_3rd = insn_fetch(u8, ctxt); 4773 + } 4774 + 4775 + /* 
vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */ 4776 + vex_2nd ^= 0xE0; /* binary 11100000 */ 4777 + vex_3rd ^= 0x78; /* binary 01111000 */ 4778 + 4779 + ctxt->rex_prefix = REX_PREFIX; 4780 + ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */ 4781 + ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */ 4782 + if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64) 4783 + goto ud; 4784 + 4785 + map = vex_2nd & 0x1f; 4786 + v = (vex_3rd >> 3) & 0xf; 4787 + l = vex_3rd & 0x4; 4788 + pp = vex_3rd & 0x3; 4789 + 4790 + ctxt->b = insn_fetch(u8, ctxt); 4791 + switch (map) { 4792 + case 1: 4793 + ctxt->opcode_len = 2; 4794 + *opcode = twobyte_table[ctxt->b]; 4795 + break; 4796 + case 2: 4797 + ctxt->opcode_len = 3; 4798 + *opcode = opcode_map_0f_38[ctxt->b]; 4799 + break; 4800 + case 3: 4801 + /* no 0f 3a instructions are supported yet */ 4802 + return X86EMUL_UNHANDLEABLE; 4803 + default: 4804 + goto ud; 4805 + } 4806 + 4807 + /* 4808 + * No three operand instructions are supported yet; those that 4809 + * *are* marked with the Avx flag reserve the VVVV flag. 
4810 + */ 4811 + if (v) 4812 + goto ud; 4813 + 4814 + if (l) 4815 + ctxt->op_bytes = 32; 4816 + else 4817 + ctxt->op_bytes = 16; 4818 + 4819 + switch (pp) { 4820 + case 0: break; 4821 + case 1: ctxt->op_prefix = true; break; 4822 + case 2: ctxt->rep_prefix = 0xf3; break; 4823 + case 3: ctxt->rep_prefix = 0xf2; break; 4824 + } 4825 + 4826 + done: 4827 + return rc; 4828 + ud: 4829 + *opcode = ud; 4830 + return rc; 4831 + } 4832 + 4767 4833 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) 4768 4834 { 4769 4835 int rc = X86EMUL_CONTINUE; 4770 4836 int mode = ctxt->mode; 4771 4837 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 4772 - bool op_prefix = false; 4838 + bool vex_prefix = false; 4773 4839 bool has_seg_override = false; 4774 4840 struct opcode opcode; 4775 4841 u16 dummy; ··· 4896 4812 for (;;) { 4897 4813 switch (ctxt->b = insn_fetch(u8, ctxt)) { 4898 4814 case 0x66: /* operand-size override */ 4899 - op_prefix = true; 4815 + ctxt->op_prefix = true; 4900 4816 /* switch between 2/4 bytes */ 4901 4817 ctxt->op_bytes = def_op_bytes ^ 6; 4902 4818 break; ··· 4935 4851 case 0x40 ... 0x4f: /* REX */ 4936 4852 if (mode != X86EMUL_MODE_PROT64) 4937 4853 goto done_prefixes; 4938 - ctxt->rex_prefix = ctxt->b; 4854 + ctxt->rex_prefix = REX_PREFIX; 4855 + ctxt->rex_bits = ctxt->b & 0xf; 4939 4856 continue; 4940 4857 case 0xf0: /* LOCK */ 4941 4858 ctxt->lock_prefix = 1; ··· 4950 4865 } 4951 4866 4952 4867 /* Any legacy prefix after a REX prefix nullifies its effect. */ 4953 - 4954 - ctxt->rex_prefix = 0; 4868 + ctxt->rex_prefix = REX_NONE; 4869 + ctxt->rex_bits = 0; 4955 4870 } 4956 4871 4957 4872 done_prefixes: 4958 4873 4959 4874 /* REX prefix. */ 4960 - if (ctxt->rex_prefix & 8) 4961 - ctxt->op_bytes = 8; /* REX.W */ 4875 + if (ctxt->rex_bits & REX_W) 4876 + ctxt->op_bytes = 8; 4962 4877 4963 4878 /* Opcode byte(s). */ 4964 - opcode = opcode_table[ctxt->b]; 4965 - /* Two-byte opcode? 
*/ 4966 - if (ctxt->b == 0x0f) { 4879 + if (ctxt->b == 0xc4 || ctxt->b == 0xc5) { 4880 + /* VEX or LDS/LES */ 4881 + u8 vex_2nd = insn_fetch(u8, ctxt); 4882 + if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) { 4883 + opcode = opcode_table[ctxt->b]; 4884 + ctxt->modrm = vex_2nd; 4885 + /* the Mod/RM byte has been fetched already! */ 4886 + goto done_modrm; 4887 + } 4888 + 4889 + vex_prefix = true; 4890 + rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode); 4891 + if (rc != X86EMUL_CONTINUE) 4892 + goto done; 4893 + } else if (ctxt->b == 0x0f) { 4894 + /* Two- or three-byte opcode */ 4967 4895 ctxt->opcode_len = 2; 4968 4896 ctxt->b = insn_fetch(u8, ctxt); 4969 4897 opcode = twobyte_table[ctxt->b]; ··· 4987 4889 ctxt->b = insn_fetch(u8, ctxt); 4988 4890 opcode = opcode_map_0f_38[ctxt->b]; 4989 4891 } 4892 + } else { 4893 + /* Opcode byte(s). */ 4894 + opcode = opcode_table[ctxt->b]; 4990 4895 } 4991 - ctxt->d = opcode.flags; 4992 4896 4993 - if (ctxt->d & ModRM) 4897 + if (opcode.flags & ModRM) 4994 4898 ctxt->modrm = insn_fetch(u8, ctxt); 4995 4899 4996 - /* vex-prefix instructions are not implemented */ 4997 - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && 4998 - (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { 4999 - ctxt->d = NotImpl; 5000 - } 5001 - 4900 + done_modrm: 4901 + ctxt->d = opcode.flags; 5002 4902 while (ctxt->d & GroupMask) { 5003 4903 switch (ctxt->d & GroupMask) { 5004 4904 case Group: ··· 5015 4919 opcode = opcode.u.group[goffset]; 5016 4920 break; 5017 4921 case Prefix: 5018 - if (ctxt->rep_prefix && op_prefix) 4922 + if (ctxt->rep_prefix && ctxt->op_prefix) 5019 4923 return EMULATION_FAILED; 5020 - simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; 4924 + simd_prefix = ctxt->op_prefix ? 
0x66 : ctxt->rep_prefix; 5021 4925 switch (simd_prefix) { 5022 4926 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 5023 4927 case 0x66: opcode = opcode.u.gprefix->pfx_66; break; ··· 5061 4965 /* Unrecognised? */ 5062 4966 if (ctxt->d == 0) 5063 4967 return EMULATION_FAILED; 4968 + 4969 + if (unlikely(vex_prefix)) { 4970 + /* 4971 + * Only specifically marked instructions support VEX. Since many 4972 + * instructions support it but are not annotated, return not implemented 4973 + * rather than #UD. 4974 + */ 4975 + if (!(ctxt->d & Avx)) 4976 + return EMULATION_FAILED; 4977 + 4978 + if (!(ctxt->d & AlignMask)) 4979 + ctxt->d |= Unaligned; 4980 + } 5064 4981 5065 4982 ctxt->execute = opcode.u.execute; 5066 4983 ··· 5145 5036 if ((ctxt->d & No16) && ctxt->op_bytes == 2) 5146 5037 ctxt->op_bytes = 4; 5147 5038 5148 - if (ctxt->d & Sse) 5149 - ctxt->op_bytes = 16; 5039 + if (vex_prefix) 5040 + ; 5041 + else if (ctxt->d & Sse) 5042 + ctxt->op_bytes = 16, ctxt->d &= ~Avx; 5150 5043 else if (ctxt->d & Mmx) 5151 5044 ctxt->op_bytes = 8; 5152 5045 } ··· 5248 5137 { 5249 5138 /* Clear fields that are set conditionally but read without a guard. 
*/ 5250 5139 ctxt->rip_relative = false; 5251 - ctxt->rex_prefix = 0; 5140 + ctxt->rex_prefix = REX_NONE; 5141 + ctxt->rex_bits = 0; 5252 5142 ctxt->lock_prefix = 0; 5143 + ctxt->op_prefix = false; 5253 5144 ctxt->rep_prefix = 0; 5254 5145 ctxt->regs_valid = 0; 5255 5146 ctxt->regs_dirty = 0; ··· 5281 5168 } 5282 5169 5283 5170 if (unlikely(ctxt->d & 5284 - (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { 5171 + (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { 5285 5172 if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || 5286 5173 (ctxt->d & Undefined)) { 5287 5174 rc = emulate_ud(ctxt); 5288 5175 goto done; 5289 5176 } 5290 5177 5291 - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) 5292 - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { 5178 + if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) { 5293 5179 rc = emulate_ud(ctxt); 5294 5180 goto done; 5295 5181 } 5296 5182 5297 - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 5183 + if (ctxt->d & Avx) { 5184 + u64 xcr = 0; 5185 + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE) 5186 + || ops->get_xcr(ctxt, 0, &xcr) 5187 + || !(xcr & XFEATURE_MASK_YMM)) { 5188 + rc = emulate_ud(ctxt); 5189 + goto done; 5190 + } 5191 + } else if (ctxt->d & Sse) { 5192 + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) { 5193 + rc = emulate_ud(ctxt); 5194 + goto done; 5195 + } 5196 + } 5197 + 5198 + if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { 5298 5199 rc = emulate_nm(ctxt); 5299 5200 goto done; 5300 5201 }

+66

arch/x86/kvm/fpu.h

··· 15 15 #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) 16 16 #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) 17 17 18 + typedef u32 __attribute__((vector_size(32))) avx256_t; 19 + 20 + static inline void _kvm_read_avx_reg(int reg, avx256_t *data) 21 + { 22 + switch (reg) { 23 + case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break; 24 + case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break; 25 + case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break; 26 + case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break; 27 + case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break; 28 + case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break; 29 + case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break; 30 + case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break; 31 + #ifdef CONFIG_X86_64 32 + case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break; 33 + case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break; 34 + case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break; 35 + case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break; 36 + case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break; 37 + case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break; 38 + case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break; 39 + case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break; 40 + #endif 41 + default: BUG(); 42 + } 43 + } 44 + 45 + static inline void _kvm_write_avx_reg(int reg, const avx256_t *data) 46 + { 47 + switch (reg) { 48 + case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break; 49 + case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break; 50 + case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break; 51 + case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break; 52 + case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break; 53 + case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break; 54 + case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break; 55 + case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break; 56 + 
#ifdef CONFIG_X86_64 57 + case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break; 58 + case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break; 59 + case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break; 60 + case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break; 61 + case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break; 62 + case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break; 63 + case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break; 64 + case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break; 65 + #endif 66 + default: BUG(); 67 + } 68 + } 69 + 18 70 static inline void _kvm_read_sse_reg(int reg, sse128_t *data) 19 71 { 20 72 switch (reg) { ··· 159 107 static inline void kvm_fpu_put(void) 160 108 { 161 109 fpregs_unlock(); 110 + } 111 + 112 + static inline void kvm_read_avx_reg(int reg, avx256_t *data) 113 + { 114 + kvm_fpu_get(); 115 + _kvm_read_avx_reg(reg, data); 116 + kvm_fpu_put(); 117 + } 118 + 119 + static inline void kvm_write_avx_reg(int reg, const avx256_t *data) 120 + { 121 + kvm_fpu_get(); 122 + _kvm_write_avx_reg(reg, data); 123 + kvm_fpu_put(); 162 124 } 163 125 164 126 static inline void kvm_read_sse_reg(int reg, sse128_t *data)

+1 -1

arch/x86/kvm/hyperv.c

··· 1568 1568 * only, there can be valuable data in the rest which needs 1569 1569 * to be preserved e.g. on migration. 1570 1570 */ 1571 - if (__put_user(0, (u32 __user *)addr)) 1571 + if (put_user(0, (u32 __user *)addr)) 1572 1572 return 1; 1573 1573 hv_vcpu->hv_vapic = data; 1574 1574 kvm_vcpu_mark_page_dirty(vcpu, gfn);

+16 -4

arch/x86/kvm/kvm_emulate.h

··· 237 237 bool (*is_smm)(struct x86_emulate_ctxt *ctxt); 238 238 int (*leave_smm)(struct x86_emulate_ctxt *ctxt); 239 239 void (*triple_fault)(struct x86_emulate_ctxt *ctxt); 240 + int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); 240 241 int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); 241 242 242 243 gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, ··· 249 248 250 249 /* Type, address-of, and value of an instruction's operand. */ 251 250 struct operand { 252 - enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; 251 + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type; 253 252 unsigned int bytes; 254 253 unsigned int count; 255 254 union { ··· 268 267 union { 269 268 unsigned long val; 270 269 u64 val64; 271 - char valptr[sizeof(sse128_t)]; 270 + char valptr[sizeof(avx256_t)]; 272 271 sse128_t vec_val; 272 + avx256_t vec_val2; 273 273 u64 mm_val; 274 274 void *data; 275 - }; 275 + } __aligned(32); 276 276 }; 277 277 278 278 #define X86_MAX_INSTRUCTION_LENGTH 15 ··· 319 317 #define NR_EMULATOR_GPRS 8 320 318 #endif 321 319 320 + /* 321 + * Distinguish between no prefix, REX, or in the future REX2. 322 + */ 323 + enum rex_type { 324 + REX_NONE, 325 + REX_PREFIX, 326 + }; 327 + 322 328 struct x86_emulate_ctxt { 323 329 void *vcpu; 324 330 const struct x86_emulate_ops *ops; ··· 358 348 u8 opcode_len; 359 349 u8 b; 360 350 u8 intercept; 351 + bool op_prefix; 361 352 u8 op_bytes; 362 353 u8 ad_bytes; 363 354 union { ··· 368 357 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 369 358 370 359 bool rip_relative; 371 - u8 rex_prefix; 360 + enum rex_type rex_prefix; 361 + u8 rex_bits; 372 362 u8 lock_prefix; 373 363 u8 rep_prefix; 374 364 /* bitmaps of registers in _regs[] that can be read */

+31 -13

arch/x86/kvm/lapic.c

··· 2126 2126 2127 2127 static void advance_periodic_target_expiration(struct kvm_lapic *apic) 2128 2128 { 2129 + struct kvm_timer *ktimer = &apic->lapic_timer; 2129 2130 ktime_t now = ktime_get(); 2130 2131 u64 tscl = rdtsc(); 2131 2132 ktime_t delta; 2132 2133 2133 2134 /* 2134 - * Synchronize both deadlines to the same time source or 2135 - * differences in the periods (caused by differences in the 2136 - * underlying clocks or numerical approximation errors) will 2137 - * cause the two to drift apart over time as the errors 2138 - * accumulate. 2135 + * Use kernel time as the time source for both the hrtimer deadline and 2136 + * TSC-based deadline so that they stay synchronized. Computing each 2137 + * deadline independently will cause the two deadlines to drift apart 2138 + * over time as differences in the periods accumulate, e.g. due to 2139 + * differences in the underlying clocks or numerical approximation errors. 2139 2140 */ 2140 - apic->lapic_timer.target_expiration = 2141 - ktime_add_ns(apic->lapic_timer.target_expiration, 2142 - apic->lapic_timer.period); 2143 - delta = ktime_sub(apic->lapic_timer.target_expiration, now); 2144 - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + 2145 - nsec_to_cycles(apic->vcpu, delta); 2141 + ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration, 2142 + ktimer->period); 2143 + 2144 + /* 2145 + * If the new expiration is in the past, e.g. because userspace stopped 2146 + * running the VM for an extended duration, then force the expiration 2147 + * to "now" and don't try to play catch-up with the missed events. KVM 2148 + * will only deliver a single interrupt regardless of how many events 2149 + * are pending, i.e. restarting the timer with an expiration in the 2150 + * past will do nothing more than waste host cycles, and can even lead 2151 + * to a hard lockup in extreme cases. 
2152 + */ 2153 + if (ktime_before(ktimer->target_expiration, now)) 2154 + ktimer->target_expiration = now; 2155 + 2156 + /* 2157 + * Note, ensuring the expiration isn't in the past also prevents delta 2158 + * from going negative, which could cause the TSC deadline to become 2159 + * excessively large due to it being an unsigned value. 2160 + */ 2161 + delta = ktime_sub(ktimer->target_expiration, now); 2162 + ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + 2163 + nsec_to_cycles(apic->vcpu, delta); 2146 2164 } 2147 2165 2148 2166 static void start_sw_period(struct kvm_lapic *apic) ··· 2988 2970 2989 2971 apic_timer_expired(apic, true); 2990 2972 2991 - if (lapic_is_periodic(apic)) { 2973 + if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) { 2992 2974 advance_periodic_target_expiration(apic); 2993 - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 2975 + hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration); 2994 2976 return HRTIMER_RESTART; 2995 2977 } else 2996 2978 return HRTIMER_NORESTART;

-2

arch/x86/kvm/mmu.h

··· 235 235 return -(u32)fault & errcode; 236 236 } 237 237 238 - bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); 239 - 240 238 int kvm_mmu_post_init_vm(struct kvm *kvm); 241 239 void kvm_mmu_pre_destroy_vm(struct kvm *kvm); 242 240

+1 -1

arch/x86/kvm/mmu/mmu.c

··· 4859 4859 */ 4860 4860 BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); 4861 4861 4862 - vcpu->arch.l1tf_flush_l1d = true; 4862 + kvm_request_l1tf_flush_l1d(); 4863 4863 if (!flags) { 4864 4864 trace_kvm_page_fault(vcpu, fault_address, error_code); 4865 4865

+1 -1

arch/x86/kvm/mmu/paging_tmpl.h

··· 402 402 goto error; 403 403 404 404 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 405 - if (unlikely(__get_user(pte, ptep_user))) 405 + if (unlikely(get_user(pte, ptep_user))) 406 406 goto error; 407 407 walker->ptep_user[walker->level - 1] = ptep_user; 408 408

+1 -1

arch/x86/kvm/mmu/spte.c

··· 292 292 mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); 293 293 } 294 294 295 - if (static_branch_unlikely(&cpu_buf_vm_clear) && 295 + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 296 296 !kvm_vcpu_can_access_host_mmio(vcpu) && 297 297 kvm_is_mmio_pfn(pfn, &is_host_mmio)) 298 298 kvm_track_host_mmio_mapping(vcpu);

+10 -17

arch/x86/kvm/svm/svm.c

··· 3442 3442 3443 3443 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3444 3444 { 3445 - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3446 3445 dump_vmcb(vcpu); 3447 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3448 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3449 - vcpu->run->internal.ndata = 2; 3450 - vcpu->run->internal.data[0] = exit_code; 3451 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3446 + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); 3452 3447 return 0; 3453 3448 } 3454 3449 ··· 4246 4251 svm_set_dr6(vcpu, DR6_ACTIVE_LOW); 4247 4252 4248 4253 clgi(); 4249 - kvm_load_guest_xsave_state(vcpu); 4250 4254 4251 4255 /* 4252 4256 * Hardware only context switches DEBUGCTL if LBR virtualization is ··· 4288 4294 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4289 4295 update_debugctlmsr(vcpu->arch.host_debugctl); 4290 4296 4291 - kvm_load_host_xsave_state(vcpu); 4292 4297 stgi(); 4293 4298 4294 4299 /* Any pending NMI will happen here */ ··· 4318 4325 kvm_read_and_reset_apf_flags(); 4319 4326 4320 4327 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4321 - 4322 - /* 4323 - * We need to handle MC intercepts here before the vcpu has a chance to 4324 - * change the physical cpu 4325 - */ 4326 - if (unlikely(svm->vmcb->control.exit_code == 4327 - SVM_EXIT_EXCP_BASE + MC_VECTOR)) 4328 - svm_handle_mce(vcpu); 4329 4328 4330 4329 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4331 4330 ··· 4607 4622 4608 4623 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4609 4624 { 4610 - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) 4625 + switch (to_svm(vcpu)->vmcb->control.exit_code) { 4626 + case SVM_EXIT_EXCP_BASE + MC_VECTOR: 4627 + svm_handle_mce(vcpu); 4628 + break; 4629 + case SVM_EXIT_INTR: 4611 4630 vcpu->arch.at_instruction_boundary = true; 4631 + break; 4632 + default: 4633 + break; 4634 + } 4612 4635 } 4613 4636 4614 4637 static void 
svm_setup_mce(struct kvm_vcpu *vcpu)

+4 -2

arch/x86/kvm/svm/vmenter.S

··· 92 92 jmp 901b 93 93 .endm 94 94 95 + #define SVM_CLEAR_CPU_BUFFERS \ 96 + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM 95 97 96 98 /** 97 99 * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode ··· 172 170 mov VCPU_RDI(%_ASM_DI), %_ASM_DI 173 171 174 172 /* Clobbers EFLAGS.ZF */ 175 - VM_CLEAR_CPU_BUFFERS 173 + SVM_CLEAR_CPU_BUFFERS 176 174 177 175 /* Enter guest mode */ 178 176 3: vmrun %_ASM_AX ··· 341 339 mov KVM_VMCB_pa(%rax), %rax 342 340 343 341 /* Clobbers EFLAGS.ZF */ 344 - VM_CLEAR_CPU_BUFFERS 342 + SVM_CLEAR_CPU_BUFFERS 345 343 346 344 /* Enter guest mode */ 347 345 1: vmrun %rax

+1 -1

arch/x86/kvm/vmx/nested.c

··· 3880 3880 goto vmentry_failed; 3881 3881 3882 3882 /* Hide L1D cache contents from the nested guest. */ 3883 - vmx->vcpu.arch.l1tf_flush_l1d = true; 3883 + kvm_request_l1tf_flush_l1d(); 3884 3884 3885 3885 /* 3886 3886 * Must happen outside of nested_vmx_enter_non_root_mode() as it will

+3 -7

arch/x86/kvm/vmx/run_flags.h

··· 2 2 #ifndef __KVM_X86_VMX_RUN_FLAGS_H 3 3 #define __KVM_X86_VMX_RUN_FLAGS_H 4 4 5 - #define VMX_RUN_VMRESUME_SHIFT 0 6 - #define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 7 - #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2 8 - 9 - #define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) 10 - #define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) 11 - #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT) 5 + #define VMX_RUN_VMRESUME BIT(0) 6 + #define VMX_RUN_SAVE_SPEC_CTRL BIT(1) 7 + #define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) 12 8 13 9 #endif /* __KVM_X86_VMX_RUN_FLAGS_H */

+24 -41

arch/x86/kvm/vmx/tdx.c

··· 763 763 return tdx_vcpu_state_details_intr_pending(vcpu_state_details); 764 764 } 765 765 766 - /* 767 - * Compared to vmx_prepare_switch_to_guest(), there is not much to do 768 - * as SEAMCALL/SEAMRET calls take care of most of save and restore. 769 - */ 770 - void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 771 - { 772 - struct vcpu_vt *vt = to_vt(vcpu); 773 - 774 - if (vt->guest_state_loaded) 775 - return; 776 - 777 - if (likely(is_64bit_mm(current->mm))) 778 - vt->msr_host_kernel_gs_base = current->thread.gsbase; 779 - else 780 - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 781 - 782 - vt->guest_state_loaded = true; 783 - } 784 - 785 766 struct tdx_uret_msr { 786 767 u32 msr; 787 768 unsigned int slot; ··· 776 795 {.msr = MSR_TSC_AUX,}, 777 796 }; 778 797 779 - static void tdx_user_return_msr_update_cache(void) 798 + void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 780 799 { 800 + struct vcpu_vt *vt = to_vt(vcpu); 781 801 int i; 782 802 803 + if (vt->guest_state_loaded) 804 + return; 805 + 806 + if (likely(is_64bit_mm(current->mm))) 807 + vt->msr_host_kernel_gs_base = current->thread.gsbase; 808 + else 809 + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 810 + 811 + vt->guest_state_loaded = true; 812 + 813 + /* 814 + * Explicitly set user-return MSRs that are clobbered by the TDX-Module 815 + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be 816 + * written by the TDX-Module. Don't rely on the TDX-Module to actually 817 + * clobber the MSRs, as the contract is poorly defined and not upheld. 818 + * E.g. the TDX-Module will synthesize an EPT Violation without doing 819 + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM 820 + * state. 
821 + */ 783 822 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) 784 - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, 785 - tdx_uret_msrs[i].defval); 823 + kvm_set_user_return_msr(tdx_uret_msrs[i].slot, 824 + tdx_uret_msrs[i].defval, -1ull); 786 825 } 787 826 788 827 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) 789 828 { 790 829 struct vcpu_vt *vt = to_vt(vcpu); 791 - struct vcpu_tdx *tdx = to_tdx(vcpu); 792 830 793 831 if (!vt->guest_state_loaded) 794 832 return; 795 833 796 834 ++vcpu->stat.host_state_reload; 797 835 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); 798 - 799 - if (tdx->guest_entered) { 800 - tdx_user_return_msr_update_cache(); 801 - tdx->guest_entered = false; 802 - } 803 836 804 837 vt->guest_state_loaded = false; 805 838 } ··· 1054 1059 update_debugctlmsr(vcpu->arch.host_debugctl); 1055 1060 1056 1061 tdx_load_host_xsave_state(vcpu); 1057 - tdx->guest_entered = true; 1058 1062 1059 1063 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; 1060 1064 ··· 1062 1068 1063 1069 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) 1064 1070 return EXIT_FASTPATH_NONE; 1065 - 1066 - if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 1067 - kvm_machine_check(); 1068 1071 1069 1072 trace_kvm_exit(vcpu, KVM_ISA_VMX); 1070 1073 ··· 2136 2145 } 2137 2146 2138 2147 unhandled_exit: 2139 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2140 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 2141 - vcpu->run->internal.ndata = 2; 2142 - vcpu->run->internal.data[0] = vp_enter_ret; 2143 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 2148 + kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret); 2144 2149 return 0; 2145 2150 } 2146 2151 ··· 3434 3447 /* 3435 3448 * Check if MSRs (tdx_uret_msrs) can be saved/restored 3436 3449 * before returning to user space. 
3437 - * 3438 - * this_cpu_ptr(user_return_msrs)->registered isn't checked 3439 - * because the registration is done at vcpu runtime by 3440 - * tdx_user_return_msr_update_cache(). 3441 3450 */ 3442 3451 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); 3443 3452 if (tdx_uret_msrs[i].slot == -1) {

-1

arch/x86/kvm/vmx/tdx.h

··· 67 67 u64 vp_enter_ret; 68 68 69 69 enum vcpu_tdx_state state; 70 - bool guest_entered; 71 70 72 71 u64 map_gpa_next; 73 72 u64 map_gpa_end;

+18 -11

arch/x86/kvm/vmx/vmenter.S

··· 71 71 * @regs: unsigned long * (to guest registers) 72 72 * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH 73 73 * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl 74 + * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO 74 75 * 75 76 * Returns: 76 77 * 0 on VM-Exit, 1 on VM-Fail ··· 93 92 /* Save @vmx for SPEC_CTRL handling */ 94 93 push %_ASM_ARG1 95 94 96 - /* Save @flags for SPEC_CTRL handling */ 95 + /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */ 97 96 push %_ASM_ARG3 98 97 99 98 /* ··· 101 100 * @regs is needed after VM-Exit to save the guest's register values. 102 101 */ 103 102 push %_ASM_ARG2 104 - 105 - /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ 106 - mov %_ASM_ARG3L, %ebx 107 103 108 104 lea (%_ASM_SP), %_ASM_ARG2 109 105 call vmx_update_host_rsp ··· 135 137 /* Load @regs to RAX. */ 136 138 mov (%_ASM_SP), %_ASM_AX 137 139 138 - /* Check if vmlaunch or vmresume is needed */ 139 - bt $VMX_RUN_VMRESUME_SHIFT, %ebx 140 - 141 140 /* Load guest registers. Don't clobber flags. */ 142 141 mov VCPU_RCX(%_ASM_AX), %_ASM_CX 143 142 mov VCPU_RDX(%_ASM_AX), %_ASM_DX ··· 155 160 /* Load guest RAX. This kills the @regs pointer! */ 156 161 mov VCPU_RAX(%_ASM_AX), %_ASM_AX 157 162 158 - /* Clobbers EFLAGS.ZF */ 159 - CLEAR_CPU_BUFFERS 163 + /* 164 + * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is 165 + * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled, 166 + * check @flags to see if the vCPU has access to host MMIO, and if so, 167 + * do VERW. Else, do nothing (no mitigations needed/enabled). 168 + */ 169 + ALTERNATIVE_2 "", \ 170 + __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \ 171 + jz .Lskip_mmio_verw; \ 172 + VERW; \ 173 + .Lskip_mmio_verw:), \ 174 + X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \ 175 + __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM 160 176 161 - /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. 
*/ 162 - jnc .Lvmlaunch 177 + /* Check @flags to see if VMLAUNCH or VMRESUME is needed. */ 178 + testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) 179 + jz .Lvmlaunch 163 180 164 181 /* 165 182 * After a successful VMRESUME/VMLAUNCH, control flow "magically"

+134 -130

arch/x86/kvm/vmx/vmx.c

··· 203 203 204 204 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 205 205 206 + #ifdef CONFIG_CPU_MITIGATIONS 206 207 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 207 208 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 208 209 static DEFINE_MUTEX(vmx_l1d_flush_mutex); ··· 226 225 #define L1D_CACHE_ORDER 4 227 226 static void *vmx_l1d_flush_pages; 228 227 229 - static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 228 + static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 230 229 { 231 230 struct page *page; 232 231 unsigned int i; ··· 303 302 return 0; 304 303 } 305 304 305 + static int vmx_setup_l1d_flush(void) 306 + { 307 + /* 308 + * Hand the parameter mitigation value in which was stored in the pre 309 + * module init parser. If no parameter was given, it will contain 310 + * 'auto' which will be turned into the default 'cond' mitigation mode. 311 + */ 312 + return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); 313 + } 314 + 315 + static void vmx_cleanup_l1d_flush(void) 316 + { 317 + if (vmx_l1d_flush_pages) { 318 + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 319 + vmx_l1d_flush_pages = NULL; 320 + } 321 + /* Restore state so sysfs ignores VMX */ 322 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 323 + } 324 + 306 325 static int vmentry_l1d_flush_parse(const char *s) 307 326 { 308 327 unsigned int i; ··· 360 339 } 361 340 362 341 mutex_lock(&vmx_l1d_flush_mutex); 363 - ret = vmx_setup_l1d_flush(l1tf); 342 + ret = __vmx_setup_l1d_flush(l1tf); 364 343 mutex_unlock(&vmx_l1d_flush_mutex); 365 344 return ret; 366 345 } ··· 372 351 373 352 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 374 353 } 354 + 355 + /* 356 + * Software based L1D cache flush which is used when microcode providing 357 + * the cache control MSR is not loaded. 
358 + * 359 + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 360 + * flush it is required to read in 64 KiB because the replacement algorithm 361 + * is not exactly LRU. This could be sized at runtime via topology 362 + * information but as all relevant affected CPUs have 32KiB L1D cache size 363 + * there is no point in doing so. 364 + */ 365 + static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 366 + { 367 + int size = PAGE_SIZE << L1D_CACHE_ORDER; 368 + 369 + if (!static_branch_unlikely(&vmx_l1d_should_flush)) 370 + return; 371 + 372 + /* 373 + * This code is only executed when the flush mode is 'cond' or 374 + * 'always' 375 + */ 376 + if (static_branch_likely(&vmx_l1d_flush_cond)) { 377 + /* 378 + * Clear the per-cpu flush bit, it gets set again if the vCPU 379 + * is reloaded, i.e. if the vCPU is scheduled out or if KVM 380 + * exits to userspace, or if KVM reaches one of the unsafe 381 + * VMEXIT handlers, e.g. if KVM calls into the emulator, 382 + * or from the interrupt handlers. 
383 + */ 384 + if (!kvm_get_cpu_l1tf_flush_l1d()) 385 + return; 386 + kvm_clear_cpu_l1tf_flush_l1d(); 387 + } 388 + 389 + vcpu->stat.l1d_flush++; 390 + 391 + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 392 + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 393 + return; 394 + } 395 + 396 + asm volatile( 397 + /* First ensure the pages are in the TLB */ 398 + "xorl %%eax, %%eax\n" 399 + ".Lpopulate_tlb:\n\t" 400 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 401 + "addl $4096, %%eax\n\t" 402 + "cmpl %%eax, %[size]\n\t" 403 + "jne .Lpopulate_tlb\n\t" 404 + "xorl %%eax, %%eax\n\t" 405 + "cpuid\n\t" 406 + /* Now fill the cache */ 407 + "xorl %%eax, %%eax\n" 408 + ".Lfill_cache:\n" 409 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 410 + "addl $64, %%eax\n\t" 411 + "cmpl %%eax, %[size]\n\t" 412 + "jne .Lfill_cache\n\t" 413 + "lfence\n" 414 + :: [flush_pages] "r" (vmx_l1d_flush_pages), 415 + [size] "r" (size) 416 + : "eax", "ebx", "ecx", "edx"); 417 + } 418 + 419 + #else /* CONFIG_CPU_MITIGATIONS*/ 420 + static int vmx_setup_l1d_flush(void) 421 + { 422 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; 423 + return 0; 424 + } 425 + static void vmx_cleanup_l1d_flush(void) 426 + { 427 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 428 + } 429 + static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) 430 + { 431 + 432 + } 433 + static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 434 + { 435 + pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); 436 + return 0; 437 + } 438 + static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 439 + { 440 + return sysfs_emit(s, "never\n"); 441 + } 442 + #endif 443 + 444 + static const struct kernel_param_ops vmentry_l1d_flush_ops = { 445 + .set = vmentry_l1d_flush_set, 446 + .get = vmentry_l1d_flush_get, 447 + }; 448 + module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 375 449 376 450 static __always_inline void 
vmx_disable_fb_clear(struct vcpu_vmx *vmx) 377 451 { ··· 519 403 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 520 404 vmx->disable_fb_clear = false; 521 405 } 522 - 523 - static const struct kernel_param_ops vmentry_l1d_flush_ops = { 524 - .set = vmentry_l1d_flush_set, 525 - .get = vmentry_l1d_flush_get, 526 - }; 527 - module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 528 406 529 407 static u32 vmx_segment_access_rights(struct kvm_segment *var); 530 408 ··· 1013 903 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 1014 904 flags |= VMX_RUN_SAVE_SPEC_CTRL; 1015 905 1016 - if (static_branch_unlikely(&cpu_buf_vm_clear) && 906 + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && 1017 907 kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) 1018 908 flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; 1019 909 ··· 6741 6631 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6742 6632 6743 6633 unexpected_vmexit: 6744 - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6745 - exit_reason.full); 6746 6634 dump_vmcs(vcpu); 6747 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6748 - vcpu->run->internal.suberror = 6749 - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6750 - vcpu->run->internal.ndata = 2; 6751 - vcpu->run->internal.data[0] = exit_reason.full; 6752 - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6635 + kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); 6753 6636 return 0; 6754 6637 } 6755 6638 ··· 6762 6659 return 0; 6763 6660 } 6764 6661 return ret; 6765 - } 6766 - 6767 - /* 6768 - * Software based L1D cache flush which is used when microcode providing 6769 - * the cache control MSR is not loaded. 6770 - * 6771 - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6772 - * flush it is required to read in 64 KiB because the replacement algorithm 6773 - * is not exactly LRU. 
This could be sized at runtime via topology 6774 - * information but as all relevant affected CPUs have 32KiB L1D cache size 6775 - * there is no point in doing so. 6776 - */ 6777 - static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6778 - { 6779 - int size = PAGE_SIZE << L1D_CACHE_ORDER; 6780 - 6781 - /* 6782 - * This code is only executed when the flush mode is 'cond' or 6783 - * 'always' 6784 - */ 6785 - if (static_branch_likely(&vmx_l1d_flush_cond)) { 6786 - bool flush_l1d; 6787 - 6788 - /* 6789 - * Clear the per-vcpu flush bit, it gets set again if the vCPU 6790 - * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6791 - * exits to userspace, or if KVM reaches one of the unsafe 6792 - * VMEXIT handlers, e.g. if KVM calls into the emulator. 6793 - */ 6794 - flush_l1d = vcpu->arch.l1tf_flush_l1d; 6795 - vcpu->arch.l1tf_flush_l1d = false; 6796 - 6797 - /* 6798 - * Clear the per-cpu flush bit, it gets set again from 6799 - * the interrupt handlers. 6800 - */ 6801 - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6802 - kvm_clear_cpu_l1tf_flush_l1d(); 6803 - 6804 - if (!flush_l1d) 6805 - return; 6806 - } 6807 - 6808 - vcpu->stat.l1d_flush++; 6809 - 6810 - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6811 - native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6812 - return; 6813 - } 6814 - 6815 - asm volatile( 6816 - /* First ensure the pages are in the TLB */ 6817 - "xorl %%eax, %%eax\n" 6818 - ".Lpopulate_tlb:\n\t" 6819 - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6820 - "addl $4096, %%eax\n\t" 6821 - "cmpl %%eax, %[size]\n\t" 6822 - "jne .Lpopulate_tlb\n\t" 6823 - "xorl %%eax, %%eax\n\t" 6824 - "cpuid\n\t" 6825 - /* Now fill the cache */ 6826 - "xorl %%eax, %%eax\n" 6827 - ".Lfill_cache:\n" 6828 - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6829 - "addl $64, %%eax\n\t" 6830 - "cmpl %%eax, %[size]\n\t" 6831 - "jne .Lfill_cache\n\t" 6832 - "lfence\n" 6833 - :: [flush_pages] "r" (vmx_l1d_flush_pages), 6834 - [size] "r" (size) 6835 - : "eax", 
"ebx", "ecx", "edx"); 6836 6662 } 6837 6663 6838 6664 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) ··· 7082 7050 if (to_vt(vcpu)->emulation_required) 7083 7051 return; 7084 7052 7085 - if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7053 + switch (vmx_get_exit_reason(vcpu).basic) { 7054 + case EXIT_REASON_EXTERNAL_INTERRUPT: 7086 7055 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7087 - else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) 7056 + break; 7057 + case EXIT_REASON_EXCEPTION_NMI: 7088 7058 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7059 + break; 7060 + case EXIT_REASON_MCE_DURING_VMENTRY: 7061 + kvm_machine_check(); 7062 + break; 7063 + default: 7064 + break; 7065 + } 7089 7066 } 7090 7067 7091 7068 /* ··· 7369 7328 7370 7329 guest_state_enter_irqoff(); 7371 7330 7372 - /* 7373 - * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7374 - * mitigation for MDS is done late in VMentry and is still 7375 - * executed in spite of L1D Flush. This is because an extra VERW 7376 - * should not matter much after the big hammer L1D Flush. 7377 - * 7378 - * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, 7379 - * and is affected by MMIO Stale Data. In such cases mitigation in only 7380 - * needed against an MMIO capable guest. 
7381 - */ 7382 - if (static_branch_unlikely(&vmx_l1d_should_flush)) 7383 - vmx_l1d_flush(vcpu); 7384 - else if (static_branch_unlikely(&cpu_buf_vm_clear) && 7385 - (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) 7386 - x86_clear_cpu_buffers(); 7331 + vmx_l1d_flush(vcpu); 7387 7332 7388 7333 vmx_disable_fb_clear(vmx); 7389 7334 ··· 7481 7454 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7482 7455 vmx_set_interrupt_shadow(vcpu, 0); 7483 7456 7484 - kvm_load_guest_xsave_state(vcpu); 7485 - 7486 7457 pt_guest_enter(vmx); 7487 7458 7488 7459 atomic_switch_perf_msrs(vmx); ··· 7524 7499 7525 7500 pt_guest_exit(vmx); 7526 7501 7527 - kvm_load_host_xsave_state(vcpu); 7528 - 7529 7502 if (is_guest_mode(vcpu)) { 7530 7503 /* 7531 7504 * Track VMLAUNCH/VMRESUME that have made past guest state ··· 7538 7515 7539 7516 if (unlikely(vmx->fail)) 7540 7517 return EXIT_FASTPATH_NONE; 7541 - 7542 - if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7543 - kvm_machine_check(); 7544 7518 7545 7519 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7546 7520 ··· 8699 8679 return r; 8700 8680 } 8701 8681 8702 - static void vmx_cleanup_l1d_flush(void) 8703 - { 8704 - if (vmx_l1d_flush_pages) { 8705 - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8706 - vmx_l1d_flush_pages = NULL; 8707 - } 8708 - /* Restore state so sysfs ignores VMX */ 8709 - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8710 - } 8711 - 8712 8682 void vmx_exit(void) 8713 8683 { 8714 8684 allow_smaller_maxphyaddr = false; ··· 8734 8724 if (r) 8735 8725 return r; 8736 8726 8737 - /* 8738 - * Must be called after common x86 init so enable_ept is properly set 8739 - * up. Hand the parameter mitigation value in which was stored in 8740 - * the pre module init parser. If no parameter was given, it will 8741 - * contain 'auto' which will be turned into the default 'cond' 8742 - * mitigation mode. 
8743 - */ 8744 - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8727 + /* Must be called after common x86 init so enable_ept is set up. */ 8728 + r = vmx_setup_l1d_flush(); 8745 8729 if (r) 8746 8730 goto err_l1d_flush; 8747 8731

+122 -129

arch/x86/kvm/x86.c

··· 159 159 unsigned int min_timer_period_us = 200; 160 160 module_param(min_timer_period_us, uint, 0644); 161 161 162 - static bool __read_mostly kvmclock_periodic_sync = true; 163 - module_param(kvmclock_periodic_sync, bool, 0444); 164 - 165 162 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ 166 163 static u32 __read_mostly tsc_tolerance_ppm = 250; 167 164 module_param(tsc_tolerance_ppm, uint, 0644); ··· 209 212 u32 __read_mostly kvm_nr_uret_msrs; 210 213 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); 211 214 static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; 212 - static struct kvm_user_return_msrs __percpu *user_return_msrs; 215 + static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); 213 216 214 217 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ 215 218 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ ··· 572 575 vcpu->arch.apf.gfns[i] = ~0; 573 576 } 574 577 578 + static void kvm_destroy_user_return_msrs(void) 579 + { 580 + int cpu; 581 + 582 + for_each_possible_cpu(cpu) 583 + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); 584 + 585 + kvm_nr_uret_msrs = 0; 586 + } 587 + 575 588 static void kvm_on_user_return(struct user_return_notifier *urn) 576 589 { 577 590 unsigned slot; 578 591 struct kvm_user_return_msrs *msrs 579 592 = container_of(urn, struct kvm_user_return_msrs, urn); 580 593 struct kvm_user_return_msr_values *values; 581 - unsigned long flags; 582 594 583 - /* 584 - * Disabling irqs at this point since the following code could be 585 - * interrupted and executed through kvm_arch_disable_virtualization_cpu() 586 - */ 587 - local_irq_save(flags); 588 - if (msrs->registered) { 589 - msrs->registered = false; 590 - user_return_notifier_unregister(urn); 591 - } 592 - local_irq_restore(flags); 595 + msrs->registered = false; 596 + user_return_notifier_unregister(urn); 597 + 593 598 for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { 594 599 values = 
&msrs->values[slot]; 595 600 if (values->host != values->curr) { ··· 642 643 643 644 static void kvm_user_return_msr_cpu_online(void) 644 645 { 645 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 646 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 646 647 u64 value; 647 648 int i; 648 649 ··· 664 665 665 666 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) 666 667 { 667 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 668 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 668 669 int err; 669 670 670 671 value = (value & mask) | (msrs->values[slot].host & ~mask); ··· 680 681 } 681 682 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); 682 683 683 - void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) 684 - { 685 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 686 - 687 - msrs->values[slot].curr = value; 688 - kvm_user_return_register_notifier(msrs); 689 - } 690 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache); 691 - 692 684 u64 kvm_get_user_return_msr(unsigned int slot) 693 685 { 694 - return this_cpu_ptr(user_return_msrs)->values[slot].curr; 686 + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; 695 687 } 696 688 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); 697 689 698 690 static void drop_user_return_notifiers(void) 699 691 { 700 - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 692 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); 701 693 702 694 if (msrs->registered) 703 695 kvm_on_user_return(&msrs->urn); ··· 1035 1045 } 1036 1046 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); 1037 1047 1048 + static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 1049 + { 1050 + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 1051 + 1052 + return (vcpu->arch.apf.msr_en_val & mask) == mask; 1053 + } 1054 + 1038 1055 static inline u64 
pdptr_rsvd_bits(struct kvm_vcpu *vcpu) 1039 1056 { 1040 1057 return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); ··· 1134 1137 } 1135 1138 1136 1139 if ((cr0 ^ old_cr0) & X86_CR0_PG) { 1137 - kvm_clear_async_pf_completion_queue(vcpu); 1138 - kvm_async_pf_hash_reset(vcpu); 1139 - 1140 1140 /* 1141 1141 * Clearing CR0.PG is defined to flush the TLB from the guest's 1142 1142 * perspective. 1143 1143 */ 1144 1144 if (!(cr0 & X86_CR0_PG)) 1145 1145 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1146 + /* 1147 + * Check for async #PF completion events when enabling paging, 1148 + * as the vCPU may have previously encountered async #PFs (it's 1149 + * entirely legal for the guest to toggle paging on/off without 1150 + * waiting for the async #PF queue to drain). 1151 + */ 1152 + else if (kvm_pv_async_pf_enabled(vcpu)) 1153 + kvm_make_request(KVM_REQ_APF_READY, vcpu); 1146 1154 } 1147 1155 1148 1156 if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) ··· 1205 1203 } 1206 1204 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); 1207 1205 1208 - void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) 1206 + static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) 1209 1207 { 1210 1208 if (vcpu->arch.guest_state_protected) 1211 1209 return; 1212 1210 1213 - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { 1211 + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) 1212 + return; 1214 1213 1215 - if (vcpu->arch.xcr0 != kvm_host.xcr0) 1216 - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 1214 + if (vcpu->arch.xcr0 != kvm_host.xcr0) 1215 + xsetbv(XCR_XFEATURE_ENABLED_MASK, 1216 + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); 1217 1217 1218 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1219 - vcpu->arch.ia32_xss != kvm_host.xss) 1220 - wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); 1221 - } 1218 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1219 + vcpu->arch.ia32_xss != kvm_host.xss) 1220 + wrmsrq(MSR_IA32_XSS, load_guest ? 
vcpu->arch.ia32_xss : kvm_host.xss); 1221 + } 1222 + 1223 + static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) 1224 + { 1225 + if (vcpu->arch.guest_state_protected) 1226 + return; 1222 1227 1223 1228 if (cpu_feature_enabled(X86_FEATURE_PKU) && 1224 1229 vcpu->arch.pkru != vcpu->arch.host_pkru && ··· 1233 1224 kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) 1234 1225 wrpkru(vcpu->arch.pkru); 1235 1226 } 1236 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state); 1237 1227 1238 - void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) 1228 + static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) 1239 1229 { 1240 1230 if (vcpu->arch.guest_state_protected) 1241 1231 return; ··· 1246 1238 if (vcpu->arch.pkru != vcpu->arch.host_pkru) 1247 1239 wrpkru(vcpu->arch.host_pkru); 1248 1240 } 1249 - 1250 - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { 1251 - 1252 - if (vcpu->arch.xcr0 != kvm_host.xcr0) 1253 - xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); 1254 - 1255 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 1256 - vcpu->arch.ia32_xss != kvm_host.xss) 1257 - wrmsrq(MSR_IA32_XSS, kvm_host.xss); 1258 - } 1259 - 1260 1241 } 1261 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); 1262 1242 1263 1243 #ifdef CONFIG_X86_64 1264 1244 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) ··· 3501 3505 /* 3502 3506 * kvmclock updates which are isolated to a given vcpu, such as 3503 3507 * vcpu->cpu migration, should not allow system_timestamp from 3504 - * the rest of the vcpus to remain static. Otherwise ntp frequency 3505 - * correction applies to one vcpu's system_timestamp but not 3506 - * the others. 3508 + * the rest of the vcpus to remain static. 3507 3509 * 3508 3510 * So in those cases, request a kvmclock update for all vcpus. 3509 - * We need to rate-limit these requests though, as they can 3510 - * considerably slow guests that have a large number of vcpus. 
3511 - * The time for a remote vcpu to update its kvmclock is bound 3512 - * by the delay we use to rate-limit the updates. 3511 + * The worst case for a remote vcpu to update its kvmclock 3512 + * is then bounded by maximum nohz sleep latency. 3513 3513 */ 3514 - 3515 - #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) 3516 - 3517 - static void kvmclock_update_fn(struct work_struct *work) 3514 + static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 3518 3515 { 3519 3516 unsigned long i; 3520 - struct delayed_work *dwork = to_delayed_work(work); 3521 - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 3522 - kvmclock_update_work); 3523 - struct kvm *kvm = container_of(ka, struct kvm, arch); 3524 3517 struct kvm_vcpu *vcpu; 3518 + struct kvm *kvm = v->kvm; 3525 3519 3526 3520 kvm_for_each_vcpu(i, vcpu, kvm) { 3527 3521 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 3528 3522 kvm_vcpu_kick(vcpu); 3529 3523 } 3530 - } 3531 - 3532 - static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) 3533 - { 3534 - struct kvm *kvm = v->kvm; 3535 - 3536 - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 3537 - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 3538 - KVMCLOCK_UPDATE_DELAY); 3539 - } 3540 - 3541 - #define KVMCLOCK_SYNC_PERIOD (300 * HZ) 3542 - 3543 - static void kvmclock_sync_fn(struct work_struct *work) 3544 - { 3545 - struct delayed_work *dwork = to_delayed_work(work); 3546 - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, 3547 - kvmclock_sync_work); 3548 - struct kvm *kvm = container_of(ka, struct kvm, arch); 3549 - 3550 - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); 3551 - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 3552 - KVMCLOCK_SYNC_PERIOD); 3553 3524 } 3554 3525 3555 3526 /* These helpers are safe iff @msr is known to be an MCx bank MSR. 
*/ ··· 3611 3648 return 1; 3612 3649 } 3613 3650 return 0; 3614 - } 3615 - 3616 - static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) 3617 - { 3618 - u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; 3619 - 3620 - return (vcpu->arch.apf.msr_en_val & mask) == mask; 3621 3651 } 3622 3652 3623 3653 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) ··· 4138 4182 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4139 4183 return 1; 4140 4184 if (data & 0x1) { 4141 - vcpu->arch.apf.pageready_pending = false; 4185 + /* 4186 + * Pairs with the smp_mb__after_atomic() in 4187 + * kvm_arch_async_page_present_queued(). 4188 + */ 4189 + smp_store_mb(vcpu->arch.apf.pageready_pending, false); 4190 + 4142 4191 kvm_check_async_pf_completion(vcpu); 4143 4192 } 4144 4193 break; ··· 5149 5188 { 5150 5189 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 5151 5190 5152 - vcpu->arch.l1tf_flush_l1d = true; 5191 + kvm_request_l1tf_flush_l1d(); 5153 5192 5154 5193 if (vcpu->scheduled_out && pmu->version && pmu->event_count) { 5155 5194 pmu->need_cleanup = true; ··· 7959 7998 unsigned int bytes, struct x86_exception *exception) 7960 7999 { 7961 8000 /* kvm_write_guest_virt_system can pull in tons of pages. 
*/ 7962 - vcpu->arch.l1tf_flush_l1d = true; 8001 + kvm_request_l1tf_flush_l1d(); 7963 8002 7964 8003 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 7965 8004 PFERR_WRITE_MASK, exception); ··· 8803 8842 kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); 8804 8843 } 8805 8844 8845 + static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) 8846 + { 8847 + if (index != XCR_XFEATURE_ENABLED_MASK) 8848 + return 1; 8849 + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; 8850 + return 0; 8851 + } 8852 + 8806 8853 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) 8807 8854 { 8808 8855 return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); ··· 8883 8914 .is_smm = emulator_is_smm, 8884 8915 .leave_smm = emulator_leave_smm, 8885 8916 .triple_fault = emulator_triple_fault, 8917 + .get_xcr = emulator_get_xcr, 8886 8918 .set_xcr = emulator_set_xcr, 8887 8919 .get_untagged_addr = emulator_get_untagged_addr, 8888 8920 .is_canonical_addr = emulator_is_canonical_addr, ··· 9078 9108 run->internal.ndata = ndata; 9079 9109 } 9080 9110 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); 9111 + 9112 + void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) 9113 + { 9114 + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); 9115 + 9116 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 9117 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 9118 + vcpu->run->internal.ndata = 2; 9119 + vcpu->run->internal.data[0] = exit_reason; 9120 + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 9121 + } 9122 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); 9081 9123 9082 9124 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) 9083 9125 { ··· 9376 9394 return handle_emulation_failure(vcpu, emulation_type); 9377 9395 } 9378 9396 9379 - vcpu->arch.l1tf_flush_l1d = true; 9397 + kvm_request_l1tf_flush_l1d(); 
9380 9398 9381 9399 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 9382 9400 kvm_clear_exception_queue(vcpu); ··· 10013 10031 return -ENOMEM; 10014 10032 } 10015 10033 10016 - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); 10017 - if (!user_return_msrs) { 10018 - pr_err("failed to allocate percpu kvm_user_return_msrs\n"); 10019 - r = -ENOMEM; 10020 - goto out_free_x86_emulator_cache; 10021 - } 10022 - kvm_nr_uret_msrs = 0; 10023 - 10024 10034 r = kvm_mmu_vendor_module_init(); 10025 10035 if (r) 10026 - goto out_free_percpu; 10036 + goto out_free_x86_emulator_cache; 10027 10037 10028 10038 kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); 10029 10039 kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; ··· 10039 10065 10040 10066 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) 10041 10067 rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); 10068 + 10069 + WARN_ON_ONCE(kvm_nr_uret_msrs); 10042 10070 10043 10071 r = ops->hardware_setup(); 10044 10072 if (r != 0) ··· 10114 10138 kvm_x86_ops.enable_virtualization_cpu = NULL; 10115 10139 kvm_x86_call(hardware_unsetup)(); 10116 10140 out_mmu_exit: 10141 + kvm_destroy_user_return_msrs(); 10117 10142 kvm_mmu_vendor_module_exit(); 10118 - out_free_percpu: 10119 - free_percpu(user_return_msrs); 10120 10143 out_free_x86_emulator_cache: 10121 10144 kmem_cache_destroy(x86_emulator_cache); 10122 10145 return r; ··· 10143 10168 cancel_work_sync(&pvclock_gtod_work); 10144 10169 #endif 10145 10170 kvm_x86_call(hardware_unsetup)(); 10171 + kvm_destroy_user_return_msrs(); 10146 10172 kvm_mmu_vendor_module_exit(); 10147 - free_percpu(user_return_msrs); 10148 10173 kmem_cache_destroy(x86_emulator_cache); 10149 10174 #ifdef CONFIG_KVM_XEN 10150 10175 static_key_deferred_flush(&kvm_xen_enabled); ··· 11266 11291 if (vcpu->arch.guest_fpu.xfd_err) 11267 11292 wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 11268 11293 11294 + kvm_load_xfeatures(vcpu, true); 11295 + 11269 11296 if 
(unlikely(vcpu->arch.switch_db_regs && 11270 11297 !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { 11271 11298 set_debugreg(DR7_FIXED_1, 7); ··· 11296 11319 11297 11320 guest_timing_enter_irqoff(); 11298 11321 11322 + /* 11323 + * Swap PKRU with hardware breakpoints disabled to minimize the number 11324 + * of flows where non-KVM code can run with guest state loaded. 11325 + */ 11326 + kvm_load_guest_pkru(vcpu); 11327 + 11299 11328 for (;;) { 11300 11329 /* 11301 11330 * Assert that vCPU vs. VM APICv state is consistent. An APICv ··· 11329 11346 /* Note, VM-Exits that go down the "slow" path are accounted below. */ 11330 11347 ++vcpu->stat.exits; 11331 11348 } 11349 + 11350 + kvm_load_host_pkru(vcpu); 11332 11351 11333 11352 /* 11334 11353 * Do this here before restoring debug registers on the host. And ··· 11361 11376 11362 11377 vcpu->mode = OUTSIDE_GUEST_MODE; 11363 11378 smp_wmb(); 11379 + 11380 + kvm_load_xfeatures(vcpu, false); 11364 11381 11365 11382 /* 11366 11383 * Sync xfd before calling handle_exit_irqoff() which may ··· 12721 12734 12722 12735 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 12723 12736 { 12724 - struct kvm *kvm = vcpu->kvm; 12725 - 12726 12737 if (mutex_lock_killable(&vcpu->mutex)) 12727 12738 return; 12728 12739 vcpu_load(vcpu); ··· 12731 12746 vcpu->arch.msr_kvm_poll_control = 1; 12732 12747 12733 12748 mutex_unlock(&vcpu->mutex); 12734 - 12735 - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) 12736 - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, 12737 - KVMCLOCK_SYNC_PERIOD); 12738 12749 } 12739 12750 12740 12751 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) ··· 13069 13088 void kvm_arch_disable_virtualization_cpu(void) 13070 13089 { 13071 13090 kvm_x86_call(disable_virtualization_cpu)(); 13072 - drop_user_return_notifiers(); 13091 + 13092 + /* 13093 + * Leave the user-return notifiers as-is when disabling virtualization 13094 + * for reboot, i.e. 
when disabling via IPI function call, and instead 13095 + * pin kvm.ko (if it's a module) to defend against use-after-free (in 13096 + * the *very* unlikely scenario module unload is racing with reboot). 13097 + * On a forced reboot, tasks aren't frozen before shutdown, and so KVM 13098 + * could be actively modifying user-return MSR state when the IPI to 13099 + * disable virtualization arrives. Handle the extreme edge case here 13100 + * instead of trying to account for it in the normal flows. 13101 + */ 13102 + if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) 13103 + drop_user_return_notifiers(); 13104 + else 13105 + __module_get(THIS_MODULE); 13073 13106 } 13074 13107 13075 13108 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) ··· 13154 13159 spin_lock_init(&kvm->arch.hv_root_tdp_lock); 13155 13160 kvm->arch.hv_root_tdp = INVALID_PAGE; 13156 13161 #endif 13157 - 13158 - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 13159 - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 13160 13162 13161 13163 kvm_apicv_init(kvm); 13162 13164 kvm_hv_init_vm(kvm); ··· 13261 13269 * is unsafe, i.e. will lead to use-after-free. The PIT also needs to 13262 13270 * be stopped before IRQ routing is freed. 
13263 13271 */ 13264 - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 13265 - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 13266 - 13267 13272 #ifdef CONFIG_KVM_IOAPIC 13268 13273 kvm_free_pit(kvm); 13269 13274 #endif ··· 13877 13888 if ((work->wakeup_all || work->notpresent_injected) && 13878 13889 kvm_pv_async_pf_enabled(vcpu) && 13879 13890 !apf_put_user_ready(vcpu, work->arch.token)) { 13880 - vcpu->arch.apf.pageready_pending = true; 13891 + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); 13881 13892 kvm_apic_set_irq(vcpu, &irq, NULL); 13882 13893 } 13883 13894 ··· 13888 13899 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) 13889 13900 { 13890 13901 kvm_make_request(KVM_REQ_APF_READY, vcpu); 13891 - if (!vcpu->arch.apf.pageready_pending) 13902 + 13903 + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ 13904 + smp_mb__after_atomic(); 13905 + 13906 + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) 13892 13907 kvm_vcpu_kick(vcpu); 13893 13908 } 13894 13909

+14 -2

arch/x86/kvm/x86.h

··· 420 420 return !(kvm->arch.disabled_quirks & quirk); 421 421 } 422 422 423 + static __always_inline void kvm_request_l1tf_flush_l1d(void) 424 + { 425 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 426 + /* 427 + * Use a raw write to set the per-CPU flag, as KVM will ensure a flush 428 + * even if preemption is currently enabled. If the current vCPU task 429 + * is migrated to a different CPU (or userspace runs the vCPU on a 430 + * different task) before the next VM-Entry, then kvm_arch_vcpu_load() 431 + * will request a flush on the new CPU. 432 + */ 433 + raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); 434 + #endif 435 + } 436 + 423 437 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 424 438 425 439 u64 get_kvmclock_ns(struct kvm *kvm); ··· 636 622 #endif 637 623 } 638 624 639 - void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); 640 - void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); 641 625 int kvm_spec_ctrl_test_value(u64 value); 642 626 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, 643 627 struct x86_exception *e);

Configure Feed

Configure Feed