Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes:
"This is a pretty large diffstat for this time of the release. The main
culprit is a reorganization of the AMD assembly trampoline, allowing
percpu variables to be accessed early.

This is needed for the return stack depth tracking retbleed mitigation
that will be in 6.2, but it also makes it possible to tighten the IBRS
restore on vmexit. The latter change is a long tail of the
spectrev2/retbleed patches (the corresponding Intel change was simpler
and went in already last June), which is why I am including it right
now instead of sharing a topic branch with tip.

Being assembly and being rich in comments makes the line count balloon
a bit, but I am pretty confident in the change (famous last words)
because the reorganization actually makes everything simpler and more
understandable than before. It has also had external review and has
been tested on the aforementioned 6.2 changes, which explode quite
brutally without the fix.

Apart from this, things are pretty normal.

s390:

- PCI fix

- PV clock fix

x86:

- Fix clash between PMU MSRs and other MSRs

- Prepare SVM assembly trampoline for 6.2 retbleed mitigation and
for...

- ... tightening IBRS restore on vmexit, moving it before the first
RET or indirect branch

- Fix log level for VMSA dump

- Block all page faults during kvm_zap_gfn_range()

Tools:

- kvm_stat: fix incorrect detection of debugfs

- kvm_stat: update vmexit definitions"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86/mmu: Block all page faults during kvm_zap_gfn_range()
KVM: x86/pmu: Limit the maximum number of supported AMD GP counters
KVM: x86/pmu: Limit the maximum number of supported Intel GP counters
KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet
KVM: SVM: Only dump VMSA to klog at KERN_DEBUG level
tools/kvm_stat: update exit reasons for vmx/svm/aarch64/userspace
tools/kvm_stat: fix incorrect detection of debugfs
x86, KVM: remove unnecessary argument to x86_virt_spec_ctrl and callers
KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly
KVM: SVM: restore host save area from assembly
KVM: SVM: move guest vmsave/vmload back to assembly
KVM: SVM: do not allocate struct svm_cpu_data dynamically
KVM: SVM: remove dead field from struct svm_cpu_data
KVM: SVM: remove unused field from struct vcpu_svm
KVM: SVM: retrieve VMCB from assembly
KVM: SVM: adjust register allocation for __svm_vcpu_run()
KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm
KVM: x86: use a separate asm-offsets.c file
KVM: s390: pci: Fix allocation size of aift kzdev elements
KVM: s390: pv: don't allow userspace to set the clock under PV

+435 -207
+3
Documentation/virt/kvm/devices/vm.rst
··· 215 215 :Parameters: address of a buffer in user space to store the data (u8) to 216 216 :Returns: -EFAULT if the given address is not accessible from kernel space; 217 217 -EINVAL if setting the TOD clock extension to != 0 is not supported 218 + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 218 219 219 220 3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW 220 221 ----------------------------------- ··· 225 224 226 225 :Parameters: address of a buffer in user space to store the data (u64) to 227 226 :Returns: -EFAULT if the given address is not accessible from kernel space 227 + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 228 228 229 229 3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT 230 230 ----------------------------------- ··· 239 237 (kvm_s390_vm_tod_clock) to 240 238 :Returns: -EFAULT if the given address is not accessible from kernel space; 241 239 -EINVAL if setting the TOD clock extension to != 0 is not supported 240 + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 242 241 243 242 4. GROUP: KVM_S390_VM_CRYPTO 244 243 ============================
+17 -9
arch/s390/kvm/kvm-s390.c
··· 1207 1207 return 0; 1208 1208 } 1209 1209 1210 + static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); 1211 + 1210 1212 static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) 1211 1213 { 1212 1214 struct kvm_s390_vm_tod_clock gtod; ··· 1218 1216 1219 1217 if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) 1220 1218 return -EINVAL; 1221 - kvm_s390_set_tod_clock(kvm, &gtod); 1219 + __kvm_s390_set_tod_clock(kvm, &gtod); 1222 1220 1223 1221 VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", 1224 1222 gtod.epoch_idx, gtod.tod); ··· 1249 1247 sizeof(gtod.tod))) 1250 1248 return -EFAULT; 1251 1249 1252 - kvm_s390_set_tod_clock(kvm, &gtod); 1250 + __kvm_s390_set_tod_clock(kvm, &gtod); 1253 1251 VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); 1254 1252 return 0; 1255 1253 } ··· 1260 1258 1261 1259 if (attr->flags) 1262 1260 return -EINVAL; 1261 + 1262 + mutex_lock(&kvm->lock); 1263 + /* 1264 + * For protected guests, the TOD is managed by the ultravisor, so trying 1265 + * to change it will never bring the expected results. 1266 + */ 1267 + if (kvm_s390_pv_is_protected(kvm)) { 1268 + ret = -EOPNOTSUPP; 1269 + goto out_unlock; 1270 + } 1263 1271 1264 1272 switch (attr->attr) { 1265 1273 case KVM_S390_VM_TOD_EXT: ··· 1285 1273 ret = -ENXIO; 1286 1274 break; 1287 1275 } 1276 + 1277 + out_unlock: 1278 + mutex_unlock(&kvm->lock); 1288 1279 return ret; 1289 1280 } 1290 1281 ··· 4390 4375 4391 4376 kvm_s390_vcpu_unblock_all(kvm); 4392 4377 preempt_enable(); 4393 - } 4394 - 4395 - void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) 4396 - { 4397 - mutex_lock(&kvm->lock); 4398 - __kvm_s390_set_tod_clock(kvm, gtod); 4399 - mutex_unlock(&kvm->lock); 4400 4378 } 4401 4379 4402 4380 int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
-1
arch/s390/kvm/kvm-s390.h
··· 363 363 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); 364 364 365 365 /* implemented in kvm-s390.c */ 366 - void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); 367 366 int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); 368 367 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); 369 368 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
+1 -1
arch/s390/kvm/pci.c
··· 126 126 return -EPERM; 127 127 128 128 mutex_lock(&aift->aift_lock); 129 - aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev), 129 + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), 130 130 GFP_KERNEL); 131 131 if (!aift->kzdev) { 132 132 rc = -ENOMEM;
+6 -1
arch/x86/include/asm/kvm_host.h
··· 501 501 bool intr; 502 502 }; 503 503 504 + /* More counters may conflict with other existing Architectural MSRs */ 505 + #define KVM_INTEL_PMC_MAX_GENERIC 8 506 + #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) 507 + #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) 504 508 #define KVM_PMC_MAX_FIXED 3 509 + #define KVM_AMD_PMC_MAX_GENERIC 6 505 510 struct kvm_pmu { 506 511 unsigned nr_arch_gp_counters; 507 512 unsigned nr_arch_fixed_counters; ··· 521 516 u64 reserved_bits; 522 517 u64 raw_event_mask; 523 518 u8 version; 524 - struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; 519 + struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; 525 520 struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; 526 521 struct irq_work irq_work; 527 522 DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+5 -5
arch/x86/include/asm/spec-ctrl.h
··· 13 13 * Takes the guest view of SPEC_CTRL MSR as a parameter and also 14 14 * the guest's version of VIRT_SPEC_CTRL, if emulated. 15 15 */ 16 - extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); 16 + extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest); 17 17 18 18 /** 19 19 * x86_spec_ctrl_set_guest - Set speculation control registers for the guest ··· 24 24 * Avoids writing to the MSR if the content/bits are the same 25 25 */ 26 26 static inline 27 - void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) 27 + void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl) 28 28 { 29 - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); 29 + x86_virt_spec_ctrl(guest_virt_spec_ctrl, true); 30 30 } 31 31 32 32 /** ··· 38 38 * Avoids writing to the MSR if the content/bits are the same 39 39 */ 40 40 static inline 41 - void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) 41 + void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl) 42 42 { 43 - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); 43 + x86_virt_spec_ctrl(guest_virt_spec_ctrl, false); 44 44 } 45 45 46 46 /* AMD specific Speculative Store Bypass MSR data */
-6
arch/x86/kernel/asm-offsets.c
··· 19 19 #include <asm/suspend.h> 20 20 #include <asm/tlbflush.h> 21 21 #include <asm/tdx.h> 22 - #include "../kvm/vmx/vmx.h" 23 22 24 23 #ifdef CONFIG_XEN 25 24 #include <xen/interface/xen.h> ··· 107 108 OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); 108 109 OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); 109 110 OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); 110 - 111 - if (IS_ENABLED(CONFIG_KVM_INTEL)) { 112 - BLANK(); 113 - OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); 114 - } 115 111 }
+4 -11
arch/x86/kernel/cpu/bugs.c
··· 196 196 } 197 197 198 198 /* 199 - * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is 200 - * done in vmenter.S. 199 + * NOTE: This function is *only* called for SVM, since Intel uses 200 + * MSR_IA32_SPEC_CTRL for SSBD. 201 201 */ 202 202 void 203 - x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) 203 + x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) 204 204 { 205 - u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); 205 + u64 guestval, hostval; 206 206 struct thread_info *ti = current_thread_info(); 207 - 208 - if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { 209 - if (hostval != guestval) { 210 - msrval = setguest ? guestval : hostval; 211 - wrmsrl(MSR_IA32_SPEC_CTRL, msrval); 212 - } 213 - } 214 207 215 208 /* 216 209 * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+2
arch/x86/kvm/.gitignore
··· 1 + /kvm-asm-offsets.s 2 + /kvm-asm-offsets.h
+12
arch/x86/kvm/Makefile
··· 34 34 obj-$(CONFIG_KVM) += kvm.o 35 35 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 36 36 obj-$(CONFIG_KVM_AMD) += kvm-amd.o 37 + 38 + AFLAGS_svm/vmenter.o := -iquote $(obj) 39 + $(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h 40 + 41 + AFLAGS_vmx/vmenter.o := -iquote $(obj) 42 + $(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h 43 + 44 + $(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE 45 + $(call filechk,offsets,__KVM_ASM_OFFSETS_H__) 46 + 47 + targets += kvm-asm-offsets.s 48 + clean-files += kvm-asm-offsets.h
+29
arch/x86/kvm/kvm-asm-offsets.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Generate definitions needed by assembly language modules. 4 + * This code generates raw asm output which is post-processed to extract 5 + * and format the required data. 6 + */ 7 + #define COMPILE_OFFSETS 8 + 9 + #include <linux/kbuild.h> 10 + #include "vmx/vmx.h" 11 + #include "svm/svm.h" 12 + 13 + static void __used common(void) 14 + { 15 + if (IS_ENABLED(CONFIG_KVM_AMD)) { 16 + BLANK(); 17 + OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs); 18 + OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb); 19 + OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl); 20 + OFFSET(SVM_vmcb01, vcpu_svm, vmcb01); 21 + OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa); 22 + OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa); 23 + } 24 + 25 + if (IS_ENABLED(CONFIG_KVM_INTEL)) { 26 + BLANK(); 27 + OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); 28 + } 29 + }
+2 -2
arch/x86/kvm/mmu/mmu.c
··· 6056 6056 6057 6057 write_lock(&kvm->mmu_lock); 6058 6058 6059 - kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); 6059 + kvm_mmu_invalidate_begin(kvm, 0, -1ul); 6060 6060 6061 6061 flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); 6062 6062 ··· 6070 6070 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, 6071 6071 gfn_end - gfn_start); 6072 6072 6073 - kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); 6073 + kvm_mmu_invalidate_end(kvm, 0, -1ul); 6074 6074 6075 6075 write_unlock(&kvm->mmu_lock); 6076 6076 }
+1 -1
arch/x86/kvm/pmu.c
··· 56 56 * code. Each pmc, stored in kvm_pmc.idx field, is unique across 57 57 * all perf counters (both gp and fixed). The mapping relationship 58 58 * between pmc and perf counters is as the following: 59 - * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters 59 + * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters 60 60 * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed 61 61 * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H 62 62 * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
+4 -3
arch/x86/kvm/svm/pmu.c
··· 192 192 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 193 193 int i; 194 194 195 - BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); 195 + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); 196 + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); 196 197 197 - for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { 198 + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { 198 199 pmu->gp_counters[i].type = KVM_PMC_GP; 199 200 pmu->gp_counters[i].vcpu = vcpu; 200 201 pmu->gp_counters[i].idx = i; ··· 208 207 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 209 208 int i; 210 209 211 - for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) { 210 + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { 212 211 struct kvm_pmc *pmc = &pmu->gp_counters[i]; 213 212 214 213 pmc_stop_counter(pmc);
+3 -3
arch/x86/kvm/svm/sev.c
··· 196 196 __set_bit(sev->asid, sev_reclaim_asid_bitmap); 197 197 198 198 for_each_possible_cpu(cpu) { 199 - sd = per_cpu(svm_data, cpu); 199 + sd = per_cpu_ptr(&svm_data, cpu); 200 200 sd->sev_vmcbs[sev->asid] = NULL; 201 201 } 202 202 ··· 605 605 save->dr6 = svm->vcpu.arch.dr6; 606 606 607 607 pr_debug("Virtual Machine Save Area (VMSA):\n"); 608 - print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 608 + print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 609 609 610 610 return 0; 611 611 } ··· 2600 2600 2601 2601 void pre_sev_run(struct vcpu_svm *svm, int cpu) 2602 2602 { 2603 - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 2603 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 2604 2604 int asid = sev_get_asid(svm->vcpu.kvm); 2605 2605 2606 2606 /* Assign the asid allocated with this SEV guest */
+36 -69
arch/x86/kvm/svm/svm.c
··· 245 245 u32 zero1; 246 246 } __attribute__((packed)); 247 247 248 - DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 248 + DEFINE_PER_CPU(struct svm_cpu_data, svm_data); 249 249 250 250 /* 251 251 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via ··· 581 581 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); 582 582 return -EINVAL; 583 583 } 584 - sd = per_cpu(svm_data, me); 585 - if (!sd) { 586 - pr_err("%s: svm_data is NULL on %d\n", __func__, me); 587 - return -EINVAL; 588 - } 589 - 584 + sd = per_cpu_ptr(&svm_data, me); 590 585 sd->asid_generation = 1; 591 586 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 592 587 sd->next_asid = sd->max_asid + 1; ··· 592 597 593 598 wrmsrl(MSR_EFER, efer | EFER_SVME); 594 599 595 - wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); 600 + wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); 596 601 597 602 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 598 603 /* ··· 641 646 642 647 static void svm_cpu_uninit(int cpu) 643 648 { 644 - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 649 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 645 650 646 - if (!sd) 651 + if (!sd->save_area) 647 652 return; 648 653 649 - per_cpu(svm_data, cpu) = NULL; 650 654 kfree(sd->sev_vmcbs); 651 655 __free_page(sd->save_area); 652 - kfree(sd); 656 + sd->save_area_pa = 0; 657 + sd->save_area = NULL; 653 658 } 654 659 655 660 static int svm_cpu_init(int cpu) 656 661 { 657 - struct svm_cpu_data *sd; 662 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 658 663 int ret = -ENOMEM; 659 664 660 - sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 661 - if (!sd) 662 - return ret; 663 - sd->cpu = cpu; 665 + memset(sd, 0, sizeof(struct svm_cpu_data)); 664 666 sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); 665 667 if (!sd->save_area) 666 - goto free_cpu_data; 668 + return ret; 667 669 668 670 ret = sev_cpu_init(sd); 669 671 if (ret) 670 672 goto free_save_area; 671 673 672 - per_cpu(svm_data, cpu) = sd; 673 - 
674 + sd->save_area_pa = __sme_page_pa(sd->save_area); 674 675 return 0; 675 676 676 677 free_save_area: 677 678 __free_page(sd->save_area); 678 - free_cpu_data: 679 - kfree(sd); 679 + sd->save_area = NULL; 680 680 return ret; 681 681 682 682 } ··· 720 730 u32 offset; 721 731 u32 *msrpm; 722 732 733 + /* 734 + * For non-nested case: 735 + * If the L01 MSR bitmap does not intercept the MSR, then we need to 736 + * save it. 737 + * 738 + * For nested case: 739 + * If the L02 MSR bitmap does not intercept the MSR, then we need to 740 + * save it. 741 + */ 723 742 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: 724 743 to_svm(vcpu)->msrpm; 725 744 ··· 1424 1425 int i; 1425 1426 1426 1427 for_each_online_cpu(i) 1427 - cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); 1428 + cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); 1428 1429 } 1429 1430 1430 1431 static void svm_vcpu_free(struct kvm_vcpu *vcpu) ··· 1449 1450 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1450 1451 { 1451 1452 struct vcpu_svm *svm = to_svm(vcpu); 1452 - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 1453 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 1453 1454 1454 1455 if (sev_es_guest(vcpu->kvm)) 1455 1456 sev_es_unmap_ghcb(svm); ··· 1461 1462 * Save additional host state that will be restored on VMEXIT (sev-es) 1462 1463 * or subsequent vmload of host save area. 
1463 1464 */ 1464 - vmsave(__sme_page_pa(sd->save_area)); 1465 + vmsave(sd->save_area_pa); 1465 1466 if (sev_es_guest(vcpu->kvm)) { 1466 1467 struct sev_es_save_area *hostsa; 1467 1468 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); ··· 1486 1487 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1487 1488 { 1488 1489 struct vcpu_svm *svm = to_svm(vcpu); 1489 - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 1490 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 1490 1491 1491 1492 if (sd->current_vmcb != svm->vmcb) { 1492 1493 sd->current_vmcb = svm->vmcb; ··· 3442 3443 3443 3444 static void reload_tss(struct kvm_vcpu *vcpu) 3444 3445 { 3445 - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3446 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 3446 3447 3447 3448 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 3448 3449 load_TR_desc(); ··· 3450 3451 3451 3452 static void pre_svm_run(struct kvm_vcpu *vcpu) 3452 3453 { 3453 - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3454 + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 3454 3455 struct vcpu_svm *svm = to_svm(vcpu); 3455 3456 3456 3457 /* ··· 3910 3911 return EXIT_FASTPATH_NONE; 3911 3912 } 3912 3913 3913 - static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) 3914 + static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) 3914 3915 { 3915 3916 struct vcpu_svm *svm = to_svm(vcpu); 3916 - unsigned long vmcb_pa = svm->current_vmcb->pa; 3917 3917 3918 3918 guest_state_enter_irqoff(); 3919 3919 3920 - if (sev_es_guest(vcpu->kvm)) { 3921 - __svm_sev_es_vcpu_run(vmcb_pa); 3922 - } else { 3923 - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3924 - 3925 - /* 3926 - * Use a single vmcb (vmcb01 because it's always valid) for 3927 - * context switching guest state via VMLOAD/VMSAVE, that way 3928 - * the state doesn't need to be copied between vmcb01 and 3929 - * vmcb02 when 
switching vmcbs for nested virtualization. 3930 - */ 3931 - vmload(svm->vmcb01.pa); 3932 - __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); 3933 - vmsave(svm->vmcb01.pa); 3934 - 3935 - vmload(__sme_page_pa(sd->save_area)); 3936 - } 3920 + if (sev_es_guest(vcpu->kvm)) 3921 + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); 3922 + else 3923 + __svm_vcpu_run(svm, spec_ctrl_intercepted); 3937 3924 3938 3925 guest_state_exit_irqoff(); 3939 3926 } ··· 3927 3942 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) 3928 3943 { 3929 3944 struct vcpu_svm *svm = to_svm(vcpu); 3945 + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); 3930 3946 3931 3947 trace_kvm_entry(vcpu); 3932 3948 ··· 3984 3998 * being speculatively taken. 3985 3999 */ 3986 4000 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3987 - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); 4001 + x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); 3988 4002 3989 - svm_vcpu_enter_exit(vcpu); 3990 - 3991 - /* 3992 - * We do not use IBRS in the kernel. If this vCPU has used the 3993 - * SPEC_CTRL MSR it may have left it on; save the value and 3994 - * turn it off. This is much more efficient than blindly adding 3995 - * it to the atomic save/restore list. Especially as the former 3996 - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 3997 - * 3998 - * For non-nested case: 3999 - * If the L01 MSR bitmap does not intercept the MSR, then we need to 4000 - * save it. 4001 - * 4002 - * For nested case: 4003 - * If the L02 MSR bitmap does not intercept the MSR, then we need to 4004 - * save it. 
4005 - */ 4006 - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && 4007 - unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 4008 - svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 4003 + svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); 4009 4004 4010 4005 if (!sev_es_guest(vcpu->kvm)) 4011 4006 reload_tss(vcpu); 4012 4007 4013 4008 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4014 - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); 4009 + x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); 4015 4010 4016 4011 if (!sev_es_guest(vcpu->kvm)) { 4017 4012 vcpu->arch.cr2 = svm->vmcb->save.cr2;
+5 -6
arch/x86/kvm/svm/svm.h
··· 209 209 struct vmcb *vmcb; 210 210 struct kvm_vmcb_info vmcb01; 211 211 struct kvm_vmcb_info *current_vmcb; 212 - struct svm_cpu_data *svm_data; 213 212 u32 asid; 214 213 u32 sysenter_esp_hi; 215 214 u32 sysenter_eip_hi; ··· 280 281 }; 281 282 282 283 struct svm_cpu_data { 283 - int cpu; 284 - 285 284 u64 asid_generation; 286 285 u32 max_asid; 287 286 u32 next_asid; ··· 287 290 struct kvm_ldttss_desc *tss_desc; 288 291 289 292 struct page *save_area; 293 + unsigned long save_area_pa; 294 + 290 295 struct vmcb *current_vmcb; 291 296 292 297 /* index = sev_asid, value = vmcb pointer */ 293 298 struct vmcb **sev_vmcbs; 294 299 }; 295 300 296 - DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); 301 + DECLARE_PER_CPU(struct svm_cpu_data, svm_data); 297 302 298 303 void recalc_intercepts(struct vcpu_svm *svm); 299 304 ··· 682 683 683 684 /* vmenter.S */ 684 685 685 - void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); 686 - void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); 686 + void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); 687 + void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); 687 688 688 689 #endif
-5
arch/x86/kvm/svm/svm_ops.h
··· 61 61 svm_asm1(vmsave, "a" (pa), "memory"); 62 62 } 63 63 64 - static __always_inline void vmload(unsigned long pa) 65 - { 66 - svm_asm1(vmload, "a" (pa), "memory"); 67 - } 68 - 69 64 #endif /* __KVM_X86_SVM_OPS_H */
+209 -51
arch/x86/kvm/svm/vmenter.S
··· 4 4 #include <asm/bitsperlong.h> 5 5 #include <asm/kvm_vcpu_regs.h> 6 6 #include <asm/nospec-branch.h> 7 + #include "kvm-asm-offsets.h" 7 8 8 9 #define WORD_SIZE (BITS_PER_LONG / 8) 9 10 10 11 /* Intentionally omit RAX as it's context switched by hardware */ 11 - #define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE 12 - #define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE 13 - #define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE 12 + #define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE) 13 + #define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE) 14 + #define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE) 14 15 /* Intentionally omit RSP as it's context switched by hardware */ 15 - #define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE 16 - #define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE 17 - #define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE 16 + #define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE) 17 + #define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE) 18 + #define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE) 18 19 19 20 #ifdef CONFIG_X86_64 20 - #define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE 21 - #define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE 22 - #define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE 23 - #define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE 24 - #define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE 25 - #define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE 26 - #define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE 27 - #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE 21 + #define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE) 22 + #define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE) 23 + #define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE) 24 + #define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE) 25 + #define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE) 26 + #define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE) 27 + #define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * 
WORD_SIZE) 28 + #define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE) 28 29 #endif 30 + 31 + #define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa) 29 32 30 33 .section .noinstr.text, "ax" 31 34 35 + .macro RESTORE_GUEST_SPEC_CTRL 36 + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ 37 + ALTERNATIVE_2 "", \ 38 + "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \ 39 + "", X86_FEATURE_V_SPEC_CTRL 40 + 801: 41 + .endm 42 + .macro RESTORE_GUEST_SPEC_CTRL_BODY 43 + 800: 44 + /* 45 + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the 46 + * host's, write the MSR. This is kept out-of-line so that the common 47 + * case does not have to jump. 48 + * 49 + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, 50 + * there must not be any returns or indirect branches between this code 51 + * and vmentry. 52 + */ 53 + movl SVM_spec_ctrl(%_ASM_DI), %eax 54 + cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax 55 + je 801b 56 + mov $MSR_IA32_SPEC_CTRL, %ecx 57 + xor %edx, %edx 58 + wrmsr 59 + jmp 801b 60 + .endm 61 + 62 + .macro RESTORE_HOST_SPEC_CTRL 63 + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ 64 + ALTERNATIVE_2 "", \ 65 + "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \ 66 + "", X86_FEATURE_V_SPEC_CTRL 67 + 901: 68 + .endm 69 + .macro RESTORE_HOST_SPEC_CTRL_BODY 70 + 900: 71 + /* Same for after vmexit. */ 72 + mov $MSR_IA32_SPEC_CTRL, %ecx 73 + 74 + /* 75 + * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, 76 + * if it was not intercepted during guest execution. 77 + */ 78 + cmpb $0, (%_ASM_SP) 79 + jnz 998f 80 + rdmsr 81 + movl %eax, SVM_spec_ctrl(%_ASM_DI) 82 + 998: 83 + 84 + /* Now restore the host value of the MSR if different from the guest's. 
*/ 85 + movl PER_CPU_VAR(x86_spec_ctrl_current), %eax 86 + cmp SVM_spec_ctrl(%_ASM_DI), %eax 87 + je 901b 88 + xor %edx, %edx 89 + wrmsr 90 + jmp 901b 91 + .endm 92 + 93 + 32 94 /** 33 95 * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode 34 - * @vmcb_pa: unsigned long 35 - * @regs: unsigned long * (to guest registers) 96 + * @svm: struct vcpu_svm * 97 + * @spec_ctrl_intercepted: bool 36 98 */ 37 99 SYM_FUNC_START(__svm_vcpu_run) 38 100 push %_ASM_BP ··· 109 47 #endif 110 48 push %_ASM_BX 111 49 112 - /* Save @regs. */ 50 + /* 51 + * Save variables needed after vmexit on the stack, in inverse 52 + * order compared to when they are needed. 53 + */ 54 + 55 + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ 113 56 push %_ASM_ARG2 114 57 115 - /* Save @vmcb. */ 58 + /* Needed to restore access to percpu variables. */ 59 + __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa) 60 + 61 + /* Finally save @svm. */ 116 62 push %_ASM_ARG1 117 63 118 - /* Move @regs to RAX. */ 119 - mov %_ASM_ARG2, %_ASM_AX 64 + .ifnc _ASM_ARG1, _ASM_DI 65 + /* 66 + * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX 67 + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. 68 + */ 69 + mov %_ASM_ARG1, %_ASM_DI 70 + .endif 71 + 72 + /* Clobbers RAX, RCX, RDX. */ 73 + RESTORE_GUEST_SPEC_CTRL 74 + 75 + /* 76 + * Use a single vmcb (vmcb01 because it's always valid) for 77 + * context switching guest state via VMLOAD/VMSAVE, that way 78 + * the state doesn't need to be copied between vmcb01 and 79 + * vmcb02 when switching vmcbs for nested virtualization. 80 + */ 81 + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX 82 + 1: vmload %_ASM_AX 83 + 2: 84 + 85 + /* Get svm->current_vmcb->pa into RAX. */ 86 + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX 87 + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX 120 88 121 89 /* Load guest registers. 
*/ 122 - mov VCPU_RCX(%_ASM_AX), %_ASM_CX 123 - mov VCPU_RDX(%_ASM_AX), %_ASM_DX 124 - mov VCPU_RBX(%_ASM_AX), %_ASM_BX 125 - mov VCPU_RBP(%_ASM_AX), %_ASM_BP 126 - mov VCPU_RSI(%_ASM_AX), %_ASM_SI 127 - mov VCPU_RDI(%_ASM_AX), %_ASM_DI 90 + mov VCPU_RCX(%_ASM_DI), %_ASM_CX 91 + mov VCPU_RDX(%_ASM_DI), %_ASM_DX 92 + mov VCPU_RBX(%_ASM_DI), %_ASM_BX 93 + mov VCPU_RBP(%_ASM_DI), %_ASM_BP 94 + mov VCPU_RSI(%_ASM_DI), %_ASM_SI 128 95 #ifdef CONFIG_X86_64 129 - mov VCPU_R8 (%_ASM_AX), %r8 130 - mov VCPU_R9 (%_ASM_AX), %r9 131 - mov VCPU_R10(%_ASM_AX), %r10 132 - mov VCPU_R11(%_ASM_AX), %r11 133 - mov VCPU_R12(%_ASM_AX), %r12 134 - mov VCPU_R13(%_ASM_AX), %r13 135 - mov VCPU_R14(%_ASM_AX), %r14 136 - mov VCPU_R15(%_ASM_AX), %r15 96 + mov VCPU_R8 (%_ASM_DI), %r8 97 + mov VCPU_R9 (%_ASM_DI), %r9 98 + mov VCPU_R10(%_ASM_DI), %r10 99 + mov VCPU_R11(%_ASM_DI), %r11 100 + mov VCPU_R12(%_ASM_DI), %r12 101 + mov VCPU_R13(%_ASM_DI), %r13 102 + mov VCPU_R14(%_ASM_DI), %r14 103 + mov VCPU_R15(%_ASM_DI), %r15 137 104 #endif 138 - 139 - /* "POP" @vmcb to RAX. */ 140 - pop %_ASM_AX 105 + mov VCPU_RDI(%_ASM_DI), %_ASM_DI 141 106 142 107 /* Enter guest mode */ 143 108 sti 144 109 145 - 1: vmrun %_ASM_AX 110 + 3: vmrun %_ASM_AX 111 + 4: 112 + cli 146 113 147 - 2: cli 148 - 149 - #ifdef CONFIG_RETPOLINE 150 - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 151 - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 152 - #endif 153 - 154 - /* "POP" @regs to RAX. */ 114 + /* Pop @svm to RAX while it's the only available register. */ 155 115 pop %_ASM_AX 156 116 157 117 /* Save all guest registers. */ ··· 193 109 mov %r14, VCPU_R14(%_ASM_AX) 194 110 mov %r15, VCPU_R15(%_ASM_AX) 195 111 #endif 112 + 113 + /* @svm can stay in RDI from now on. */ 114 + mov %_ASM_AX, %_ASM_DI 115 + 116 + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX 117 + 5: vmsave %_ASM_AX 118 + 6: 119 + 120 + /* Restores GSBASE among other things, allowing access to percpu data. 
*/ 121 + pop %_ASM_AX 122 + 7: vmload %_ASM_AX 123 + 8: 124 + 125 + #ifdef CONFIG_RETPOLINE 126 + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 127 + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 128 + #endif 129 + 130 + /* Clobbers RAX, RCX, RDX. */ 131 + RESTORE_HOST_SPEC_CTRL 196 132 197 133 /* 198 134 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be ··· 249 145 xor %r15d, %r15d 250 146 #endif 251 147 148 + /* "Pop" @spec_ctrl_intercepted. */ 149 + pop %_ASM_BX 150 + 252 151 pop %_ASM_BX 253 152 254 153 #ifdef CONFIG_X86_64 ··· 266 159 pop %_ASM_BP 267 160 RET 268 161 269 - 3: cmpb $0, kvm_rebooting 162 + RESTORE_GUEST_SPEC_CTRL_BODY 163 + RESTORE_HOST_SPEC_CTRL_BODY 164 + 165 + 10: cmpb $0, kvm_rebooting 270 166 jne 2b 271 167 ud2 168 + 30: cmpb $0, kvm_rebooting 169 + jne 4b 170 + ud2 171 + 50: cmpb $0, kvm_rebooting 172 + jne 6b 173 + ud2 174 + 70: cmpb $0, kvm_rebooting 175 + jne 8b 176 + ud2 272 177 273 - _ASM_EXTABLE(1b, 3b) 178 + _ASM_EXTABLE(1b, 10b) 179 + _ASM_EXTABLE(3b, 30b) 180 + _ASM_EXTABLE(5b, 50b) 181 + _ASM_EXTABLE(7b, 70b) 274 182 275 183 SYM_FUNC_END(__svm_vcpu_run) 276 184 277 185 /** 278 186 * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode 279 - * @vmcb_pa: unsigned long 187 + * @svm: struct vcpu_svm * 188 + * @spec_ctrl_intercepted: bool 280 189 */ 281 190 SYM_FUNC_START(__svm_sev_es_vcpu_run) 282 191 push %_ASM_BP ··· 307 184 #endif 308 185 push %_ASM_BX 309 186 310 - /* Move @vmcb to RAX. */ 311 - mov %_ASM_ARG1, %_ASM_AX 187 + /* 188 + * Save variables needed after vmexit on the stack, in inverse 189 + * order compared to when they are needed. 190 + */ 191 + 192 + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ 193 + push %_ASM_ARG2 194 + 195 + /* Save @svm. */ 196 + push %_ASM_ARG1 197 + 198 + .ifnc _ASM_ARG1, _ASM_DI 199 + /* 200 + * Stash @svm in RDI early. 
On 32-bit, arguments are in RAX, RCX 201 + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. 202 + */ 203 + mov %_ASM_ARG1, %_ASM_DI 204 + .endif 205 + 206 + /* Clobbers RAX, RCX, RDX. */ 207 + RESTORE_GUEST_SPEC_CTRL 208 + 209 + /* Get svm->current_vmcb->pa into RAX. */ 210 + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX 211 + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX 312 212 313 213 /* Enter guest mode */ 314 214 sti ··· 340 194 341 195 2: cli 342 196 197 + /* Pop @svm to RDI, guest registers have been saved already. */ 198 + pop %_ASM_DI 199 + 343 200 #ifdef CONFIG_RETPOLINE 344 201 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 345 202 FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE 346 203 #endif 204 + 205 + /* Clobbers RAX, RCX, RDX. */ 206 + RESTORE_HOST_SPEC_CTRL 347 207 348 208 /* 349 209 * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be ··· 359 207 * from the kernel. 360 208 */ 361 209 UNTRAIN_RET 210 + 211 + /* "Pop" @spec_ctrl_intercepted. */ 212 + pop %_ASM_BX 362 213 363 214 pop %_ASM_BX 364 215 ··· 376 221 #endif 377 222 pop %_ASM_BP 378 223 RET 224 + 225 + RESTORE_GUEST_SPEC_CTRL_BODY 226 + RESTORE_HOST_SPEC_CTRL_BODY 379 227 380 228 3: cmpb $0, kvm_rebooting 381 229 jne 2b
+2 -2
arch/x86/kvm/vmx/pmu_intel.c
··· 617 617 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 618 618 struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); 619 619 620 - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { 620 + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { 621 621 pmu->gp_counters[i].type = KVM_PMC_GP; 622 622 pmu->gp_counters[i].vcpu = vcpu; 623 623 pmu->gp_counters[i].idx = i; ··· 643 643 struct kvm_pmc *pmc = NULL; 644 644 int i; 645 645 646 - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { 646 + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { 647 647 pmc = &pmu->gp_counters[i]; 648 648 649 649 pmc_stop_counter(pmc);
+1 -1
arch/x86/kvm/vmx/vmenter.S
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #include <linux/linkage.h> 3 3 #include <asm/asm.h> 4 - #include <asm/asm-offsets.h> 5 4 #include <asm/bitsperlong.h> 6 5 #include <asm/kvm_vcpu_regs.h> 7 6 #include <asm/nospec-branch.h> 8 7 #include <asm/percpu.h> 9 8 #include <asm/segment.h> 9 + #include "kvm-asm-offsets.h" 10 10 #include "run_flags.h" 11 11 12 12 #define WORD_SIZE (BITS_PER_LONG / 8)
+10 -15
arch/x86/kvm/x86.c
··· 1438 1438 MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1439 1439 MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1440 1440 MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 1441 + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 1442 + 1443 + /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ 1441 1444 MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1442 1445 MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1443 1446 MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1444 1447 MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1445 - MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, 1446 - MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, 1447 - MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, 1448 - MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, 1449 - MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, 1450 1448 MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1451 1449 MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1452 1450 MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1453 1451 MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1454 - MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, 1455 - MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, 1456 - MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, 1457 - MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, 1458 - MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, 1459 - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 1460 1452 1461 1453 MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 1462 1454 MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 1455 + 1456 + /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. 
*/ 1463 1457 MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 1464 1458 MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1465 1459 MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1466 1460 MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1461 + 1467 1462 MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1468 1463 }; 1469 1464 ··· 7036 7041 intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) 7037 7042 continue; 7038 7043 break; 7039 - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: 7044 + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: 7040 7045 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= 7041 - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7046 + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7042 7047 continue; 7043 7048 break; 7044 - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: 7049 + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: 7045 7050 if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= 7046 - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7051 + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7047 7052 continue; 7048 7053 break; 7049 7054 case MSR_IA32_XFD:
+83 -15
tools/kvm/kvm_stat/kvm_stat
··· 41 41 'EXCEPTION_NMI': 0, 42 42 'EXTERNAL_INTERRUPT': 1, 43 43 'TRIPLE_FAULT': 2, 44 - 'PENDING_INTERRUPT': 7, 44 + 'INIT_SIGNAL': 3, 45 + 'SIPI_SIGNAL': 4, 46 + 'INTERRUPT_WINDOW': 7, 45 47 'NMI_WINDOW': 8, 46 48 'TASK_SWITCH': 9, 47 49 'CPUID': 10, 48 50 'HLT': 12, 51 + 'INVD': 13, 49 52 'INVLPG': 14, 50 53 'RDPMC': 15, 51 54 'RDTSC': 16, ··· 68 65 'MSR_READ': 31, 69 66 'MSR_WRITE': 32, 70 67 'INVALID_STATE': 33, 68 + 'MSR_LOAD_FAIL': 34, 71 69 'MWAIT_INSTRUCTION': 36, 70 + 'MONITOR_TRAP_FLAG': 37, 72 71 'MONITOR_INSTRUCTION': 39, 73 72 'PAUSE_INSTRUCTION': 40, 74 73 'MCE_DURING_VMENTRY': 41, 75 74 'TPR_BELOW_THRESHOLD': 43, 76 75 'APIC_ACCESS': 44, 76 + 'EOI_INDUCED': 45, 77 + 'GDTR_IDTR': 46, 78 + 'LDTR_TR': 47, 77 79 'EPT_VIOLATION': 48, 78 80 'EPT_MISCONFIG': 49, 81 + 'INVEPT': 50, 82 + 'RDTSCP': 51, 83 + 'PREEMPTION_TIMER': 52, 84 + 'INVVPID': 53, 79 85 'WBINVD': 54, 80 86 'XSETBV': 55, 81 87 'APIC_WRITE': 56, 88 + 'RDRAND': 57, 82 89 'INVPCID': 58, 90 + 'VMFUNC': 59, 91 + 'ENCLS': 60, 92 + 'RDSEED': 61, 93 + 'PML_FULL': 62, 94 + 'XSAVES': 63, 95 + 'XRSTORS': 64, 96 + 'UMWAIT': 67, 97 + 'TPAUSE': 68, 98 + 'BUS_LOCK': 74, 99 + 'NOTIFY': 75, 83 100 } 84 101 85 102 SVM_EXIT_REASONS = { 86 103 'READ_CR0': 0x000, 104 + 'READ_CR2': 0x002, 87 105 'READ_CR3': 0x003, 88 106 'READ_CR4': 0x004, 89 107 'READ_CR8': 0x008, 90 108 'WRITE_CR0': 0x010, 109 + 'WRITE_CR2': 0x012, 91 110 'WRITE_CR3': 0x013, 92 111 'WRITE_CR4': 0x014, 93 112 'WRITE_CR8': 0x018, ··· 130 105 'WRITE_DR6': 0x036, 131 106 'WRITE_DR7': 0x037, 132 107 'EXCP_BASE': 0x040, 108 + 'LAST_EXCP': 0x05f, 133 109 'INTR': 0x060, 134 110 'NMI': 0x061, 135 111 'SMI': 0x062, ··· 177 151 'MWAIT': 0x08b, 178 152 'MWAIT_COND': 0x08c, 179 153 'XSETBV': 0x08d, 154 + 'RDPRU': 0x08e, 155 + 'EFER_WRITE_TRAP': 0x08f, 156 + 'CR0_WRITE_TRAP': 0x090, 157 + 'CR1_WRITE_TRAP': 0x091, 158 + 'CR2_WRITE_TRAP': 0x092, 159 + 'CR3_WRITE_TRAP': 0x093, 160 + 'CR4_WRITE_TRAP': 0x094, 161 + 'CR5_WRITE_TRAP': 0x095, 162 + 
'CR6_WRITE_TRAP': 0x096, 163 + 'CR7_WRITE_TRAP': 0x097, 164 + 'CR8_WRITE_TRAP': 0x098, 165 + 'CR9_WRITE_TRAP': 0x099, 166 + 'CR10_WRITE_TRAP': 0x09a, 167 + 'CR11_WRITE_TRAP': 0x09b, 168 + 'CR12_WRITE_TRAP': 0x09c, 169 + 'CR13_WRITE_TRAP': 0x09d, 170 + 'CR14_WRITE_TRAP': 0x09e, 171 + 'CR15_WRITE_TRAP': 0x09f, 172 + 'INVPCID': 0x0a2, 180 173 'NPF': 0x400, 174 + 'AVIC_INCOMPLETE_IPI': 0x401, 175 + 'AVIC_UNACCELERATED_ACCESS': 0x402, 176 + 'VMGEXIT': 0x403, 181 177 } 182 178 183 - # EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) 179 + # EC definition of HSR (from arch/arm64/include/asm/esr.h) 184 180 AARCH64_EXIT_REASONS = { 185 181 'UNKNOWN': 0x00, 186 - 'WFI': 0x01, 182 + 'WFx': 0x01, 187 183 'CP15_32': 0x03, 188 184 'CP15_64': 0x04, 189 185 'CP14_MR': 0x05, 190 186 'CP14_LS': 0x06, 191 187 'FP_ASIMD': 0x07, 192 188 'CP10_ID': 0x08, 189 + 'PAC': 0x09, 193 190 'CP14_64': 0x0C, 194 - 'ILL_ISS': 0x0E, 191 + 'BTI': 0x0D, 192 + 'ILL': 0x0E, 195 193 'SVC32': 0x11, 196 194 'HVC32': 0x12, 197 195 'SMC32': 0x13, ··· 223 173 'HVC64': 0x16, 224 174 'SMC64': 0x17, 225 175 'SYS64': 0x18, 226 - 'IABT': 0x20, 227 - 'IABT_HYP': 0x21, 176 + 'SVE': 0x19, 177 + 'ERET': 0x1A, 178 + 'FPAC': 0x1C, 179 + 'SME': 0x1D, 180 + 'IMP_DEF': 0x1F, 181 + 'IABT_LOW': 0x20, 182 + 'IABT_CUR': 0x21, 228 183 'PC_ALIGN': 0x22, 229 - 'DABT': 0x24, 230 - 'DABT_HYP': 0x25, 184 + 'DABT_LOW': 0x24, 185 + 'DABT_CUR': 0x25, 231 186 'SP_ALIGN': 0x26, 232 187 'FP_EXC32': 0x28, 233 188 'FP_EXC64': 0x2C, 234 189 'SERROR': 0x2F, 235 - 'BREAKPT': 0x30, 236 - 'BREAKPT_HYP': 0x31, 237 - 'SOFTSTP': 0x32, 238 - 'SOFTSTP_HYP': 0x33, 239 - 'WATCHPT': 0x34, 240 - 'WATCHPT_HYP': 0x35, 190 + 'BREAKPT_LOW': 0x30, 191 + 'BREAKPT_CUR': 0x31, 192 + 'SOFTSTP_LOW': 0x32, 193 + 'SOFTSTP_CUR': 0x33, 194 + 'WATCHPT_LOW': 0x34, 195 + 'WATCHPT_CUR': 0x35, 241 196 'BKPT32': 0x38, 242 197 'VECTOR32': 0x3A, 243 198 'BRK64': 0x3C, ··· 275 220 'S390_TSCH': 22, 276 221 'EPR': 23, 277 222 'SYSTEM_EVENT': 24, 223 + 'S390_STSI': 
25, 224 + 'IOAPIC_EOI': 26, 225 + 'HYPERV': 27, 226 + 'ARM_NISV': 28, 227 + 'X86_RDMSR': 29, 228 + 'X86_WRMSR': 30, 229 + 'DIRTY_RING_FULL': 31, 230 + 'AP_RESET_HOLD': 32, 231 + 'X86_BUS_LOCK': 33, 232 + 'XEN': 34, 233 + 'RISCV_SBI': 35, 234 + 'RISCV_CSR': 36, 235 + 'NOTIFY': 37, 278 236 } 279 237 280 238 IOCTL_NUMBERS = { ··· 1824 1756 1825 1757 debugfs = '' 1826 1758 for line in open('/proc/mounts'): 1827 - if line.split(' ')[0] == 'debugfs': 1759 + if line.split(' ')[2] == 'debugfs': 1828 1760 debugfs = line.split(' ')[1] 1829 1761 break 1830 1762 if debugfs == '':