Merge tag 'kvm-x86-misc-6.17' of https://github.com/kvm-x86/linux into HEAD

+24 -1

Documentation/virt/kvm/api.rst

··· 2006 2006 2007 2007 If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also 2008 2008 be used as a vm ioctl to set the initial tsc frequency of subsequently 2009 - created vCPUs. 2009 + created vCPUs. Note, the vm ioctl is only allowed prior to creating vCPUs. 2010 2010 2011 2011 For TSC protected Confidential Computing (CoCo) VMs where TSC frequency 2012 2012 is configured once at VM scope and remains unchanged during VM's ··· 7851 7851 #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) 7852 7852 #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) 7853 7853 #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) 7854 + #define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) 7854 7855 7855 7856 Enabling this capability on a VM provides userspace with a way to no 7856 7857 longer intercept some instructions for improved latency in some ··· 7861 7860 all such vmexits. 7862 7861 7863 7862 Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. 7863 + 7864 + Virtualizing the ``IA32_APERF`` and ``IA32_MPERF`` MSRs requires more 7865 + than just disabling APERF/MPERF exits. While both Intel and AMD 7866 + document strict usage conditions for these MSRs--emphasizing that only 7867 + the ratio of their deltas over a time interval (T0 to T1) is 7868 + architecturally defined--simply passing through the MSRs can still 7869 + produce an incorrect ratio. 7870 + 7871 + This erroneous ratio can occur if, between T0 and T1: 7872 + 7873 + 1. The vCPU thread migrates between logical processors. 7874 + 2. Live migration or suspend/resume operations take place. 7875 + 3. Another task shares the vCPU's logical processor. 7876 + 4. C-states lower than C0 are emulated (e.g., via HLT interception). 7877 + 5. The guest TSC frequency doesn't match the host TSC frequency. 7878 + 7879 + Due to these complexities, KVM does not automatically associate this 7880 + passthrough capability with the guest CPUID bit, 7881 + ``CPUID.6:ECX.APERFMPERF[bit 0]``. 
Userspace VMMs that deem this 7882 + mechanism adequate for virtualizing the ``IA32_APERF`` and 7883 + ``IA32_MPERF`` MSRs must set the guest CPUID bit explicitly. 7884 + 7864 7885 7865 7886 7.14 KVM_CAP_S390_HPAGE_1M 7866 7887 --------------------------

+1 -2

arch/x86/include/asm/kvm-x86-ops.h

··· 49 49 KVM_X86_OP(get_gdt) 50 50 KVM_X86_OP(set_gdt) 51 51 KVM_X86_OP(sync_dirty_debug_regs) 52 - KVM_X86_OP(set_dr6) 53 52 KVM_X86_OP(set_dr7) 54 53 KVM_X86_OP(cache_reg) 55 54 KVM_X86_OP(get_rflags) ··· 138 139 KVM_X86_OP(apic_init_signal_blocked) 139 140 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) 140 141 KVM_X86_OP_OPTIONAL(migrate_timers) 141 - KVM_X86_OP(msr_filter_changed) 142 + KVM_X86_OP(recalc_msr_intercepts) 142 143 KVM_X86_OP(complete_emulated_msr) 143 144 KVM_X86_OP(vcpu_deliver_sipi_vector) 144 145 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);

+15 -7

arch/x86/include/asm/kvm_host.h

··· 1408 1408 1409 1409 gpa_t wall_clock; 1410 1410 1411 - bool mwait_in_guest; 1412 - bool hlt_in_guest; 1413 - bool pause_in_guest; 1414 - bool cstate_in_guest; 1411 + u64 disabled_exits; 1415 1412 1416 1413 s64 kvmclock_offset; 1417 1414 ··· 1684 1687 return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; 1685 1688 } 1686 1689 1690 + enum kvm_x86_run_flags { 1691 + KVM_RUN_FORCE_IMMEDIATE_EXIT = BIT(0), 1692 + KVM_RUN_LOAD_GUEST_DR6 = BIT(1), 1693 + KVM_RUN_LOAD_DEBUGCTL = BIT(2), 1694 + }; 1695 + 1687 1696 struct kvm_x86_ops { 1688 1697 const char *name; 1689 1698 ··· 1718 1715 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 1719 1716 void (*vcpu_put)(struct kvm_vcpu *vcpu); 1720 1717 1718 + /* 1719 + * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to 1720 + * match the host's value even while the guest is active. 1721 + */ 1722 + const u64 HOST_OWNED_DEBUGCTL; 1723 + 1721 1724 void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); 1722 1725 int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); 1723 1726 int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); ··· 1746 1737 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 1747 1738 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); 1748 1739 void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); 1749 - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); 1750 1740 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); 1751 1741 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 1752 1742 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); ··· 1776 1768 1777 1769 int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); 1778 1770 enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, 1779 - bool force_immediate_exit); 1771 + u64 run_flags); 1780 1772 int (*handle_exit)(struct kvm_vcpu *vcpu, 1781 1773 enum exit_fastpath_completion exit_fastpath); 1782 1774 int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); ··· 1908 1900 
int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); 1909 1901 1910 1902 void (*migrate_timers)(struct kvm_vcpu *vcpu); 1911 - void (*msr_filter_changed)(struct kvm_vcpu *vcpu); 1903 + void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu); 1912 1904 int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); 1913 1905 1914 1906 void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);

+1

arch/x86/include/asm/msr-index.h

··· 419 419 #define DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI (1UL << 12) 420 420 #define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 421 421 #define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) 422 + #define DEBUGCTLMSR_RTM_DEBUG BIT(15) 422 423 423 424 #define MSR_PEBS_FRONTEND 0x000003f7 424 425

+1

arch/x86/kvm/cpuid.c

··· 979 979 F(FSRS), 980 980 F(FSRC), 981 981 F(WRMSRNS), 982 + X86_64_F(LKGS), 982 983 F(AMX_FP16), 983 984 F(AVX_IFMA), 984 985 F(LAM),

+2

arch/x86/kvm/lapic.h

··· 21 21 #define APIC_BROADCAST 0xFF 22 22 #define X2APIC_BROADCAST 0xFFFFFFFFul 23 23 24 + #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) 25 + 24 26 enum lapic_mode { 25 27 LAPIC_MODE_DISABLED = 0, 26 28 LAPIC_MODE_INVALID = X2APIC_ENABLE,

+96 -32

arch/x86/kvm/svm/nested.c

··· 185 185 } 186 186 187 187 /* 188 + * This array (and its actual size) holds the set of offsets (indexing by chunk 189 + * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the 190 + * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g. 191 + * based on CPUID features. This array only tracks MSRs that *might* be passed 192 + * through to the guest. 193 + * 194 + * Hardcode the capacity of the array based on the maximum number of _offsets_. 195 + * MSRs are batched together, so there are fewer offsets than MSRs. 196 + */ 197 + static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; 198 + static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; 199 + typedef unsigned long nsvm_msrpm_merge_t; 200 + 201 + int __init nested_svm_init_msrpm_merge_offsets(void) 202 + { 203 + static const u32 merge_msrs[] __initconst = { 204 + MSR_STAR, 205 + MSR_IA32_SYSENTER_CS, 206 + MSR_IA32_SYSENTER_EIP, 207 + MSR_IA32_SYSENTER_ESP, 208 + #ifdef CONFIG_X86_64 209 + MSR_GS_BASE, 210 + MSR_FS_BASE, 211 + MSR_KERNEL_GS_BASE, 212 + MSR_LSTAR, 213 + MSR_CSTAR, 214 + MSR_SYSCALL_MASK, 215 + #endif 216 + MSR_IA32_SPEC_CTRL, 217 + MSR_IA32_PRED_CMD, 218 + MSR_IA32_FLUSH_CMD, 219 + MSR_IA32_APERF, 220 + MSR_IA32_MPERF, 221 + MSR_IA32_LASTBRANCHFROMIP, 222 + MSR_IA32_LASTBRANCHTOIP, 223 + MSR_IA32_LASTINTFROMIP, 224 + MSR_IA32_LASTINTTOIP, 225 + }; 226 + int i, j; 227 + 228 + for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) { 229 + int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]); 230 + u32 offset; 231 + 232 + if (WARN_ON(bit_nr < 0)) 233 + return -EIO; 234 + 235 + /* 236 + * Merging is done in chunks to reduce the number of accesses 237 + * to L1's bitmap. 
238 + */ 239 + offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t); 240 + 241 + for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) { 242 + if (nested_svm_msrpm_merge_offsets[j] == offset) 243 + break; 244 + } 245 + 246 + if (j < nested_svm_nr_msrpm_merge_offsets) 247 + continue; 248 + 249 + if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets))) 250 + return -EIO; 251 + 252 + nested_svm_msrpm_merge_offsets[j] = offset; 253 + nested_svm_nr_msrpm_merge_offsets++; 254 + } 255 + 256 + return 0; 257 + } 258 + 259 + /* 188 260 * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function 189 261 * is optimized in that it only merges the parts where KVM MSR permission bitmap 190 262 * may contain zero bits. 191 263 */ 192 - static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 264 + static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu) 193 265 { 266 + struct vcpu_svm *svm = to_svm(vcpu); 267 + nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm; 268 + nsvm_msrpm_merge_t *msrpm01 = svm->msrpm; 194 269 int i; 195 270 196 271 /* ··· 280 205 if (!svm->nested.force_msr_bitmap_recalc) { 281 206 struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments; 282 207 283 - if (kvm_hv_hypercall_enabled(&svm->vcpu) && 208 + if (kvm_hv_hypercall_enabled(vcpu) && 284 209 hve->hv_enlightenments_control.msr_bitmap && 285 210 (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS))) 286 211 goto set_msrpm_base_pa; ··· 290 215 if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) 291 216 return true; 292 217 293 - for (i = 0; i < MSRPM_OFFSETS; i++) { 294 - u32 value, p; 295 - u64 offset; 218 + for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) { 219 + const int p = nested_svm_msrpm_merge_offsets[i]; 220 + nsvm_msrpm_merge_t l1_val; 221 + gpa_t gpa; 296 222 297 - if (msrpm_offsets[i] == 0xffffffff) 298 - break; 223 + gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val)); 299 224 300 - p = msrpm_offsets[i]; 301 - 
302 - /* x2apic msrs are intercepted always for the nested guest */ 303 - if (is_x2apic_msrpm_offset(p)) 304 - continue; 305 - 306 - offset = svm->nested.ctl.msrpm_base_pa + (p * 4); 307 - 308 - if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) 225 + if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val))) 309 226 return false; 310 227 311 - svm->nested.msrpm[p] = svm->msrpm[p] | value; 228 + msrpm02[p] = msrpm01[p] | l1_val; 312 229 } 313 230 314 231 svm->nested.force_msr_bitmap_recalc = false; ··· 1004 937 if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) 1005 938 goto out_exit_err; 1006 939 1007 - if (nested_svm_vmrun_msrpm(svm)) 940 + if (nested_svm_merge_msrpm(vcpu)) 1008 941 goto out; 1009 942 1010 943 out_exit_err: ··· 1297 1230 svm->nested.msrpm = svm_vcpu_alloc_msrpm(); 1298 1231 if (!svm->nested.msrpm) 1299 1232 goto err_free_vmcb02; 1300 - svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); 1301 1233 1302 1234 svm->nested.initialized = true; 1303 1235 return 0; ··· 1356 1290 1357 1291 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1358 1292 { 1359 - u32 offset, msr, value; 1360 - int write, mask; 1293 + gpa_t base = svm->nested.ctl.msrpm_base_pa; 1294 + int write, bit_nr; 1295 + u8 value, mask; 1296 + u32 msr; 1361 1297 1362 1298 if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) 1363 1299 return NESTED_EXIT_HOST; 1364 1300 1365 1301 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1366 - offset = svm_msrpm_offset(msr); 1302 + bit_nr = svm_msrpm_bit_nr(msr); 1367 1303 write = svm->vmcb->control.exit_info_1 & 1; 1368 - mask = 1 << ((2 * (msr & 0xf)) + write); 1369 1304 1370 - if (offset == MSR_INVALID) 1305 + if (bit_nr < 0) 1371 1306 return NESTED_EXIT_DONE; 1372 1307 1373 - /* Offset is in 32 bit units but need in 8 bit units */ 1374 - offset *= 4; 1375 - 1376 - if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4)) 1308 + if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / 
BITS_PER_BYTE, 1309 + &value, sizeof(value))) 1377 1310 return NESTED_EXIT_DONE; 1378 1311 1312 + mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1)); 1379 1313 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 1380 1314 } 1381 1315 ··· 1885 1819 1886 1820 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) 1887 1821 { 1888 - struct vcpu_svm *svm = to_svm(vcpu); 1889 - 1890 1822 if (WARN_ON(!is_guest_mode(vcpu))) 1891 1823 return true; 1892 1824 1893 1825 if (!vcpu->arch.pdptrs_from_userspace && 1894 - !nested_npt_enabled(svm) && is_pae_paging(vcpu)) 1826 + !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu)) 1895 1827 /* 1896 1828 * Reload the guest's PDPTRs since after a migration 1897 1829 * the guest CR3 might be restored prior to setting the nested ··· 1898 1834 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 1899 1835 return false; 1900 1836 1901 - if (!nested_svm_vmrun_msrpm(svm)) { 1837 + if (!nested_svm_merge_msrpm(vcpu)) { 1902 1838 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1903 1839 vcpu->run->internal.suberror = 1904 1840 KVM_INTERNAL_ERROR_EMULATION;

+12 -21

arch/x86/kvm/svm/sev.c

··· 4390 4390 count, in); 4391 4391 } 4392 4392 4393 - static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) 4393 + void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4394 4394 { 4395 - struct kvm_vcpu *vcpu = &svm->vcpu; 4395 + /* Clear intercepts on MSRs that are context switched by hardware. */ 4396 + svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); 4397 + svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); 4398 + svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); 4396 4399 4397 - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { 4398 - bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4399 - guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4400 - 4401 - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); 4402 - } 4400 + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) 4401 + svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, 4402 + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && 4403 + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); 4403 4404 4404 4405 /* 4405 4406 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if ··· 4414 4413 * XSAVES being exposed to the guest so that KVM can at least honor 4415 4414 * guest CPUID for RDMSR and WRMSR. 
4416 4415 */ 4417 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && 4418 - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) 4419 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); 4420 - else 4421 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); 4416 + svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, 4417 + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || 4418 + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); 4422 4419 } 4423 4420 4424 4421 void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) ··· 4428 4429 best = kvm_find_cpuid_entry(vcpu, 0x8000001F); 4429 4430 if (best) 4430 4431 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 4431 - 4432 - if (sev_es_guest(svm->vcpu.kvm)) 4433 - sev_es_vcpu_after_set_cpuid(svm); 4434 4432 } 4435 4433 4436 4434 static void sev_es_init_vmcb(struct vcpu_svm *svm) 4437 4435 { 4438 4436 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4439 4437 struct vmcb *vmcb = svm->vmcb01.ptr; 4440 - struct kvm_vcpu *vcpu = &svm->vcpu; 4441 4438 4442 4439 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; 4443 4440 ··· 4491 4496 4492 4497 /* Can't intercept XSETBV, HV can't modify XCR0 directly */ 4493 4498 svm_clr_intercept(svm, INTERCEPT_XSETBV); 4494 - 4495 - /* Clear intercepts on selected MSRs */ 4496 - set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); 4497 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); 4498 4499 } 4499 4500 4500 4501 void sev_init_vmcb(struct vcpu_svm *svm)

+167 -339

arch/x86/kvm/svm/svm.c

··· 72 72 73 73 static bool erratum_383_found __read_mostly; 74 74 75 - u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; 76 - 77 75 /* 78 76 * Set osvw_len to higher value when updated Revision Guides 79 77 * are published and we know what the new status bits are ··· 79 81 static uint64_t osvw_len = 4, osvw_status; 80 82 81 83 static DEFINE_PER_CPU(u64, current_tsc_ratio); 82 - 83 - #define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) 84 - 85 - static const struct svm_direct_access_msrs { 86 - u32 index; /* Index of the MSR */ 87 - bool always; /* True if intercept is initially cleared */ 88 - } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { 89 - { .index = MSR_STAR, .always = true }, 90 - { .index = MSR_IA32_SYSENTER_CS, .always = true }, 91 - { .index = MSR_IA32_SYSENTER_EIP, .always = false }, 92 - { .index = MSR_IA32_SYSENTER_ESP, .always = false }, 93 - #ifdef CONFIG_X86_64 94 - { .index = MSR_GS_BASE, .always = true }, 95 - { .index = MSR_FS_BASE, .always = true }, 96 - { .index = MSR_KERNEL_GS_BASE, .always = true }, 97 - { .index = MSR_LSTAR, .always = true }, 98 - { .index = MSR_CSTAR, .always = true }, 99 - { .index = MSR_SYSCALL_MASK, .always = true }, 100 - #endif 101 - { .index = MSR_IA32_SPEC_CTRL, .always = false }, 102 - { .index = MSR_IA32_PRED_CMD, .always = false }, 103 - { .index = MSR_IA32_FLUSH_CMD, .always = false }, 104 - { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, 105 - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, 106 - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, 107 - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, 108 - { .index = MSR_IA32_LASTINTTOIP, .always = false }, 109 - { .index = MSR_IA32_XSS, .always = false }, 110 - { .index = MSR_EFER, .always = false }, 111 - { .index = MSR_IA32_CR_PAT, .always = false }, 112 - { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, 113 - { .index = MSR_TSC_AUX, .always = false }, 114 - { .index = X2APIC_MSR(APIC_ID), .always = false }, 115 - { .index = 
X2APIC_MSR(APIC_LVR), .always = false }, 116 - { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, 117 - { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, 118 - { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, 119 - { .index = X2APIC_MSR(APIC_EOI), .always = false }, 120 - { .index = X2APIC_MSR(APIC_RRR), .always = false }, 121 - { .index = X2APIC_MSR(APIC_LDR), .always = false }, 122 - { .index = X2APIC_MSR(APIC_DFR), .always = false }, 123 - { .index = X2APIC_MSR(APIC_SPIV), .always = false }, 124 - { .index = X2APIC_MSR(APIC_ISR), .always = false }, 125 - { .index = X2APIC_MSR(APIC_TMR), .always = false }, 126 - { .index = X2APIC_MSR(APIC_IRR), .always = false }, 127 - { .index = X2APIC_MSR(APIC_ESR), .always = false }, 128 - { .index = X2APIC_MSR(APIC_ICR), .always = false }, 129 - { .index = X2APIC_MSR(APIC_ICR2), .always = false }, 130 - 131 - /* 132 - * Note: 133 - * AMD does not virtualize APIC TSC-deadline timer mode, but it is 134 - * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, 135 - * the AVIC hardware would generate GP fault. Therefore, always 136 - * intercept the MSR 0x832, and do not setup direct_access_msr. 137 - */ 138 - { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, 139 - { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, 140 - { .index = X2APIC_MSR(APIC_LVT0), .always = false }, 141 - { .index = X2APIC_MSR(APIC_LVT1), .always = false }, 142 - { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, 143 - { .index = X2APIC_MSR(APIC_TMICT), .always = false }, 144 - { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, 145 - { .index = X2APIC_MSR(APIC_TDCR), .always = false }, 146 - { .index = MSR_INVALID, .always = false }, 147 - }; 148 84 149 85 /* 150 86 * These 2 parameters are used to config the controls for Pause-Loop Exiting: ··· 196 264 * defer the restoration of TSC_AUX until the CPU returns to userspace. 
197 265 */ 198 266 static int tsc_aux_uret_slot __read_mostly = -1; 199 - 200 - static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 201 - 202 - #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 203 - #define MSRS_RANGE_SIZE 2048 204 - #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 205 - 206 - u32 svm_msrpm_offset(u32 msr) 207 - { 208 - u32 offset; 209 - int i; 210 - 211 - for (i = 0; i < NUM_MSR_MAPS; i++) { 212 - if (msr < msrpm_ranges[i] || 213 - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) 214 - continue; 215 - 216 - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ 217 - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ 218 - 219 - /* Now we have the u8 offset - but need the u32 offset */ 220 - return offset / 4; 221 - } 222 - 223 - /* MSR not in any range */ 224 - return MSR_INVALID; 225 - } 226 267 227 268 static int get_npt_level(void) 228 269 { ··· 663 758 recalc_intercepts(svm); 664 759 } 665 760 666 - static int direct_access_msr_slot(u32 msr) 667 - { 668 - u32 i; 669 - 670 - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) 671 - if (direct_access_msrs[i].index == msr) 672 - return i; 673 - 674 - return -ENOENT; 675 - } 676 - 677 - static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, 678 - int write) 679 - { 680 - struct vcpu_svm *svm = to_svm(vcpu); 681 - int slot = direct_access_msr_slot(msr); 682 - 683 - if (slot == -ENOENT) 684 - return; 685 - 686 - /* Set the shadow bitmaps to the desired intercept states */ 687 - if (read) 688 - set_bit(slot, svm->shadow_msr_intercept.read); 689 - else 690 - clear_bit(slot, svm->shadow_msr_intercept.read); 691 - 692 - if (write) 693 - set_bit(slot, svm->shadow_msr_intercept.write); 694 - else 695 - clear_bit(slot, svm->shadow_msr_intercept.write); 696 - } 697 - 698 - static bool valid_msr_intercept(u32 index) 699 - { 700 - return direct_access_msr_slot(index) != -ENOENT; 701 - } 702 - 703 761 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) 704 
762 { 705 - u8 bit_write; 706 - unsigned long tmp; 707 - u32 offset; 708 - u32 *msrpm; 709 - 710 763 /* 711 764 * For non-nested case: 712 765 * If the L01 MSR bitmap does not intercept the MSR, then we need to ··· 674 811 * If the L02 MSR bitmap does not intercept the MSR, then we need to 675 812 * save it. 676 813 */ 677 - msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: 678 - to_svm(vcpu)->msrpm; 814 + void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : 815 + to_svm(vcpu)->msrpm; 679 816 680 - offset = svm_msrpm_offset(msr); 681 - bit_write = 2 * (msr & 0x0f) + 1; 682 - tmp = msrpm[offset]; 683 - 684 - BUG_ON(offset == MSR_INVALID); 685 - 686 - return test_bit(bit_write, &tmp); 817 + return svm_test_msr_bitmap_write(msrpm, msr); 687 818 } 688 819 689 - static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, 690 - u32 msr, int read, int write) 820 + void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 691 821 { 692 822 struct vcpu_svm *svm = to_svm(vcpu); 693 - u8 bit_read, bit_write; 694 - unsigned long tmp; 695 - u32 offset; 823 + void *msrpm = svm->msrpm; 696 824 697 - /* 698 - * If this warning triggers extend the direct_access_msrs list at the 699 - * beginning of the file 700 - */ 701 - WARN_ON(!valid_msr_intercept(msr)); 825 + /* Don't disable interception for MSRs userspace wants to handle. 
*/ 826 + if (type & MSR_TYPE_R) { 827 + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 828 + svm_clear_msr_bitmap_read(msrpm, msr); 829 + else 830 + svm_set_msr_bitmap_read(msrpm, msr); 831 + } 702 832 703 - /* Enforce non allowed MSRs to trap */ 704 - if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 705 - read = 0; 706 - 707 - if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 708 - write = 0; 709 - 710 - offset = svm_msrpm_offset(msr); 711 - bit_read = 2 * (msr & 0x0f); 712 - bit_write = 2 * (msr & 0x0f) + 1; 713 - tmp = msrpm[offset]; 714 - 715 - BUG_ON(offset == MSR_INVALID); 716 - 717 - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); 718 - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); 719 - 720 - msrpm[offset] = tmp; 833 + if (type & MSR_TYPE_W) { 834 + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 835 + svm_clear_msr_bitmap_write(msrpm, msr); 836 + else 837 + svm_set_msr_bitmap_write(msrpm, msr); 838 + } 721 839 722 840 svm_hv_vmcb_dirty_nested_enlightenments(vcpu); 723 841 svm->nested.force_msr_bitmap_recalc = true; 724 842 } 725 843 726 - void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, 727 - int read, int write) 844 + void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) 728 845 { 729 - set_shadow_msr_intercept(vcpu, msr, read, write); 730 - set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); 731 - } 732 - 733 - u32 *svm_vcpu_alloc_msrpm(void) 734 - { 735 - unsigned int order = get_order(MSRPM_SIZE); 736 - struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); 737 - u32 *msrpm; 846 + unsigned int order = get_order(size); 847 + struct page *pages = alloc_pages(gfp_mask, order); 848 + void *pm; 738 849 739 850 if (!pages) 740 851 return NULL; 741 852 742 - msrpm = page_address(pages); 743 - memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); 853 + /* 854 + * Set all bits in the permissions map so that all MSR and I/O accesses 855 + 
* are intercepted by default. 856 + */ 857 + pm = page_address(pages); 858 + memset(pm, 0xff, PAGE_SIZE * (1 << order)); 744 859 745 - return msrpm; 860 + return pm; 746 861 } 747 862 748 - void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) 863 + static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) 749 864 { 750 - int i; 865 + bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); 751 866 752 - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 753 - if (!direct_access_msrs[i].always) 754 - continue; 755 - set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); 756 - } 867 + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); 868 + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); 869 + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); 870 + svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); 871 + 872 + if (sev_es_guest(vcpu->kvm)) 873 + svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); 757 874 } 758 875 759 876 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) 760 877 { 878 + static const u32 x2avic_passthrough_msrs[] = { 879 + X2APIC_MSR(APIC_ID), 880 + X2APIC_MSR(APIC_LVR), 881 + X2APIC_MSR(APIC_TASKPRI), 882 + X2APIC_MSR(APIC_ARBPRI), 883 + X2APIC_MSR(APIC_PROCPRI), 884 + X2APIC_MSR(APIC_EOI), 885 + X2APIC_MSR(APIC_RRR), 886 + X2APIC_MSR(APIC_LDR), 887 + X2APIC_MSR(APIC_DFR), 888 + X2APIC_MSR(APIC_SPIV), 889 + X2APIC_MSR(APIC_ISR), 890 + X2APIC_MSR(APIC_TMR), 891 + X2APIC_MSR(APIC_IRR), 892 + X2APIC_MSR(APIC_ESR), 893 + X2APIC_MSR(APIC_ICR), 894 + X2APIC_MSR(APIC_ICR2), 895 + 896 + /* 897 + * Note! Always intercept LVTT, as TSC-deadline timer mode 898 + * isn't virtualized by hardware, and the CPU will generate a 899 + * #GP instead of a #VMEXIT. 
900 + */ 901 + X2APIC_MSR(APIC_LVTTHMR), 902 + X2APIC_MSR(APIC_LVTPC), 903 + X2APIC_MSR(APIC_LVT0), 904 + X2APIC_MSR(APIC_LVT1), 905 + X2APIC_MSR(APIC_LVTERR), 906 + X2APIC_MSR(APIC_TMICT), 907 + X2APIC_MSR(APIC_TMCCT), 908 + X2APIC_MSR(APIC_TDCR), 909 + }; 761 910 int i; 762 911 763 912 if (intercept == svm->x2avic_msrs_intercepted) ··· 778 903 if (!x2avic_enabled) 779 904 return; 780 905 781 - for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { 782 - int index = direct_access_msrs[i].index; 783 - 784 - if ((index < APIC_BASE_MSR) || 785 - (index > APIC_BASE_MSR + 0xff)) 786 - continue; 787 - set_msr_interception(&svm->vcpu, svm->msrpm, index, 788 - !intercept, !intercept); 789 - } 906 + for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) 907 + svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], 908 + MSR_TYPE_RW, intercept); 790 909 791 910 svm->x2avic_msrs_intercepted = intercept; 792 911 } 793 912 794 - void svm_vcpu_free_msrpm(u32 *msrpm) 913 + void svm_vcpu_free_msrpm(void *msrpm) 795 914 { 796 915 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); 797 916 } 798 917 799 - static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) 918 + static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 800 919 { 801 920 struct vcpu_svm *svm = to_svm(vcpu); 802 - u32 i; 921 + 922 + svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); 923 + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 924 + 925 + #ifdef CONFIG_X86_64 926 + svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 927 + svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 928 + svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 929 + svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); 930 + svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); 931 + svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); 932 + #endif 933 + 934 + if (lbrv) 935 + svm_recalc_lbr_msr_intercepts(vcpu); 936 + 
937 + if (cpu_feature_enabled(X86_FEATURE_IBPB)) 938 + svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 939 + !guest_has_pred_cmd_msr(vcpu)); 940 + 941 + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 942 + svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 943 + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 803 944 804 945 /* 805 - * Set intercept permissions for all direct access MSRs again. They 806 - * will automatically get filtered through the MSR filter, so we are 807 - * back in sync after this. 946 + * Disable interception of SPEC_CTRL if KVM doesn't need to manually 947 + * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if 948 + * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively 949 + * using SPEC_CTRL. 808 950 */ 809 - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 810 - u32 msr = direct_access_msrs[i].index; 811 - u32 read = test_bit(i, svm->shadow_msr_intercept.read); 812 - u32 write = test_bit(i, svm->shadow_msr_intercept.write); 813 - 814 - set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); 815 - } 816 - } 817 - 818 - static void add_msr_offset(u32 offset) 819 - { 820 - int i; 821 - 822 - for (i = 0; i < MSRPM_OFFSETS; ++i) { 823 - 824 - /* Offset already in list? */ 825 - if (msrpm_offsets[i] == offset) 826 - return; 827 - 828 - /* Slot used by another offset? */ 829 - if (msrpm_offsets[i] != MSR_INVALID) 830 - continue; 831 - 832 - /* Add offset to list */ 833 - msrpm_offsets[i] = offset; 834 - 835 - return; 836 - } 951 + if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) 952 + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 953 + !guest_has_spec_ctrl_msr(vcpu)); 954 + else 955 + svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 956 + !svm->spec_ctrl); 837 957 838 958 /* 839 - * If this BUG triggers the msrpm_offsets table has an overflow. Just 840 - * increase MSRPM_OFFSETS in this case. 
959 + * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, 960 + * as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits. 841 961 */ 842 - BUG(); 843 - } 962 + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, 963 + guest_cpuid_is_intel_compatible(vcpu)); 964 + svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, 965 + guest_cpuid_is_intel_compatible(vcpu)); 844 966 845 - static void init_msrpm_offsets(void) 846 - { 847 - int i; 848 - 849 - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); 850 - 851 - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 852 - u32 offset; 853 - 854 - offset = svm_msrpm_offset(direct_access_msrs[i].index); 855 - BUG_ON(offset == MSR_INVALID); 856 - 857 - add_msr_offset(offset); 967 + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 968 + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 969 + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 858 970 } 971 + 972 + if (sev_es_guest(vcpu->kvm)) 973 + sev_es_recalc_msr_intercepts(vcpu); 974 + 975 + /* 976 + * x2APIC intercepts are modified on-demand and cannot be filtered by 977 + * userspace. 978 + */ 859 979 } 860 980 861 981 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) ··· 869 999 struct vcpu_svm *svm = to_svm(vcpu); 870 1000 871 1001 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 872 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); 873 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); 874 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); 875 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); 876 - 877 - if (sev_es_guest(vcpu->kvm)) 878 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); 1002 + svm_recalc_lbr_msr_intercepts(vcpu); 879 1003 880 1004 /* Move the LBR msrs to the vmcb02 so that the guest can see them. 
*/ 881 1005 if (is_guest_mode(vcpu)) ··· 881 1017 struct vcpu_svm *svm = to_svm(vcpu); 882 1018 883 1019 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 884 - 885 1020 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 886 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); 887 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); 888 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); 889 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); 1021 + svm_recalc_lbr_msr_intercepts(vcpu); 890 1022 891 1023 /* 892 1024 * Move the LBR msrs back to the vmcb01 to avoid copying them ··· 1037 1177 } 1038 1178 1039 1179 /* Evaluate instruction intercepts that depend on guest CPUID features. */ 1040 - static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, 1041 - struct vcpu_svm *svm) 1180 + static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 1042 1181 { 1182 + struct vcpu_svm *svm = to_svm(vcpu); 1183 + 1043 1184 /* 1044 1185 * Intercept INVPCID if shadow paging is enabled to sync/free shadow 1045 1186 * roots, or if INVPCID is disabled in the guest to inject #UD. ··· 1059 1198 else 1060 1199 svm_set_intercept(svm, INTERCEPT_RDTSCP); 1061 1200 } 1062 - } 1063 - 1064 - static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) 1065 - { 1066 - struct vcpu_svm *svm = to_svm(vcpu); 1067 1201 1068 1202 if (guest_cpuid_is_intel_compatible(vcpu)) { 1069 - /* 1070 - * We must intercept SYSENTER_EIP and SYSENTER_ESP 1071 - * accesses because the processor only stores 32 bits. 1072 - * For the same reason we cannot use virtual VMLOAD/VMSAVE. 
1073 - */ 1074 1203 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1075 1204 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1076 1205 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1077 - 1078 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); 1079 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); 1080 1206 } else { 1081 1207 /* 1082 1208 * If hardware supports Virtual VMLOAD VMSAVE then enable it ··· 1074 1226 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1075 1227 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1076 1228 } 1077 - /* No need to intercept these MSRs */ 1078 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); 1079 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); 1080 1229 } 1230 + } 1231 + 1232 + static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) 1233 + { 1234 + svm_recalc_instruction_intercepts(vcpu); 1235 + svm_recalc_msr_intercepts(vcpu); 1081 1236 } 1082 1237 1083 1238 static void init_vmcb(struct kvm_vcpu *vcpu) ··· 1205 1354 svm_clr_intercept(svm, INTERCEPT_PAUSE); 1206 1355 } 1207 1356 1208 - svm_recalc_instruction_intercepts(vcpu, svm); 1209 - 1210 - /* 1211 - * If the host supports V_SPEC_CTRL then disable the interception 1212 - * of MSR_IA32_SPEC_CTRL. 
1213 - */ 1214 - if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 1215 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 1216 - 1217 1357 if (kvm_vcpu_apicv_active(vcpu)) 1218 1358 avic_init_vmcb(svm, vmcb); 1219 1359 ··· 1224 1382 sev_init_vmcb(svm); 1225 1383 1226 1384 svm_hv_init_vmcb(vmcb); 1227 - init_vmcb_after_set_cpuid(vcpu); 1385 + 1386 + svm_recalc_intercepts_after_set_cpuid(vcpu); 1228 1387 1229 1388 vmcb_mark_all_dirty(vmcb); 1230 1389 ··· 1235 1392 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) 1236 1393 { 1237 1394 struct vcpu_svm *svm = to_svm(vcpu); 1238 - 1239 - svm_vcpu_init_msrpm(vcpu, svm->msrpm); 1240 1395 1241 1396 svm_init_osvw(vcpu); 1242 1397 ··· 1340 1499 sev_free_vcpu(vcpu); 1341 1500 1342 1501 __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1343 - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); 1502 + svm_vcpu_free_msrpm(svm->msrpm); 1344 1503 } 1345 1504 1346 1505 #ifdef CONFIG_CPU_MITIGATIONS ··· 2724 2883 return 0; 2725 2884 } 2726 2885 2727 - static bool 2728 - sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2886 + static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2887 + struct msr_data *msr_info) 2729 2888 { 2730 2889 return sev_es_guest(vcpu->kvm) && 2731 2890 vcpu->arch.guest_state_protected && 2732 - svm_msrpm_offset(msr_info->index) != MSR_INVALID && 2733 2891 !msr_write_intercepted(vcpu, msr_info->index); 2734 2892 } 2735 2893 ··· 2959 3119 * 2960 3120 * For nested: 2961 3121 * The handling of the MSR bitmap for L2 guests is done in 2962 - * nested_svm_vmrun_msrpm. 3122 + * nested_svm_merge_msrpm(). 2963 3123 * We update the L1 MSR bit as well since it will end up 2964 3124 * touching the MSR anyway now. 
2965 3125 */ 2966 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 3126 + svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 2967 3127 break; 2968 3128 case MSR_AMD64_VIRT_SPEC_CTRL: 2969 3129 if (!msr->host_initiated && ··· 3029 3189 3030 3190 /* 3031 3191 * TSC_AUX is usually changed only during boot and never read 3032 - * directly. Intercept TSC_AUX instead of exposing it to the 3033 - * guest via direct_access_msrs, and switch it via user return. 3192 + * directly. Intercept TSC_AUX and switch it via user return. 3034 3193 */ 3035 3194 preempt_disable(); 3036 3195 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); ··· 4231 4392 guest_state_exit_irqoff(); 4232 4393 } 4233 4394 4234 - static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, 4235 - bool force_immediate_exit) 4395 + static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 4236 4396 { 4397 + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; 4237 4398 struct vcpu_svm *svm = to_svm(vcpu); 4238 4399 bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); 4239 4400 ··· 4280 4441 svm_hv_update_vp_id(svm->vmcb, vcpu); 4281 4442 4282 4443 /* 4283 - * Run with all-zero DR6 unless needed, so that we can get the exact cause 4284 - * of a #DB. 4444 + * Run with all-zero DR6 unless the guest can write DR6 freely, so that 4445 + * KVM can get the exact cause of a #DB. Note, loading guest DR6 from 4446 + * KVM's snapshot is only necessary when DR accesses won't exit. 
4285 4447 */ 4286 - if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) 4448 + if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) 4449 + svm_set_dr6(vcpu, vcpu->arch.dr6); 4450 + else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) 4287 4451 svm_set_dr6(vcpu, DR6_ACTIVE_LOW); 4288 4452 4289 4453 clgi(); ··· 4466 4624 if (guest_cpuid_is_intel_compatible(vcpu)) 4467 4625 guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); 4468 4626 4469 - svm_recalc_instruction_intercepts(vcpu, svm); 4470 - 4471 - if (boot_cpu_has(X86_FEATURE_IBPB)) 4472 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 4473 - !!guest_has_pred_cmd_msr(vcpu)); 4474 - 4475 - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 4476 - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, 4477 - !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4478 - 4479 4627 if (sev_guest(vcpu->kvm)) 4480 4628 sev_vcpu_after_set_cpuid(svm); 4481 4629 4482 - init_vmcb_after_set_cpuid(vcpu); 4630 + svm_recalc_intercepts_after_set_cpuid(vcpu); 4483 4631 } 4484 4632 4485 4633 static bool svm_has_wbinvd_exit(void) ··· 5020 5188 } 5021 5189 5022 5190 if (!pause_filter_count || !pause_filter_thresh) 5023 - kvm->arch.pause_in_guest = true; 5191 + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 5024 5192 5025 5193 if (enable_apicv) { 5026 5194 int ret = avic_vm_init(kvm); ··· 5087 5255 .set_idt = svm_set_idt, 5088 5256 .get_gdt = svm_get_gdt, 5089 5257 .set_gdt = svm_set_gdt, 5090 - .set_dr6 = svm_set_dr6, 5091 5258 .set_dr7 = svm_set_dr7, 5092 5259 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, 5093 5260 .cache_reg = svm_cache_reg, ··· 5171 5340 5172 5341 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 5173 5342 5174 - .msr_filter_changed = svm_msr_filter_changed, 5343 + .recalc_msr_intercepts = svm_recalc_msr_intercepts, 5175 5344 .complete_emulated_msr = svm_complete_emulated_msr, 5176 5345 5177 5346 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 
··· 5307 5476 5308 5477 static __init int svm_hardware_setup(void) 5309 5478 { 5310 - int cpu; 5311 - struct page *iopm_pages; 5312 5479 void *iopm_va; 5313 - int r; 5314 - unsigned int order = get_order(IOPM_SIZE); 5480 + int cpu, r; 5315 5481 5316 5482 /* 5317 5483 * NX is required for shadow paging and for NPT if the NX huge pages ··· 5319 5491 return -EOPNOTSUPP; 5320 5492 } 5321 5493 kvm_enable_efer_bits(EFER_NX); 5322 - 5323 - iopm_pages = alloc_pages(GFP_KERNEL, order); 5324 - 5325 - if (!iopm_pages) 5326 - return -ENOMEM; 5327 - 5328 - iopm_va = page_address(iopm_pages); 5329 - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 5330 - iopm_base = __sme_page_pa(iopm_pages); 5331 - 5332 - init_msrpm_offsets(); 5333 5494 5334 5495 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 5335 5496 XFEATURE_MASK_BNDCSR); ··· 5353 5536 if (nested) { 5354 5537 pr_info("Nested Virtualization enabled\n"); 5355 5538 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 5539 + 5540 + r = nested_svm_init_msrpm_merge_offsets(); 5541 + if (r) 5542 + return r; 5356 5543 } 5357 5544 5358 5545 /* ··· 5388 5567 else 5389 5568 pr_info("LBR virtualization supported\n"); 5390 5569 } 5570 + 5571 + iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); 5572 + if (!iopm_va) 5573 + return -ENOMEM; 5574 + 5575 + iopm_base = __sme_set(__pa(iopm_va)); 5576 + 5391 5577 /* 5392 5578 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which 5393 5579 * may be modified by svm_adjust_mmio_mask()), as well as nrips.

+86 -18

arch/x86/kvm/svm/svm.h

··· 44 44 #define IOPM_SIZE PAGE_SIZE * 3 45 45 #define MSRPM_SIZE PAGE_SIZE * 2 46 46 47 - #define MAX_DIRECT_ACCESS_MSRS 48 48 - #define MSRPM_OFFSETS 32 49 - extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; 50 47 extern bool npt_enabled; 51 48 extern int nrips; 52 49 extern int vgif; ··· 186 189 u64 vmcb12_gpa; 187 190 u64 last_vmcb12_gpa; 188 191 189 - /* These are the merged vectors */ 190 - u32 *msrpm; 192 + /* 193 + * The MSR permissions map used for vmcb02, which is the merge result 194 + * of vmcb01 and vmcb12 195 + */ 196 + void *msrpm; 191 197 192 198 /* A VMRUN has started but has not yet been performed, so 193 199 * we cannot inject a nested vmexit yet. */ ··· 271 271 */ 272 272 u64 virt_spec_ctrl; 273 273 274 - u32 *msrpm; 274 + void *msrpm; 275 275 276 276 ulong nmi_iret_rip; 277 277 ··· 325 325 */ 326 326 struct list_head ir_list; 327 327 spinlock_t ir_list_lock; 328 - 329 - /* Save desired MSR intercept (read: pass-through) state */ 330 - struct { 331 - DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); 332 - DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); 333 - } shadow_msr_intercept; 334 328 335 329 struct vcpu_sev_es_state sev_es; 336 330 ··· 615 621 svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); 616 622 } 617 623 618 - /* svm.c */ 619 - #define MSR_INVALID 0xffffffffU 624 + /* 625 + * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range 626 + * is reserved). Each MSR within a range is covered by two bits, one each for 627 + * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted. 
628 + */ 629 + #define SVM_MSRPM_BYTES_PER_RANGE 2048 630 + #define SVM_BITS_PER_MSR 2 631 + #define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR) 632 + #define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE) 633 + static_assert(SVM_MSRS_PER_RANGE == 8192); 634 + #define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1) 635 + 636 + static __always_inline int svm_msrpm_bit_nr(u32 msr) 637 + { 638 + int range_nr; 639 + 640 + switch (msr & ~SVM_MSRPM_OFFSET_MASK) { 641 + case 0: 642 + range_nr = 0; 643 + break; 644 + case 0xc0000000: 645 + range_nr = 1; 646 + break; 647 + case 0xc0010000: 648 + range_nr = 2; 649 + break; 650 + default: 651 + return -EINVAL; 652 + } 653 + 654 + return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE + 655 + (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR; 656 + } 657 + 658 + #define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \ 659 + static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \ 660 + u32 msr) \ 661 + { \ 662 + int bit_nr; \ 663 + \ 664 + bit_nr = svm_msrpm_bit_nr(msr); \ 665 + if (bit_nr < 0) \ 666 + return (rtype)true; \ 667 + \ 668 + return bitop##_bit(bit_nr + bit_rw, bitmap); \ 669 + } 670 + 671 + #define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ 672 + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \ 673 + __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1) 674 + 675 + BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test) 676 + BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear) 677 + BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set) 620 678 621 679 #define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) 622 680 681 + /* svm.c */ 623 682 extern bool dump_invalid_vmcb; 624 683 625 - u32 svm_msrpm_offset(u32 msr); 626 - u32 *svm_vcpu_alloc_msrpm(void); 627 - void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); 628 - void svm_vcpu_free_msrpm(u32 *msrpm); 684 + void *svm_alloc_permissions_map(unsigned long size, 
gfp_t gfp_mask); 685 + 686 + static inline void *svm_vcpu_alloc_msrpm(void) 687 + { 688 + return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT); 689 + } 690 + 691 + void svm_vcpu_free_msrpm(void *msrpm); 629 692 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); 630 693 void svm_enable_lbrv(struct kvm_vcpu *vcpu); 631 694 void svm_update_lbrv(struct kvm_vcpu *vcpu); ··· 701 650 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); 702 651 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 703 652 int trig_mode, int vec); 653 + 654 + void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); 655 + 656 + static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu, 657 + u32 msr, int type) 658 + { 659 + svm_set_intercept_for_msr(vcpu, msr, type, false); 660 + } 661 + 662 + static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu, 663 + u32 msr, int type) 664 + { 665 + svm_set_intercept_for_msr(vcpu, msr, type, true); 666 + } 704 667 705 668 /* nested.c */ 706 669 ··· 743 678 { 744 679 return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); 745 680 } 681 + 682 + int __init nested_svm_init_msrpm_merge_offsets(void); 746 683 747 684 int enter_svm_guest_mode(struct kvm_vcpu *vcpu, 748 685 u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); ··· 829 762 void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); 830 763 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); 831 764 void sev_es_vcpu_reset(struct vcpu_svm *svm); 765 + void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu); 832 766 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); 833 767 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); 834 768 void sev_es_unmap_ghcb(struct vcpu_svm *svm);

-2

arch/x86/kvm/vmx/common.h

··· 53 53 #ifdef CONFIG_X86_64 54 54 u64 msr_host_kernel_gs_base; 55 55 #endif 56 - 57 - unsigned long host_debugctlmsr; 58 56 }; 59 57 60 58 #ifdef CONFIG_KVM_INTEL_TDX

+8 -15

arch/x86/kvm/vmx/main.c

··· 175 175 return vmx_vcpu_pre_run(vcpu); 176 176 } 177 177 178 - static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 178 + static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 179 179 { 180 180 if (is_td_vcpu(vcpu)) 181 - return tdx_vcpu_run(vcpu, force_immediate_exit); 181 + return tdx_vcpu_run(vcpu, run_flags); 182 182 183 - return vmx_vcpu_run(vcpu, force_immediate_exit); 183 + return vmx_vcpu_run(vcpu, run_flags); 184 184 } 185 185 186 186 static int vt_handle_exit(struct kvm_vcpu *vcpu, ··· 220 220 return vmx_get_msr(vcpu, msr_info); 221 221 } 222 222 223 - static void vt_msr_filter_changed(struct kvm_vcpu *vcpu) 223 + static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 224 224 { 225 225 /* 226 226 * TDX doesn't allow VMM to configure interception of MSR accesses. ··· 231 231 if (is_td_vcpu(vcpu)) 232 232 return; 233 233 234 - vmx_msr_filter_changed(vcpu); 234 + vmx_recalc_msr_intercepts(vcpu); 235 235 } 236 236 237 237 static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) ··· 487 487 return; 488 488 489 489 vmx_set_gdt(vcpu, dt); 490 - } 491 - 492 - static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 493 - { 494 - if (is_td_vcpu(vcpu)) 495 - return; 496 - 497 - vmx_set_dr6(vcpu, val); 498 490 } 499 491 500 492 static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) ··· 915 923 .vcpu_load = vt_op(vcpu_load), 916 924 .vcpu_put = vt_op(vcpu_put), 917 925 926 + .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS, 927 + 918 928 .update_exception_bitmap = vt_op(update_exception_bitmap), 919 929 .get_feature_msr = vmx_get_feature_msr, 920 930 .get_msr = vt_op(get_msr), ··· 937 943 .set_idt = vt_op(set_idt), 938 944 .get_gdt = vt_op(get_gdt), 939 945 .set_gdt = vt_op(set_gdt), 940 - .set_dr6 = vt_op(set_dr6), 941 946 .set_dr7 = vt_op(set_dr7), 942 947 .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs), 943 948 .cache_reg = vt_op(cache_reg), ··· 1027 1034 
.apic_init_signal_blocked = vt_op(apic_init_signal_blocked), 1028 1035 .migrate_timers = vmx_migrate_timers, 1029 1036 1030 - .msr_filter_changed = vt_op(msr_filter_changed), 1037 + .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), 1031 1038 .complete_emulated_msr = vt_op(complete_emulated_msr), 1032 1039 1033 1040 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,

+22 -5

arch/x86/kvm/vmx/nested.c

··· 715 715 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 716 716 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 717 717 718 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 719 + MSR_IA32_APERF, MSR_TYPE_R); 720 + 721 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 722 + MSR_IA32_MPERF, MSR_TYPE_R); 723 + 718 724 kvm_vcpu_unmap(vcpu, &map); 719 725 720 726 vmx->nested.force_msr_bitmap_recalc = false; ··· 2669 2663 if (vmx->nested.nested_run_pending && 2670 2664 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2671 2665 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2672 - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2666 + vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & 2667 + vmx_get_supported_debugctl(vcpu, false)); 2673 2668 } else { 2674 2669 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2675 - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2670 + vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2676 2671 } 2677 2672 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2678 2673 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) ··· 3163 3156 return -EINVAL; 3164 3157 3165 3158 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3166 - CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3159 + (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || 3160 + CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) 3167 3161 return -EINVAL; 3168 3162 3169 3163 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && ··· 3538 3530 3539 3531 if (!vmx->nested.nested_run_pending || 3540 3532 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3541 - vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3533 + vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3542 3534 if (kvm_mpx_supported() && 3543 3535 (!vmx->nested.nested_run_pending || 3544 3536 !(vmcs12->vm_entry_controls & 
VM_ENTRY_LOAD_BNDCFGS))) ··· 4616 4608 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4617 4609 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4618 4610 4611 + /* 4612 + * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4613 + * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4614 + * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4615 + * vmcs02 doesn't strictly track vmcs12. 4616 + */ 4619 4617 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4620 4618 vmcs12->guest_dr7 = vcpu->arch.dr7; 4621 4619 ··· 4812 4798 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4813 4799 4814 4800 kvm_set_dr(vcpu, 7, 0x400); 4815 - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4801 + vmx_guest_debugctl_write(vcpu, 0); 4816 4802 4817 4803 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4818 4804 vmcs12->vm_exit_msr_load_count)) ··· 4866 4852 else 4867 4853 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4868 4854 } 4855 + 4856 + /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4857 + vmx_reload_guest_debugctl(vcpu); 4869 4858 4870 4859 /* 4871 4860 * Note that calling vmx_set_{efer,cr0,cr4} is important as they

+4 -4

arch/x86/kvm/vmx/pmu_intel.c

··· 653 653 */ 654 654 static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu) 655 655 { 656 - u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL); 656 + u64 data = vmx_guest_debugctl_read(); 657 657 658 658 if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) { 659 659 data &= ~DEBUGCTLMSR_LBR; 660 - vmcs_write64(GUEST_IA32_DEBUGCTL, data); 660 + vmx_guest_debugctl_write(vcpu, data); 661 661 } 662 662 } 663 663 ··· 730 730 731 731 if (!lbr_desc->event) { 732 732 vmx_disable_lbr_msrs_passthrough(vcpu); 733 - if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) 733 + if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR) 734 734 goto warn; 735 735 if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use)) 736 736 goto warn; ··· 752 752 753 753 static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) 754 754 { 755 - if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)) 755 + if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)) 756 756 intel_pmu_release_guest_lbr_event(vcpu); 757 757 } 758 758

+11 -13

arch/x86/kvm/vmx/tdx.c

··· 783 783 else 784 784 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 785 785 786 - vt->host_debugctlmsr = get_debugctlmsr(); 787 - 788 786 vt->guest_state_loaded = true; 789 787 } 790 788 ··· 1023 1025 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ 1024 1026 DEBUGCTLMSR_FREEZE_IN_SMM) 1025 1027 1026 - fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 1028 + fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 1027 1029 { 1028 1030 struct vcpu_tdx *tdx = to_tdx(vcpu); 1029 1031 struct vcpu_vt *vt = to_vt(vcpu); 1030 1032 1031 1033 /* 1032 - * force_immediate_exit requires vCPU entering for events injection with 1033 - * an immediately exit followed. But The TDX module doesn't guarantee 1034 - * entry, it's already possible for KVM to _think_ it completely entry 1035 - * to the guest without actually having done so. 1036 - * Since KVM never needs to force an immediate exit for TDX, and can't 1037 - * do direct injection, just warn on force_immediate_exit. 1034 + * WARN if KVM wants to force an immediate exit, as the TDX module does 1035 + * not guarantee entry into the guest, i.e. it's possible for KVM to 1036 + * _think_ it completed entry to the guest and forced an immediate exit 1037 + * without actually having done so. Luckily, KVM never needs to force 1038 + * an immediate exit for TDX (KVM can't do direct event injection), so 1039 + * just WARN and continue on. 
1038 1040 */ 1039 - WARN_ON_ONCE(force_immediate_exit); 1041 + WARN_ON_ONCE(run_flags); 1040 1042 1041 1043 /* 1042 1044 * Wait until retry of SEPT-zap-related SEAMCALL completes before ··· 1046 1048 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) 1047 1049 return EXIT_FASTPATH_EXIT_HANDLED; 1048 1050 1049 - trace_kvm_entry(vcpu, force_immediate_exit); 1051 + trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT); 1050 1052 1051 1053 if (pi_test_on(&vt->pi_desc)) { 1052 1054 apic->send_IPI_self(POSTED_INTR_VECTOR); ··· 1058 1060 1059 1061 tdx_vcpu_enter_exit(vcpu); 1060 1062 1061 - if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) 1062 - update_debugctlmsr(vt->host_debugctlmsr); 1063 + if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED) 1064 + update_debugctlmsr(vcpu->arch.host_debugctl); 1063 1065 1064 1066 tdx_load_host_xsave_state(vcpu); 1065 1067 tdx->guest_entered = true;

+91 -193

arch/x86/kvm/vmx/vmx.c

··· 168 168 RTIT_STATUS_BYTECNT)) 169 169 170 170 /* 171 - * List of MSRs that can be directly passed to the guest. 172 - * In addition to these x2apic, PT and LBR MSRs are handled specially. 173 - */ 174 - static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 175 - MSR_IA32_SPEC_CTRL, 176 - MSR_IA32_PRED_CMD, 177 - MSR_IA32_FLUSH_CMD, 178 - MSR_IA32_TSC, 179 - #ifdef CONFIG_X86_64 180 - MSR_FS_BASE, 181 - MSR_GS_BASE, 182 - MSR_KERNEL_GS_BASE, 183 - MSR_IA32_XFD, 184 - MSR_IA32_XFD_ERR, 185 - #endif 186 - MSR_IA32_SYSENTER_CS, 187 - MSR_IA32_SYSENTER_ESP, 188 - MSR_IA32_SYSENTER_EIP, 189 - MSR_CORE_C1_RES, 190 - MSR_CORE_C3_RESIDENCY, 191 - MSR_CORE_C6_RESIDENCY, 192 - MSR_CORE_C7_RESIDENCY, 193 - }; 194 - 195 - /* 196 171 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 197 172 * ple_gap: upper bound on the amount of time between two successive 198 173 * executions of PAUSE in a loop. Also indicate if ple enabled. ··· 647 672 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 648 673 { 649 674 return flexpriority_enabled && lapic_in_kernel(vcpu); 650 - } 651 - 652 - static int vmx_get_passthrough_msr_slot(u32 msr) 653 - { 654 - int i; 655 - 656 - switch (msr) { 657 - case 0x800 ... 0x8ff: 658 - /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ 659 - return -ENOENT; 660 - case MSR_IA32_RTIT_STATUS: 661 - case MSR_IA32_RTIT_OUTPUT_BASE: 662 - case MSR_IA32_RTIT_OUTPUT_MASK: 663 - case MSR_IA32_RTIT_CR3_MATCH: 664 - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 665 - /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 666 - case MSR_LBR_SELECT: 667 - case MSR_LBR_TOS: 668 - case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 669 - case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 670 - case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 671 - case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 672 - case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 673 - /* LBR MSRs. 
These are handled in vmx_update_intercept_for_lbr_msrs() */ 674 - return -ENOENT; 675 - } 676 - 677 - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 678 - if (vmx_possible_passthrough_msrs[i] == msr) 679 - return i; 680 - } 681 - 682 - WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 683 - return -ENOENT; 684 675 } 685 676 686 677 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) ··· 2094 2153 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2095 2154 break; 2096 2155 case MSR_IA32_DEBUGCTLMSR: 2097 - msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2156 + msr_info->data = vmx_guest_debugctl_read(); 2098 2157 break; 2099 2158 default: 2100 2159 find_uret_msr: ··· 2119 2178 return (unsigned long)data; 2120 2179 } 2121 2180 2122 - static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2181 + u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2123 2182 { 2124 2183 u64 debugctl = 0; 2125 2184 ··· 2131 2190 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2132 2191 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2133 2192 2193 + if (boot_cpu_has(X86_FEATURE_RTM) && 2194 + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) 2195 + debugctl |= DEBUGCTLMSR_RTM_DEBUG; 2196 + 2134 2197 return debugctl; 2198 + } 2199 + 2200 + bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) 2201 + { 2202 + u64 invalid; 2203 + 2204 + invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); 2205 + if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { 2206 + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 2207 + invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); 2208 + } 2209 + return !invalid; 2135 2210 } 2136 2211 2137 2212 /* ··· 2218 2261 } 2219 2262 vmcs_writel(GUEST_SYSENTER_ESP, data); 2220 2263 break; 2221 - case MSR_IA32_DEBUGCTLMSR: { 2222 - u64 invalid; 2223 - 2224 - invalid = data & 
~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2225 - if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2226 - kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2227 - data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2228 - invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2229 - } 2230 - 2231 - if (invalid) 2264 + case MSR_IA32_DEBUGCTLMSR: 2265 + if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) 2232 2266 return 1; 2267 + 2268 + data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2233 2269 2234 2270 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2235 2271 VM_EXIT_SAVE_DEBUG_CONTROLS) 2236 2272 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2237 2273 2238 - vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2274 + vmx_guest_debugctl_write(vcpu, data); 2275 + 2239 2276 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2240 2277 (data & DEBUGCTLMSR_LBR)) 2241 2278 intel_pmu_create_guest_lbr_event(vcpu); 2242 2279 return 0; 2243 - } 2244 2280 case MSR_IA32_BNDCFGS: 2245 2281 if (!kvm_mpx_supported() || 2246 2282 (!msr_info->host_initiated && ··· 3967 4017 vmx->nested.force_msr_bitmap_recalc = true; 3968 4018 } 3969 4019 3970 - void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4020 + void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 3971 4021 { 3972 4022 struct vcpu_vmx *vmx = to_vmx(vcpu); 3973 4023 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3974 - int idx; 3975 4024 3976 4025 if (!cpu_has_vmx_msr_bitmap()) 3977 4026 return; 3978 4027 3979 4028 vmx_msr_bitmap_l01_changed(vmx); 3980 4029 3981 - /* 3982 - * Mark the desired intercept state in shadow bitmap, this is needed 3983 - * for resync when the MSR filters change. 
3984 - */ 3985 - idx = vmx_get_passthrough_msr_slot(msr); 3986 - if (idx >= 0) { 3987 - if (type & MSR_TYPE_R) 3988 - clear_bit(idx, vmx->shadow_msr_intercept.read); 3989 - if (type & MSR_TYPE_W) 3990 - clear_bit(idx, vmx->shadow_msr_intercept.write); 4030 + if (type & MSR_TYPE_R) { 4031 + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 4032 + vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4033 + else 4034 + vmx_set_msr_bitmap_read(msr_bitmap, msr); 3991 4035 } 3992 4036 3993 - if ((type & MSR_TYPE_R) && 3994 - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 3995 - vmx_set_msr_bitmap_read(msr_bitmap, msr); 3996 - type &= ~MSR_TYPE_R; 4037 + if (type & MSR_TYPE_W) { 4038 + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 4039 + vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4040 + else 4041 + vmx_set_msr_bitmap_write(msr_bitmap, msr); 3997 4042 } 3998 - 3999 - if ((type & MSR_TYPE_W) && 4000 - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4001 - vmx_set_msr_bitmap_write(msr_bitmap, msr); 4002 - type &= ~MSR_TYPE_W; 4003 - } 4004 - 4005 - if (type & MSR_TYPE_R) 4006 - vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4007 - 4008 - if (type & MSR_TYPE_W) 4009 - vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4010 - } 4011 - 4012 - void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4013 - { 4014 - struct vcpu_vmx *vmx = to_vmx(vcpu); 4015 - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4016 - int idx; 4017 - 4018 - if (!cpu_has_vmx_msr_bitmap()) 4019 - return; 4020 - 4021 - vmx_msr_bitmap_l01_changed(vmx); 4022 - 4023 - /* 4024 - * Mark the desired intercept state in shadow bitmap, this is needed 4025 - * for resync when the MSR filter changes. 
4026 - */ 4027 - idx = vmx_get_passthrough_msr_slot(msr); 4028 - if (idx >= 0) { 4029 - if (type & MSR_TYPE_R) 4030 - set_bit(idx, vmx->shadow_msr_intercept.read); 4031 - if (type & MSR_TYPE_W) 4032 - set_bit(idx, vmx->shadow_msr_intercept.write); 4033 - } 4034 - 4035 - if (type & MSR_TYPE_R) 4036 - vmx_set_msr_bitmap_read(msr_bitmap, msr); 4037 - 4038 - if (type & MSR_TYPE_W) 4039 - vmx_set_msr_bitmap_write(msr_bitmap, msr); 4040 4043 } 4041 4044 4042 4045 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) ··· 4068 4165 } 4069 4166 } 4070 4167 4071 - void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4168 + void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4072 4169 { 4073 - struct vcpu_vmx *vmx = to_vmx(vcpu); 4074 - u32 i; 4075 - 4076 4170 if (!cpu_has_vmx_msr_bitmap()) 4077 4171 return; 4078 4172 4079 - /* 4080 - * Redo intercept permissions for MSRs that KVM is passing through to 4081 - * the guest. Disabling interception will check the new MSR filter and 4082 - * ensure that KVM enables interception if usersepace wants to filter 4083 - * the MSR. MSRs that KVM is already intercepting don't need to be 4084 - * refreshed since KVM is going to intercept them regardless of what 4085 - * userspace wants. 
4086 - */ 4087 - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4088 - u32 msr = vmx_possible_passthrough_msrs[i]; 4089 - 4090 - if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4091 - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4092 - 4093 - if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4094 - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4173 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 4174 + #ifdef CONFIG_X86_64 4175 + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 4176 + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 4177 + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 4178 + #endif 4179 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 4180 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 4181 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 4182 + if (kvm_cstate_in_guest(vcpu->kvm)) { 4183 + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 4184 + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 4185 + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 4186 + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 4187 + } 4188 + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 4189 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 4190 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 4095 4191 } 4096 4192 4097 4193 /* PT MSRs can be passed through iff PT is exposed to the guest. 
*/ 4098 4194 if (vmx_pt_mode_is_host_guest()) 4099 4195 pt_update_intercept_for_msr(vcpu); 4196 + 4197 + if (vcpu->arch.xfd_no_write_intercept) 4198 + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); 4199 + 4200 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 4201 + !to_vmx(vcpu)->spec_ctrl); 4202 + 4203 + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 4204 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 4205 + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 4206 + 4207 + if (cpu_feature_enabled(X86_FEATURE_IBPB)) 4208 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 4209 + !guest_has_pred_cmd_msr(vcpu)); 4210 + 4211 + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 4212 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 4213 + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4214 + 4215 + /* 4216 + * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4217 + * filtered by userspace. 4218 + */ 4100 4219 } 4101 4220 4102 4221 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, ··· 4719 4794 vmcs_write32(GUEST_SYSENTER_CS, 0); 4720 4795 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4721 4796 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4722 - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4797 + 4798 + vmx_guest_debugctl_write(&vmx->vcpu, 0); 4723 4799 4724 4800 if (cpu_has_vmx_tpr_shadow()) { 4725 4801 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); ··· 5534 5608 * a stale dr6 from the guest. 
5535 5609 */ 5536 5610 set_debugreg(DR6_RESERVED, 6); 5537 - } 5538 - 5539 - void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5540 - { 5541 - lockdep_assert_irqs_disabled(); 5542 - set_debugreg(vcpu->arch.dr6, 6); 5543 5611 } 5544 5612 5545 5613 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) ··· 7247 7327 guest_state_exit_irqoff(); 7248 7328 } 7249 7329 7250 - fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7330 + fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 7251 7331 { 7332 + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; 7252 7333 struct vcpu_vmx *vmx = to_vmx(vcpu); 7253 7334 unsigned long cr3, cr4; 7254 7335 ··· 7293 7372 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7294 7373 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7295 7374 vcpu->arch.regs_dirty = 0; 7375 + 7376 + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) 7377 + set_debugreg(vcpu->arch.dr6, 6); 7378 + 7379 + if (run_flags & KVM_RUN_LOAD_DEBUGCTL) 7380 + vmx_reload_guest_debugctl(vcpu); 7296 7381 7297 7382 /* 7298 7383 * Refresh vmcs.HOST_CR3 if necessary. 
This must be done immediately ··· 7474 7547 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7475 7548 } 7476 7549 7477 - /* The MSR bitmap starts with all ones */ 7478 - bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7479 - bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7480 - 7481 - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7482 - #ifdef CONFIG_X86_64 7483 - vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7484 - vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7485 - vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7486 - #endif 7487 - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7488 - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7489 - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7490 - if (kvm_cstate_in_guest(vcpu->kvm)) { 7491 - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7492 - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7493 - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7494 - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7495 - } 7496 - 7497 7550 vmx->loaded_vmcs = &vmx->vmcs01; 7498 7551 7499 7552 if (cpu_need_virtualize_apic_accesses(vcpu)) { ··· 7523 7616 int vmx_vm_init(struct kvm *kvm) 7524 7617 { 7525 7618 if (!ple_gap) 7526 - kvm->arch.pause_in_guest = true; 7619 + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 7527 7620 7528 7621 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7529 7622 switch (l1tf_mitigation) { ··· 7760 7853 } 7761 7854 } 7762 7855 7763 - if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7764 - vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7765 - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 7766 - 7767 - if (boot_cpu_has(X86_FEATURE_IBPB)) 7768 - vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 
7769 - !guest_has_pred_cmd_msr(vcpu)); 7770 - 7771 - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7772 - vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7773 - !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7774 - 7775 7856 set_cr4_guest_host_mask(vmx); 7776 7857 7777 7858 vmx_write_encls_bitmap(vcpu, NULL); ··· 7774 7879 else 7775 7880 vmx->msr_ia32_feature_control_valid_bits &= 7776 7881 ~FEAT_CTL_SGX_LC_ENABLED; 7882 + 7883 + /* Recalc MSR interception to account for feature changes. */ 7884 + vmx_recalc_msr_intercepts(vcpu); 7777 7885 7778 7886 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7779 7887 vmx_update_exception_bitmap(vcpu);

+39 -18

arch/x86/kvm/vmx/vmx.h

··· 19 19 #include "../mmu.h" 20 20 #include "common.h" 21 21 22 - #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) 23 - 24 22 #ifdef CONFIG_X86_64 25 23 #define MAX_NR_USER_RETURN_MSRS 7 26 24 #else ··· 294 296 struct pt_desc pt_desc; 295 297 struct lbr_desc lbr_desc; 296 298 297 - /* Save desired MSR intercept (read: pass-through) state */ 298 - #define MAX_POSSIBLE_PASSTHROUGH_MSRS 16 299 - struct { 300 - DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 301 - DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 302 - } shadow_msr_intercept; 303 - 304 299 /* ve_info must be page aligned. */ 305 300 struct vmx_ve_information *ve_info; 306 301 }; ··· 386 395 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); 387 396 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); 388 397 389 - void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); 390 - void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); 398 + void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); 399 + 400 + static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, 401 + u32 msr, int type) 402 + { 403 + vmx_set_intercept_for_msr(vcpu, msr, type, false); 404 + } 405 + 406 + static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, 407 + u32 msr, int type) 408 + { 409 + vmx_set_intercept_for_msr(vcpu, msr, type, true); 410 + } 391 411 392 412 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); 393 413 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); 394 414 395 415 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); 396 416 397 - static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, 398 - int type, bool value) 417 + void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); 418 + 419 + u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated); 420 + bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool 
host_initiated); 421 + 422 + #define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM) 423 + 424 + static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val) 399 425 { 400 - if (value) 401 - vmx_enable_intercept_for_msr(vcpu, msr, type); 402 - else 403 - vmx_disable_intercept_for_msr(vcpu, msr, type); 426 + WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS); 427 + 428 + val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS; 429 + vmcs_write64(GUEST_IA32_DEBUGCTL, val); 404 430 } 405 431 406 - void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); 432 + static inline u64 vmx_guest_debugctl_read(void) 433 + { 434 + return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS; 435 + } 436 + 437 + static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu) 438 + { 439 + u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL); 440 + 441 + if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS)) 442 + return; 443 + 444 + vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS); 445 + } 407 446 408 447 /* 409 448 * Note, early Intel manuals have the write-low and read-high bitmap offsets

+3 -3

arch/x86/kvm/vmx/x86_ops.h

··· 21 21 int vmx_vcpu_precreate(struct kvm *kvm); 22 22 int vmx_vcpu_create(struct kvm_vcpu *vcpu); 23 23 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); 24 - fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); 24 + fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); 25 25 void vmx_vcpu_free(struct kvm_vcpu *vcpu); 26 26 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); 27 27 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); ··· 52 52 int trig_mode, int vector); 53 53 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); 54 54 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); 55 - void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); 55 + void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); 56 56 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 57 57 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); 58 58 int vmx_get_feature_msr(u32 msr, u64 *data); ··· 133 133 void tdx_vcpu_free(struct kvm_vcpu *vcpu); 134 134 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); 135 135 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); 136 - fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); 136 + fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); 137 137 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 138 138 void tdx_vcpu_put(struct kvm_vcpu *vcpu); 139 139 bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);

+69 -37

arch/x86/kvm/x86.c

··· 4582 4582 { 4583 4583 u64 r = KVM_X86_DISABLE_EXITS_PAUSE; 4584 4584 4585 + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) 4586 + r |= KVM_X86_DISABLE_EXITS_APERFMPERF; 4587 + 4585 4588 if (!mitigate_smt_rsb) { 4586 4589 r |= KVM_X86_DISABLE_EXITS_HLT | 4587 4590 KVM_X86_DISABLE_EXITS_CSTATE; ··· 5496 5493 5497 5494 if ((events->exception.injected || events->exception.pending) && 5498 5495 (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) 5499 - return -EINVAL; 5500 - 5501 - /* INITs are latched while in SMM */ 5502 - if (events->flags & KVM_VCPUEVENT_VALID_SMM && 5503 - (events->smi.smm || events->smi.pending) && 5504 - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 5505 5496 return -EINVAL; 5506 5497 5507 5498 process_nmi(vcpu); ··· 6487 6490 6488 6491 if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && 6489 6492 cpu_smt_possible() && 6490 - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) 6493 + (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE | 6494 + KVM_X86_DISABLE_EXITS_APERFMPERF))) 6491 6495 pr_warn_once(SMT_RSB_MSG); 6492 6496 6493 - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) 6494 - kvm->arch.pause_in_guest = true; 6495 - if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) 6496 - kvm->arch.mwait_in_guest = true; 6497 - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) 6498 - kvm->arch.hlt_in_guest = true; 6499 - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) 6500 - kvm->arch.cstate_in_guest = true; 6497 + kvm_disable_exits(kvm, cap->args[0]); 6501 6498 r = 0; 6502 6499 disable_exits_unlock: 6503 6500 mutex_unlock(&kvm->lock); ··· 7202 7211 if (user_tsc_khz == 0) 7203 7212 user_tsc_khz = tsc_khz; 7204 7213 7205 - WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); 7206 - r = 0; 7207 - 7214 + mutex_lock(&kvm->lock); 7215 + if (!kvm->created_vcpus) { 7216 + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); 7217 + r = 0; 7218 + } 7219 + mutex_unlock(&kvm->lock); 7208 7220 goto out; 7209 7221 } 7210 7222 case KVM_GET_TSC_KHZ: { ··· 
10651 10657 dm_request_for_irq_injection(vcpu) && 10652 10658 kvm_cpu_accept_dm_intr(vcpu); 10653 10659 fastpath_t exit_fastpath; 10660 + u64 run_flags, debug_ctl; 10654 10661 10655 10662 bool req_immediate_exit = false; 10656 10663 ··· 10799 10804 kvm_vcpu_update_apicv(vcpu); 10800 10805 if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) 10801 10806 kvm_check_async_pf_completion(vcpu); 10807 + 10808 + /* 10809 + * Recalc MSR intercepts as userspace may want to intercept 10810 + * accesses to MSRs that KVM would otherwise pass through to 10811 + * the guest. 10812 + */ 10802 10813 if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) 10803 - kvm_x86_call(msr_filter_changed)(vcpu); 10814 + kvm_x86_call(recalc_msr_intercepts)(vcpu); 10804 10815 10805 10816 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) 10806 10817 kvm_x86_call(update_cpu_dirty_logging)(vcpu); ··· 10902 10901 goto cancel_injection; 10903 10902 } 10904 10903 10905 - if (req_immediate_exit) 10904 + run_flags = 0; 10905 + if (req_immediate_exit) { 10906 + run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; 10906 10907 kvm_make_request(KVM_REQ_EVENT, vcpu); 10908 + } 10907 10909 10908 10910 fpregs_assert_state_consistent(); 10909 10911 if (test_thread_flag(TIF_NEED_FPU_LOAD)) ··· 10924 10920 set_debugreg(vcpu->arch.eff_db[3], 3); 10925 10921 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ 10926 10922 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 10927 - kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); 10923 + run_flags |= KVM_RUN_LOAD_GUEST_DR6; 10928 10924 } else if (unlikely(hw_breakpoint_active())) { 10929 10925 set_debugreg(DR7_FIXED_1, 7); 10930 10926 } 10931 10927 10932 - vcpu->arch.host_debugctl = get_debugctlmsr(); 10928 + /* 10929 + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL 10930 + * can be modified in IRQ context, e.g. via SMP function calls. Inform 10931 + * vendor code if any host-owned bits were changed, e.g. 
so that the 10932 + * value loaded into hardware while running the guest can be updated. 10933 + */ 10934 + debug_ctl = get_debugctlmsr(); 10935 + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && 10936 + !vcpu->arch.guest_state_protected) 10937 + run_flags |= KVM_RUN_LOAD_DEBUGCTL; 10938 + vcpu->arch.host_debugctl = debug_ctl; 10933 10939 10934 10940 guest_timing_enter_irqoff(); 10935 10941 ··· 10953 10939 WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && 10954 10940 (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); 10955 10941 10956 - exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, 10957 - req_immediate_exit); 10942 + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags); 10958 10943 if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) 10959 10944 break; 10960 10945 ··· 10964 10951 exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; 10965 10952 break; 10966 10953 } 10954 + 10955 + run_flags = 0; 10967 10956 10968 10957 /* Note, VM-Exits that go down the "slow" path are accounted below. */ 10969 10958 ++vcpu->stat.exits; ··· 11440 11425 trace_kvm_fpu(0); 11441 11426 } 11442 11427 11428 + static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) 11429 + { 11430 + /* 11431 + * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and 11432 + * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted 11433 + * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be 11434 + * converted to INIT_RECEIVED. 11435 + */ 11436 + if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) 11437 + return -EINVAL; 11438 + 11439 + /* 11440 + * Disallow running the vCPU if userspace forced it into an impossible 11441 + * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked. 
11442 + */ 11443 + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED && 11444 + !kvm_apic_init_sipi_allowed(vcpu)) 11445 + return -EINVAL; 11446 + 11447 + return kvm_x86_call(vcpu_pre_run)(vcpu); 11448 + } 11449 + 11443 11450 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) 11444 11451 { 11445 11452 struct kvm_queued_exception *ex = &vcpu->arch.exception; ··· 11564 11527 goto out; 11565 11528 } 11566 11529 11567 - r = kvm_x86_call(vcpu_pre_run)(vcpu); 11530 + r = kvm_x86_vcpu_pre_run(vcpu); 11568 11531 if (r <= 0) 11569 11532 goto out; 11570 11533 ··· 11808 11771 } 11809 11772 11810 11773 /* 11811 - * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow 11812 - * forcing the guest into INIT/SIPI if those events are supposed to be 11813 - * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state 11814 - * if an SMI is pending as well. 11774 + * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead 11775 + * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI. 11776 + * Translate SIPI_RECEIVED as appropriate for backwards compatibility. 11815 11777 */ 11816 - if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && 11817 - (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || 11818 - mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) 11819 - goto out; 11820 - 11821 11778 if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 11822 - kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 11779 + mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED; 11823 11780 set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); 11824 - } else 11825 - kvm_set_mp_state(vcpu, mp_state->mp_state); 11781 + } 11782 + 11783 + kvm_set_mp_state(vcpu, mp_state->mp_state); 11826 11784 kvm_make_request(KVM_REQ_EVENT, vcpu); 11827 11785 11828 11786 ret = 0;

+14 -4

arch/x86/kvm/x86.h

··· 499 499 __rem; \ 500 500 }) 501 501 502 + static inline void kvm_disable_exits(struct kvm *kvm, u64 mask) 503 + { 504 + kvm->arch.disabled_exits |= mask; 505 + } 506 + 502 507 static inline bool kvm_mwait_in_guest(struct kvm *kvm) 503 508 { 504 - return kvm->arch.mwait_in_guest; 509 + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT; 505 510 } 506 511 507 512 static inline bool kvm_hlt_in_guest(struct kvm *kvm) 508 513 { 509 - return kvm->arch.hlt_in_guest; 514 + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT; 510 515 } 511 516 512 517 static inline bool kvm_pause_in_guest(struct kvm *kvm) 513 518 { 514 - return kvm->arch.pause_in_guest; 519 + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE; 515 520 } 516 521 517 522 static inline bool kvm_cstate_in_guest(struct kvm *kvm) 518 523 { 519 - return kvm->arch.cstate_in_guest; 524 + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE; 525 + } 526 + 527 + static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm) 528 + { 529 + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF; 520 530 } 521 531 522 532 static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)

+1

include/uapi/linux/kvm.h

··· 644 644 #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) 645 645 #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) 646 646 #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) 647 + #define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) 647 648 648 649 /* for KVM_ENABLE_CAP */ 649 650 struct kvm_enable_cap {

+1

tools/include/uapi/linux/kvm.h

··· 618 618 #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) 619 619 #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) 620 620 #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) 621 + #define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) 621 622 622 623 /* for KVM_ENABLE_CAP */ 623 624 struct kvm_enable_cap {

+1

tools/testing/selftests/kvm/Makefile.kvm

··· 135 135 TEST_GEN_PROGS_x86 += x86/max_vcpuid_cap_test 136 136 TEST_GEN_PROGS_x86 += x86/triple_fault_event_test 137 137 TEST_GEN_PROGS_x86 += x86/recalc_apic_map_test 138 + TEST_GEN_PROGS_x86 += x86/aperfmperf_test 138 139 TEST_GEN_PROGS_x86 += access_tracking_perf_test 139 140 TEST_GEN_PROGS_x86 += coalesced_io_test 140 141 TEST_GEN_PROGS_x86 += dirty_log_perf_test

+1 -6

tools/testing/selftests/kvm/arch_timer.c

··· 98 98 static int test_migrate_vcpu(unsigned int vcpu_idx) 99 99 { 100 100 int ret; 101 - cpu_set_t cpuset; 102 101 uint32_t new_pcpu = test_get_pcpu(); 103 - 104 - CPU_ZERO(&cpuset); 105 - CPU_SET(new_pcpu, &cpuset); 106 102 107 103 pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu); 108 104 109 - ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx], 110 - sizeof(cpuset), &cpuset); 105 + ret = __pin_task_to_cpu(pt_vcpu_run[vcpu_idx], new_pcpu); 111 106 112 107 /* Allow the error where the vCPU thread is already finished */ 113 108 TEST_ASSERT(ret == 0 || ret == ESRCH,

+2 -21

tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c

··· 862 862 return next; 863 863 } 864 864 865 - static void migrate_self(uint32_t new_pcpu) 866 - { 867 - int ret; 868 - cpu_set_t cpuset; 869 - pthread_t thread; 870 - 871 - thread = pthread_self(); 872 - 873 - CPU_ZERO(&cpuset); 874 - CPU_SET(new_pcpu, &cpuset); 875 - 876 - pr_debug("Migrating from %u to %u\n", sched_getcpu(), new_pcpu); 877 - 878 - ret = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); 879 - 880 - TEST_ASSERT(ret == 0, "Failed to migrate to pCPU: %u; ret: %d\n", 881 - new_pcpu, ret); 882 - } 883 - 884 865 static void kvm_set_cntxct(struct kvm_vcpu *vcpu, uint64_t cnt, 885 866 enum arch_timer timer) 886 867 { ··· 888 907 sched_yield(); 889 908 break; 890 909 case USERSPACE_MIGRATE_SELF: 891 - migrate_self(next_pcpu()); 910 + pin_self_to_cpu(next_pcpu()); 892 911 break; 893 912 default: 894 913 break; ··· 900 919 struct ucall uc; 901 920 902 921 /* Start on CPU 0 */ 903 - migrate_self(0); 922 + pin_self_to_cpu(0); 904 923 905 924 while (true) { 906 925 vcpu_run(vcpu);

+30 -1

tools/testing/selftests/kvm/include/kvm_util.h

··· 21 21 #include <sys/eventfd.h> 22 22 #include <sys/ioctl.h> 23 23 24 + #include <pthread.h> 25 + 24 26 #include "kvm_util_arch.h" 25 27 #include "kvm_util_types.h" 26 28 #include "sparsebit.h" ··· 1055 1053 1056 1054 void kvm_set_files_rlimit(uint32_t nr_vcpus); 1057 1055 1058 - void kvm_pin_this_task_to_pcpu(uint32_t pcpu); 1056 + int __pin_task_to_cpu(pthread_t task, int cpu); 1057 + 1058 + static inline void pin_task_to_cpu(pthread_t task, int cpu) 1059 + { 1060 + int r; 1061 + 1062 + r = __pin_task_to_cpu(task, cpu); 1063 + TEST_ASSERT(!r, "Failed to set thread affinity to pCPU '%u'", cpu); 1064 + } 1065 + 1066 + static inline int pin_task_to_any_cpu(pthread_t task) 1067 + { 1068 + int cpu = sched_getcpu(); 1069 + 1070 + pin_task_to_cpu(task, cpu); 1071 + return cpu; 1072 + } 1073 + 1074 + static inline void pin_self_to_cpu(int cpu) 1075 + { 1076 + pin_task_to_cpu(pthread_self(), cpu); 1077 + } 1078 + 1079 + static inline int pin_self_to_any_cpu(void) 1080 + { 1081 + return pin_task_to_any_cpu(pthread_self()); 1082 + } 1083 + 1059 1084 void kvm_print_vcpu_pinning_help(void); 1060 1085 void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], 1061 1086 int nr_vcpus);

+7 -8

tools/testing/selftests/kvm/lib/kvm_util.c

··· 605 605 return vm_vcpu_recreate(vm, 0); 606 606 } 607 607 608 - void kvm_pin_this_task_to_pcpu(uint32_t pcpu) 608 + int __pin_task_to_cpu(pthread_t task, int cpu) 609 609 { 610 - cpu_set_t mask; 611 - int r; 610 + cpu_set_t cpuset; 612 611 613 - CPU_ZERO(&mask); 614 - CPU_SET(pcpu, &mask); 615 - r = sched_setaffinity(0, sizeof(mask), &mask); 616 - TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.", pcpu); 612 + CPU_ZERO(&cpuset); 613 + CPU_SET(cpu, &cpuset); 614 + 615 + return pthread_setaffinity_np(task, sizeof(cpuset), &cpuset); 617 616 } 618 617 619 618 static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) ··· 666 667 667 668 /* 2. Check if the main worker needs to be pinned. */ 668 669 if (cpu) { 669 - kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask)); 670 + pin_self_to_cpu(parse_pcpu(cpu, &allowed_mask)); 670 671 cpu = strtok(NULL, delim); 671 672 } 672 673

+1 -1

tools/testing/selftests/kvm/lib/memstress.c

··· 265 265 int vcpu_idx = vcpu->vcpu_idx; 266 266 267 267 if (memstress_args.pin_vcpus) 268 - kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]); 268 + pin_self_to_cpu(memstress_args.vcpu_to_pcpu[vcpu_idx]); 269 269 270 270 WRITE_ONCE(vcpu->running, true); 271 271

+213

tools/testing/selftests/kvm/x86/aperfmperf_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Test for KVM_X86_DISABLE_EXITS_APERFMPERF 4 + * 5 + * Copyright (C) 2025, Google LLC. 6 + * 7 + * Test the ability to disable VM-exits for rdmsr of IA32_APERF and 8 + * IA32_MPERF. When these VM-exits are disabled, reads of these MSRs 9 + * return the host's values. 10 + * 11 + * Note: Requires read access to /dev/cpu/<lpu>/msr to read host MSRs. 12 + */ 13 + 14 + #include <fcntl.h> 15 + #include <limits.h> 16 + #include <stdbool.h> 17 + #include <stdio.h> 18 + #include <stdint.h> 19 + #include <unistd.h> 20 + #include <asm/msr-index.h> 21 + 22 + #include "kvm_util.h" 23 + #include "processor.h" 24 + #include "svm_util.h" 25 + #include "test_util.h" 26 + #include "vmx.h" 27 + 28 + #define NUM_ITERATIONS 10000 29 + 30 + static int open_dev_msr(int cpu) 31 + { 32 + char path[PATH_MAX]; 33 + 34 + snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu); 35 + return open_path_or_exit(path, O_RDONLY); 36 + } 37 + 38 + static uint64_t read_dev_msr(int msr_fd, uint32_t msr) 39 + { 40 + uint64_t data; 41 + ssize_t rc; 42 + 43 + rc = pread(msr_fd, &data, sizeof(data), msr); 44 + TEST_ASSERT(rc == sizeof(data), "Read of MSR 0x%x failed", msr); 45 + 46 + return data; 47 + } 48 + 49 + static void guest_read_aperf_mperf(void) 50 + { 51 + int i; 52 + 53 + for (i = 0; i < NUM_ITERATIONS; i++) 54 + GUEST_SYNC2(rdmsr(MSR_IA32_APERF), rdmsr(MSR_IA32_MPERF)); 55 + } 56 + 57 + #define L2_GUEST_STACK_SIZE 64 58 + 59 + static void l2_guest_code(void) 60 + { 61 + guest_read_aperf_mperf(); 62 + GUEST_DONE(); 63 + } 64 + 65 + static void l1_svm_code(struct svm_test_data *svm) 66 + { 67 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 68 + struct vmcb *vmcb = svm->vmcb; 69 + 70 + generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 71 + run_guest(vmcb, svm->vmcb_gpa); 72 + } 73 + 74 + static void l1_vmx_code(struct vmx_pages *vmx) 75 + { 76 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 77 + 78 + 
GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); 79 + GUEST_ASSERT_EQ(load_vmcs(vmx), true); 80 + 81 + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 82 + 83 + /* 84 + * Enable MSR bitmaps (the bitmap itself is allocated, zeroed, and set 85 + * in the VMCS by prepare_vmcs()), as MSR exiting is mandatory on Intel. 86 + */ 87 + vmwrite(CPU_BASED_VM_EXEC_CONTROL, 88 + vmreadz(CPU_BASED_VM_EXEC_CONTROL) | CPU_BASED_USE_MSR_BITMAPS); 89 + 90 + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); 91 + GUEST_ASSERT(!vmlaunch()); 92 + } 93 + 94 + static void guest_code(void *nested_test_data) 95 + { 96 + guest_read_aperf_mperf(); 97 + 98 + if (this_cpu_has(X86_FEATURE_SVM)) 99 + l1_svm_code(nested_test_data); 100 + else if (this_cpu_has(X86_FEATURE_VMX)) 101 + l1_vmx_code(nested_test_data); 102 + else 103 + GUEST_DONE(); 104 + 105 + TEST_FAIL("L2 should have signaled 'done'"); 106 + } 107 + 108 + static void guest_no_aperfmperf(void) 109 + { 110 + uint64_t msr_val; 111 + uint8_t vector; 112 + 113 + vector = rdmsr_safe(MSR_IA32_APERF, &msr_val); 114 + GUEST_ASSERT(vector == GP_VECTOR); 115 + 116 + vector = rdmsr_safe(MSR_IA32_APERF, &msr_val); 117 + GUEST_ASSERT(vector == GP_VECTOR); 118 + 119 + GUEST_DONE(); 120 + } 121 + 122 + int main(int argc, char *argv[]) 123 + { 124 + const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX); 125 + uint64_t host_aperf_before, host_mperf_before; 126 + vm_vaddr_t nested_test_data_gva; 127 + struct kvm_vcpu *vcpu; 128 + struct kvm_vm *vm; 129 + int msr_fd, cpu, i; 130 + 131 + /* Sanity check that APERF/MPERF are unsupported by default.
*/ 132 + vm = vm_create_with_one_vcpu(&vcpu, guest_no_aperfmperf); 133 + vcpu_run(vcpu); 134 + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); 135 + kvm_vm_free(vm); 136 + 137 + cpu = pin_self_to_any_cpu(); 138 + 139 + msr_fd = open_dev_msr(cpu); 140 + 141 + /* 142 + * This test requires a non-standard VM initialization, because 143 + * KVM_ENABLE_CAP cannot be used on a VM file descriptor after 144 + * a VCPU has been created. 145 + */ 146 + vm = vm_create(1); 147 + 148 + TEST_REQUIRE(vm_check_cap(vm, KVM_CAP_X86_DISABLE_EXITS) & 149 + KVM_X86_DISABLE_EXITS_APERFMPERF); 150 + 151 + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, 152 + KVM_X86_DISABLE_EXITS_APERFMPERF); 153 + 154 + vcpu = vm_vcpu_add(vm, 0, guest_code); 155 + 156 + if (!has_nested) 157 + nested_test_data_gva = NONCANONICAL; 158 + else if (kvm_cpu_has(X86_FEATURE_SVM)) 159 + vcpu_alloc_svm(vm, &nested_test_data_gva); 160 + else 161 + vcpu_alloc_vmx(vm, &nested_test_data_gva); 162 + 163 + vcpu_args_set(vcpu, 1, nested_test_data_gva); 164 + 165 + host_aperf_before = read_dev_msr(msr_fd, MSR_IA32_APERF); 166 + host_mperf_before = read_dev_msr(msr_fd, MSR_IA32_MPERF); 167 + 168 + for (i = 0; i <= NUM_ITERATIONS * (1 + has_nested); i++) { 169 + uint64_t host_aperf_after, host_mperf_after; 170 + uint64_t guest_aperf, guest_mperf; 171 + struct ucall uc; 172 + 173 + vcpu_run(vcpu); 174 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); 175 + 176 + switch (get_ucall(vcpu, &uc)) { 177 + case UCALL_DONE: 178 + goto done; 179 + case UCALL_ABORT: 180 + REPORT_GUEST_ASSERT(uc); 181 + case UCALL_SYNC: 182 + guest_aperf = uc.args[0]; 183 + guest_mperf = uc.args[1]; 184 + 185 + host_aperf_after = read_dev_msr(msr_fd, MSR_IA32_APERF); 186 + host_mperf_after = read_dev_msr(msr_fd, MSR_IA32_MPERF); 187 + 188 + TEST_ASSERT(host_aperf_before < guest_aperf, 189 + "APERF: host_before (0x%" PRIx64 ") >= guest (0x%" PRIx64 ")", 190 + host_aperf_before, guest_aperf); 191 + TEST_ASSERT(guest_aperf < host_aperf_after, 192 + 
"APERF: guest (0x%" PRIx64 ") >= host_after (0x%" PRIx64 ")", 193 + guest_aperf, host_aperf_after); 194 + TEST_ASSERT(host_mperf_before < guest_mperf, 195 + "MPERF: host_before (0x%" PRIx64 ") >= guest (0x%" PRIx64 ")", 196 + host_mperf_before, guest_mperf); 197 + TEST_ASSERT(guest_mperf < host_mperf_after, 198 + "MPERF: guest (0x%" PRIx64 ") >= host_after (0x%" PRIx64 ")", 199 + guest_mperf, host_mperf_after); 200 + 201 + host_aperf_before = host_aperf_after; 202 + host_mperf_before = host_mperf_after; 203 + 204 + break; 205 + } 206 + } 207 + TEST_FAIL("Didn't receive UCALL_DONE\n"); 208 + done: 209 + kvm_vm_free(vm); 210 + close(msr_fd); 211 + 212 + return 0; 213 + }

+8

tools/testing/selftests/kvm/x86/userspace_msr_exit_test.c

··· 343 343 data = test_rdmsr(MSR_GS_BASE); 344 344 GUEST_ASSERT(data == MSR_GS_BASE); 345 345 346 + /* Access the MSRs again to ensure KVM has disabled interception. */ 347 + data = test_rdmsr(MSR_FS_BASE); 348 + GUEST_ASSERT(data != MSR_FS_BASE); 349 + data = test_rdmsr(MSR_GS_BASE); 350 + GUEST_ASSERT(data != MSR_GS_BASE); 351 + 346 352 GUEST_DONE(); 347 353 } 348 354 ··· 688 682 "Expected ucall state to be UCALL_SYNC."); 689 683 vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs); 690 684 run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE); 685 + 686 + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); 691 687 run_guest_then_process_ucall_done(vcpu); 692 688 } 693 689

Configure Feed

Configure Feed