Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-misc-6.20' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.20

- Disallow changing the virtual CPU model if L2 is active, for all the same
reasons KVM disallows changing the model after the first KVM_RUN.

- Fix a bug where KVM would incorrectly reject host accesses to PV MSRs that
were advertised as supported to userspace when running with
KVM_CAP_ENFORCE_PV_FEATURE_CPUID enabled.

- Fix a bug where KVM would attempt to read protected guest state (CR3) when
configuring an async #PF entry.

- Fail the build if EXPORT_SYMBOL_GPL or EXPORT_SYMBOL is used in KVM (for x86
only) to enforce usage of EXPORT_SYMBOL_FOR_KVM_INTERNAL. Explicitly allow
the few exports that are intended for external usage.

- Ignore -EBUSY when checking nested events after a vCPU exits blocking as
the WARN is user-triggerable, and because exiting to userspace on -EBUSY
does more harm than good in pretty much every situation.

- Throw in the towel and drop the WARN on INIT/SIPI being blocked when vCPU is
in Wait-For-SIPI, as playing whack-a-mole with syzkaller turned out to be an
unwinnable game.

- Add support for new Intel instructions that don't require anything beyond
enumerating feature flags to userspace.

- Grab SRCU when reading PDPTRs in KVM_GET_SREGS2.

- Add WARNs to guard against modifying KVM's CPU caps outside of the intended
setup flow, as nested VMX in particular is sensitive to unexpected changes
in KVM's golden configuration.

- Add a quirk to allow userspace to opt-in to actually suppress EOI broadcasts
when the suppression feature is enabled by the guest (currently limited to
split IRQCHIP, i.e. userspace I/O APIC). Sadly, simply fixing KVM to honor
Suppress EOI Broadcasts isn't an option as some userspaces have come to rely
on KVM's buggy behavior (KVM advertises Suppress EOI Broadcast irrespective
of whether or not userspace I/O APIC supports Directed EOIs).

- Minor cleanups.

+326 -69
+26 -2
Documentation/virt/kvm/api.rst
··· 7908 7908 7909 7909 Valid feature flags in args[0] are:: 7910 7910 7911 - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) 7912 - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) 7911 + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) 7912 + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) 7913 + #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST (1ULL << 2) 7914 + #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST (1ULL << 3) 7913 7915 7914 7916 Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of 7915 7917 KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, ··· 7923 7921 as a broadcast even in x2APIC mode in order to support physical x2APIC 7924 7922 without interrupt remapping. This is undesirable in logical mode, 7925 7923 where 0xff represents CPUs 0-7 in cluster 0. 7924 + 7925 + Setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST instructs KVM to enable 7926 + Suppress EOI Broadcasts. KVM will advertise support for Suppress EOI 7927 + Broadcast to the guest and suppress LAPIC EOI broadcasts when the guest 7928 + sets the Suppress EOI Broadcast bit in the SPIV register. This flag is 7929 + supported only when using a split IRQCHIP. 7930 + 7931 + Setting KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST disables support for 7932 + Suppress EOI Broadcasts entirely, i.e. instructs KVM to NOT advertise 7933 + support to the guest. 7934 + 7935 + Modern VMMs should either enable KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST 7936 + or KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST. If not, legacy quirky 7937 + behavior will be used by KVM: in split IRQCHIP mode, KVM will advertise 7938 + support for Suppress EOI Broadcasts but not actually suppress EOI 7939 + broadcasts; for in-kernel IRQCHIP mode, KVM will not advertise support for 7940 + Suppress EOI Broadcasts. 
7941 + 7942 + Setting both KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST and 7943 + KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST will fail with an EINVAL error, 7944 + as will setting KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST without a split 7945 + IRQCHIP. 7926 7946 7927 7947 7.8 KVM_CAP_S390_USER_INSTR0 7928 7948 ----------------------------
+1
arch/x86/include/asm/cpufeatures.h
··· 326 326 #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ 327 327 #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ 328 328 #define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ 329 + #define X86_FEATURE_MOVRS (12*32+31) /* MOVRS instructions */ 329 330 330 331 /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ 331 332 #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
+9
arch/x86/include/asm/kvm_host.h
··· 784 784 CPUID_24_0_EBX, 785 785 CPUID_8000_0021_ECX, 786 786 CPUID_7_1_ECX, 787 + CPUID_1E_1_EAX, 788 + CPUID_24_1_ECX, 787 789 NR_KVM_CPU_CAPS, 788 790 789 791 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 1236 1234 KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ 1237 1235 }; 1238 1236 1237 + enum kvm_suppress_eoi_broadcast_mode { 1238 + KVM_SUPPRESS_EOI_BROADCAST_QUIRKED, /* Legacy behavior */ 1239 + KVM_SUPPRESS_EOI_BROADCAST_ENABLED, /* Enable Suppress EOI broadcast */ 1240 + KVM_SUPPRESS_EOI_BROADCAST_DISABLED /* Disable Suppress EOI broadcast */ 1241 + }; 1242 + 1239 1243 struct kvm_x86_msr_filter { 1240 1244 u8 count; 1241 1245 bool default_allow:1; ··· 1491 1483 1492 1484 bool x2apic_format; 1493 1485 bool x2apic_broadcast_quirk_disabled; 1486 + enum kvm_suppress_eoi_broadcast_mode suppress_eoi_broadcast_mode; 1494 1487 1495 1488 bool has_mapped_host_mmio; 1496 1489 bool guest_can_read_msr_platform_info;
+4 -2
arch/x86/include/uapi/asm/kvm.h
··· 916 916 __u64 pad1[4]; 917 917 }; 918 918 919 - #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) 920 - #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) 919 + #define KVM_X2APIC_API_USE_32BIT_IDS _BITULL(0) 920 + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK _BITULL(1) 921 + #define KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST _BITULL(2) 922 + #define KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST _BITULL(3) 921 923 922 924 struct kvm_hyperv_eventfd { 923 925 __u32 conn_id;
+49
arch/x86/kvm/Makefile
··· 47 47 48 48 targets += kvm-asm-offsets.s 49 49 clean-files += kvm-asm-offsets.h 50 + 51 + 52 + # Fail the build if there is unexpected EXPORT_SYMBOL_GPL (or EXPORT_SYMBOL) 53 + # usage. All KVM-internal exports should use EXPORT_SYMBOL_FOR_KVM_INTERNAL. 54 + # Only a handful of exports intended for other modules (VFIO, KVMGT) should 55 + # use EXPORT_SYMBOL_GPL, and EXPORT_SYMBOL should never be used. 56 + ifdef CONFIG_KVM_X86 57 + # Search recursively for whole words and print line numbers. Filter out the 58 + # allowed set of exports, i.e. those that are intended for external usage. 59 + exports_grep_trailer := --include='*.[ch]' -nrw $(srctree)/virt/kvm $(srctree)/arch/x86/kvm | \ 60 + grep -v -e kvm_page_track_register_notifier \ 61 + -e kvm_page_track_unregister_notifier \ 62 + -e kvm_write_track_add_gfn \ 63 + -e kvm_write_track_remove_gfn \ 64 + -e kvm_get_kvm \ 65 + -e kvm_get_kvm_safe \ 66 + -e kvm_put_kvm 67 + 68 + # Force grep to emit a goofy group separator that can in turn be replaced with 69 + # the above newline macro (newlines in Make are a nightmare). Note, grep only 70 + # prints the group separator when N lines of context are requested via -C, 71 + # a.k.a. --NUM. Simply request zero lines. Print the separator only after 72 + # filtering out expected exports to avoid extra newlines in the error message. 
73 + define get_kvm_exports 74 + $(shell grep "$(1)" -C0 $(exports_grep_trailer) | grep "$(1)" -C0 --group-separator="!SEP!") 75 + endef 76 + 77 + define check_kvm_exports 78 + nr_kvm_exports := $(shell grep "$(1)" $(exports_grep_trailer) | wc -l) 79 + 80 + ifneq (0,$$(nr_kvm_exports)) 81 + $$(error ERROR ***\ 82 + $$(newline)found $$(nr_kvm_exports) unwanted occurrences of $(1):\ 83 + $$(newline) $(subst !SEP!,$$(newline) ,$(call get_kvm_exports,$(1)))\ 84 + $$(newline)in directories:\ 85 + $$(newline) $(srctree)/arch/x86/kvm\ 86 + $$(newline) $(srctree)/virt/kvm\ 87 + $$(newline)Use EXPORT_SYMBOL_FOR_KVM_INTERNAL, not $(1)) 88 + endif # nr_kvm_exports != 0 89 + undefine nr_kvm_exports 90 + endef # check_kvm_exports 91 + 92 + $(eval $(call check_kvm_exports,EXPORT_SYMBOL_GPL)) 93 + $(eval $(call check_kvm_exports,EXPORT_SYMBOL)) 94 + 95 + undefine check_kvm_exports 96 + undefine get_kvm_exports 97 + undefine exports_grep_trailer 98 + endif # CONFIG_KVM_X86
+63 -12
arch/x86/kvm/cpuid.c
··· 36 36 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; 37 37 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); 38 38 39 + bool kvm_is_configuring_cpu_caps __read_mostly; 40 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_configuring_cpu_caps); 41 + 39 42 struct cpuid_xstate_sizes { 40 43 u32 eax; 41 44 u32 ebx; ··· 537 534 BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); 538 535 539 536 /* 540 - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as 541 - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't 542 - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page 543 - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with 544 - * the core vCPU model on the fly. It would've been better to forbid any 545 - * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately 546 - * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do 537 + * KVM does not correctly handle changing guest CPUID after KVM_RUN or 538 + * while L2 is active, as MAXPHYADDR, GBPAGES support, AMD reserved bit 539 + * behavior, etc. aren't tracked in kvm_mmu_page_role, and L2 state 540 + * can't be adjusted (without breaking L2 in some way). As a result, 541 + * KVM may reuse SPs/SPTEs and/or run L2 with bad/misconfigured state. 542 + * 543 + * In practice, no sane VMM mucks with the core vCPU model on the fly. 544 + * It would've been better to forbid any KVM_SET_CPUID{,2} calls after 545 + * KVM_RUN or KVM_SET_NESTED_STATE altogether, but unfortunately some 546 + * VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do 547 547 * KVM_SET_CPUID{,2} again. To support this legacy behavior, check 548 548 * whether the supplied CPUID data is equal to what's already set. 
549 549 */ 550 - if (kvm_vcpu_has_run(vcpu)) { 550 + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu)) { 551 551 r = kvm_cpuid_check_equal(vcpu, e2, nent); 552 552 if (r) 553 553 goto err; ··· 829 823 /* DS is defined by ptrace-abi.h on 32-bit builds. */ 830 824 #undef DS 831 825 832 - void kvm_set_cpu_caps(void) 826 + void kvm_initialize_cpu_caps(void) 833 827 { 834 828 memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); 829 + 830 + WARN_ON_ONCE(kvm_is_configuring_cpu_caps); 831 + kvm_is_configuring_cpu_caps = true; 835 832 836 833 BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > 837 834 sizeof(boot_cpu_data.x86_capability)); ··· 1034 1025 F(AMX_FP16), 1035 1026 F(AVX_IFMA), 1036 1027 F(LAM), 1028 + F(MOVRS), 1037 1029 ); 1038 1030 1039 1031 kvm_cpu_cap_init(CPUID_7_1_ECX, ··· 1073 1063 SCATTERED_F(SGX_EDECCSSA), 1074 1064 ); 1075 1065 1066 + kvm_cpu_cap_init(CPUID_1E_1_EAX, 1067 + F(AMX_INT8_ALIAS), 1068 + F(AMX_BF16_ALIAS), 1069 + F(AMX_COMPLEX_ALIAS), 1070 + F(AMX_FP16_ALIAS), 1071 + F(AMX_FP8), 1072 + F(AMX_TF32), 1073 + F(AMX_AVX512), 1074 + F(AMX_MOVRS), 1075 + ); 1076 + 1076 1077 kvm_cpu_cap_init(CPUID_24_0_EBX, 1077 1078 F(AVX10_128), 1078 1079 F(AVX10_256), 1079 1080 F(AVX10_512), 1081 + ); 1082 + 1083 + kvm_cpu_cap_init(CPUID_24_1_ECX, 1084 + F(AVX10_VNNI_INT), 1080 1085 ); 1081 1086 1082 1087 kvm_cpu_cap_init(CPUID_8000_0001_ECX, ··· 1295 1270 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 1296 1271 } 1297 1272 } 1298 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); 1273 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_initialize_cpu_caps); 1299 1274 1300 1275 #undef F 1301 1276 #undef SCATTERED_F ··· 1649 1624 entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 1650 1625 break; 1651 1626 } 1627 + 1628 + max_idx = entry->eax = min(entry->eax, 1u); 1629 + 1630 + /* KVM only supports up to 0x1e.0x1, capped above via min(). 
*/ 1631 + if (max_idx >= 1) { 1632 + entry = do_host_cpuid(array, function, 1); 1633 + if (!entry) 1634 + goto out; 1635 + 1636 + cpuid_entry_override(entry, CPUID_1E_1_EAX); 1637 + entry->ebx = 0; 1638 + entry->ecx = 0; 1639 + entry->edx = 0; 1640 + } 1652 1641 break; 1653 1642 case 0x24: { 1654 1643 u8 avx10_version; ··· 1672 1633 break; 1673 1634 } 1674 1635 1636 + max_idx = entry->eax = min(entry->eax, 1u); 1675 1637 /* 1676 1638 * The AVX10 version is encoded in EBX[7:0]. Note, the version 1677 1639 * is guaranteed to be >=1 if AVX10 is supported. Note #2, the 1678 1640 * version needs to be captured before overriding EBX features! 1679 1641 */ 1680 - avx10_version = min_t(u8, entry->ebx & 0xff, 1); 1642 + avx10_version = min_t(u8, entry->ebx & 0xff, 2); 1681 1643 cpuid_entry_override(entry, CPUID_24_0_EBX); 1682 1644 entry->ebx |= avx10_version; 1683 1645 1684 - entry->eax = 0; 1685 1646 entry->ecx = 0; 1686 1647 entry->edx = 0; 1648 + 1649 + /* KVM only supports up to 0x24.0x1, capped above via min(). */ 1650 + if (max_idx >= 1) { 1651 + entry = do_host_cpuid(array, function, 1); 1652 + if (!entry) 1653 + goto out; 1654 + 1655 + cpuid_entry_override(entry, CPUID_24_1_ECX); 1656 + entry->eax = 0; 1657 + entry->ebx = 0; 1658 + entry->edx = 0; 1659 + } 1687 1660 break; 1688 1661 } 1689 1662 case KVM_CPUID_SIGNATURE: {
+11 -1
arch/x86/kvm/cpuid.h
··· 8 8 #include <uapi/asm/kvm_para.h> 9 9 10 10 extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; 11 - void kvm_set_cpu_caps(void); 11 + extern bool kvm_is_configuring_cpu_caps __read_mostly; 12 + 13 + void kvm_initialize_cpu_caps(void); 14 + 15 + static inline void kvm_finalize_cpu_caps(void) 16 + { 17 + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); 18 + kvm_is_configuring_cpu_caps = false; 19 + } 12 20 13 21 void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); 14 22 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, ··· 196 188 { 197 189 unsigned int x86_leaf = __feature_leaf(x86_feature); 198 190 191 + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); 199 192 kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); 200 193 } 201 194 ··· 204 195 { 205 196 unsigned int x86_leaf = __feature_leaf(x86_feature); 206 197 198 + WARN_ON_ONCE(!kvm_is_configuring_cpu_caps); 207 199 kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); 208 200 } 209 201
+1 -1
arch/x86/kvm/ioapic.c
··· 561 561 spin_lock(&ioapic->lock); 562 562 563 563 if (trigger_mode != IOAPIC_LEVEL_TRIG || 564 - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) 564 + kvm_lapic_suppress_eoi_broadcast(apic)) 565 565 return; 566 566 567 567 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+68 -9
arch/x86/kvm/lapic.c
··· 105 105 apic_test_vector(vector, apic->regs + APIC_IRR); 106 106 } 107 107 108 + static bool kvm_lapic_advertise_suppress_eoi_broadcast(struct kvm *kvm) 109 + { 110 + switch (kvm->arch.suppress_eoi_broadcast_mode) { 111 + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: 112 + return true; 113 + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: 114 + return false; 115 + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: 116 + /* 117 + * The default in-kernel I/O APIC emulates the 82093AA and does not 118 + * implement an EOI register. Some guests (e.g. Windows with the 119 + * Hyper-V role enabled) disable LAPIC EOI broadcast without 120 + * checking the I/O APIC version, which can cause level-triggered 121 + * interrupts to never be EOI'd. 122 + * 123 + * To avoid this, KVM doesn't advertise Suppress EOI Broadcast 124 + * support when using the default in-kernel I/O APIC. 125 + * 126 + * Historically, in split IRQCHIP mode, KVM always advertised 127 + * Suppress EOI Broadcast support but did not actually suppress 128 + * EOIs, resulting in quirky behavior. 129 + */ 130 + return !ioapic_in_kernel(kvm); 131 + default: 132 + WARN_ON_ONCE(1); 133 + return false; 134 + } 135 + } 136 + 137 + bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic) 138 + { 139 + struct kvm *kvm = apic->vcpu->kvm; 140 + 141 + if (!(kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 142 + return false; 143 + 144 + switch (kvm->arch.suppress_eoi_broadcast_mode) { 145 + case KVM_SUPPRESS_EOI_BROADCAST_ENABLED: 146 + return true; 147 + case KVM_SUPPRESS_EOI_BROADCAST_DISABLED: 148 + return false; 149 + case KVM_SUPPRESS_EOI_BROADCAST_QUIRKED: 150 + /* 151 + * Historically, in split IRQCHIP mode, KVM ignored the suppress 152 + * EOI broadcast bit set by the guest and broadcasts EOIs to the 153 + * userspace I/O APIC. 
For In-kernel I/O APIC, the support itself 154 + * is not advertised, can only be enabled via KVM_SET_APIC_STATE, 155 + * and KVM's I/O APIC doesn't emulate Directed EOIs; but if the 156 + * feature is enabled, it is respected (with odd behavior). 157 + */ 158 + return ioapic_in_kernel(kvm); 159 + default: 160 + WARN_ON_ONCE(1); 161 + return false; 162 + } 163 + } 164 + 108 165 __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); 109 166 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); 110 167 ··· 611 554 612 555 v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); 613 556 614 - /* 615 - * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) 616 - * which doesn't have EOI register; Some buggy OSes (e.g. Windows with 617 - * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC 618 - * version first and level-triggered interrupts never get EOIed in 619 - * IOAPIC. 620 - */ 557 + 621 558 if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && 622 - !ioapic_in_kernel(vcpu->kvm)) 559 + kvm_lapic_advertise_suppress_eoi_broadcast(vcpu->kvm)) 623 560 v |= APIC_LVR_DIRECTED_EOI; 624 561 kvm_lapic_set_reg(apic, APIC_LVR, v); 625 562 } ··· 1568 1517 1569 1518 /* Request a KVM exit to inform the userspace IOAPIC. */ 1570 1519 if (irqchip_split(apic->vcpu->kvm)) { 1520 + /* 1521 + * Don't exit to userspace if the guest has enabled Directed 1522 + * EOI, a.k.a. Suppress EOI Broadcasts, in which case the local 1523 + * APIC doesn't broadcast EOIs (the guest must EOI the target 1524 + * I/O APIC(s) directly). 1525 + */ 1526 + if (kvm_lapic_suppress_eoi_broadcast(apic)) 1527 + return; 1528 + 1571 1529 apic->vcpu->arch.pending_ioapic_eoi = vector; 1572 1530 kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); 1573 1531 return; ··· 3558 3498 * wait-for-SIPI (WFS). 
3559 3499 */ 3560 3500 if (!kvm_apic_init_sipi_allowed(vcpu)) { 3561 - WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); 3562 3501 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3563 3502 return 0; 3564 3503 }
+2
arch/x86/kvm/lapic.h
··· 231 231 232 232 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); 233 233 234 + bool kvm_lapic_suppress_eoi_broadcast(struct kvm_lapic *apic); 235 + 234 236 void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); 235 237 236 238 void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+5 -6
arch/x86/kvm/mmu/mmu.c
··· 4521 4521 arch.gfn = fault->gfn; 4522 4522 arch.error_code = fault->error_code; 4523 4523 arch.direct_map = vcpu->arch.mmu->root_role.direct; 4524 - arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); 4524 + if (arch.direct_map) 4525 + arch.cr3 = (unsigned long)INVALID_GPA; 4526 + else 4527 + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); 4525 4528 4526 4529 return kvm_setup_async_pf(vcpu, fault->addr, 4527 4530 kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); ··· 6034 6031 vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; 6035 6032 kvm_mmu_reset_context(vcpu); 6036 6033 6037 - /* 6038 - * Changing guest CPUID after KVM_RUN is forbidden, see the comment in 6039 - * kvm_arch_vcpu_ioctl(). 6040 - */ 6041 - KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); 6034 + KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm); 6042 6035 } 6043 6036 6044 6037 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+1 -1
arch/x86/kvm/pmu.c
··· 853 853 { 854 854 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 855 855 856 - if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) 856 + if (KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm)) 857 857 return; 858 858 859 859 /*
+19
arch/x86/kvm/reverse_cpuid.h
··· 44 44 #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) 45 45 #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) 46 46 47 + /* 48 + * Intel-defined sub-features, CPUID level 0x0000001E:1 (EAX). Note, several 49 + * of the bits are aliases to features of the same name that are enumerated via 50 + * various CPUID.0x7 sub-leafs. 51 + */ 52 + #define X86_FEATURE_AMX_INT8_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 0) 53 + #define X86_FEATURE_AMX_BF16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 1) 54 + #define X86_FEATURE_AMX_COMPLEX_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 2) 55 + #define X86_FEATURE_AMX_FP16_ALIAS KVM_X86_FEATURE(CPUID_1E_1_EAX, 3) 56 + #define X86_FEATURE_AMX_FP8 KVM_X86_FEATURE(CPUID_1E_1_EAX, 4) 57 + #define X86_FEATURE_AMX_TF32 KVM_X86_FEATURE(CPUID_1E_1_EAX, 6) 58 + #define X86_FEATURE_AMX_AVX512 KVM_X86_FEATURE(CPUID_1E_1_EAX, 7) 59 + #define X86_FEATURE_AMX_MOVRS KVM_X86_FEATURE(CPUID_1E_1_EAX, 8) 60 + 47 61 /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ 48 62 #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) 49 63 #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) 50 64 #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) 65 + 66 + /* Intel-defined sub-features, CPUID level 0x00000024:1 (ECX) */ 67 + #define X86_FEATURE_AVX10_VNNI_INT KVM_X86_FEATURE(CPUID_24_1_ECX, 2) 51 68 52 69 /* CPUID level 0x80000007 (EDX). */ 53 70 #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) ··· 107 90 [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, 108 91 [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, 109 92 [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, 93 + [CPUID_1E_1_EAX] = { 0x1e, 1, CPUID_EAX}, 94 + [CPUID_24_1_ECX] = { 0x24, 1, CPUID_ECX}, 110 95 }; 111 96 112 97 /*
+2 -1
arch/x86/kvm/svm/svm.c
··· 5259 5259 5260 5260 static __init void svm_set_cpu_caps(void) 5261 5261 { 5262 - kvm_set_cpu_caps(); 5262 + kvm_initialize_cpu_caps(); 5263 5263 5264 5264 kvm_caps.supported_perf_cap = 0; 5265 5265 ··· 5343 5343 kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); 5344 5344 5345 5345 kvm_setup_xss_caps(); 5346 + kvm_finalize_cpu_caps(); 5346 5347 } 5347 5348 5348 5349 static __init int svm_hardware_setup(void)
+2 -1
arch/x86/kvm/vmx/vmx.c
··· 8039 8039 8040 8040 static __init void vmx_set_cpu_caps(void) 8041 8041 { 8042 - kvm_set_cpu_caps(); 8042 + kvm_initialize_cpu_caps(); 8043 8043 8044 8044 /* CPUID 0x1 */ 8045 8045 if (nested) ··· 8098 8098 } 8099 8099 8100 8100 kvm_setup_xss_caps(); 8101 + kvm_finalize_cpu_caps(); 8101 8102 } 8102 8103 8103 8104 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+50 -31
arch/x86/kvm/x86.c
··· 121 121 122 122 #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE 123 123 124 - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ 125 - KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) 124 + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ 125 + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK | \ 126 + KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST | \ 127 + KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) 126 128 127 129 static void update_cr8_intercept(struct kvm_vcpu *vcpu); 128 130 static void process_nmi(struct kvm_vcpu *vcpu); ··· 2316 2314 u64 val; 2317 2315 2318 2316 /* 2319 - * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does 2320 - * not support modifying the guest vCPU model on the fly, e.g. changing 2321 - * the nVMX capabilities while L2 is running is nonsensical. Allow 2322 - * writes of the same value, e.g. to allow userspace to blindly stuff 2323 - * all MSRs when emulating RESET. 2317 + * Reject writes to immutable feature MSRs if the vCPU model is frozen, 2318 + * as KVM doesn't support modifying the guest vCPU model on the fly, 2319 + * e.g. changing the VMX capabilities MSRs while L2 is active is 2320 + * nonsensical. Allow writes of the same value, e.g. so that userspace 2321 + * can blindly stuff all MSRs when emulating RESET. 
2324 2322 */ 2325 - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && 2323 + if (!kvm_can_set_cpuid_and_feature_msrs(vcpu) && 2324 + kvm_is_immutable_feature_msr(index) && 2326 2325 (do_get_msr(vcpu, index, &val) || *data != val)) 2327 2326 return -EINVAL; 2328 2327 ··· 4099 4096 break; 4100 4097 case MSR_KVM_WALL_CLOCK_NEW: 4101 4098 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 4102 - return 1; 4099 + return KVM_MSR_RET_UNSUPPORTED; 4103 4100 4104 4101 vcpu->kvm->arch.wall_clock = data; 4105 4102 kvm_write_wall_clock(vcpu->kvm, data, 0); 4106 4103 break; 4107 4104 case MSR_KVM_WALL_CLOCK: 4108 4105 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 4109 - return 1; 4106 + return KVM_MSR_RET_UNSUPPORTED; 4110 4107 4111 4108 vcpu->kvm->arch.wall_clock = data; 4112 4109 kvm_write_wall_clock(vcpu->kvm, data, 0); 4113 4110 break; 4114 4111 case MSR_KVM_SYSTEM_TIME_NEW: 4115 4112 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 4116 - return 1; 4113 + return KVM_MSR_RET_UNSUPPORTED; 4117 4114 4118 4115 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); 4119 4116 break; 4120 4117 case MSR_KVM_SYSTEM_TIME: 4121 4118 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 4122 - return 1; 4119 + return KVM_MSR_RET_UNSUPPORTED; 4123 4120 4124 4121 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); 4125 4122 break; 4126 4123 case MSR_KVM_ASYNC_PF_EN: 4127 4124 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 4128 - return 1; 4125 + return KVM_MSR_RET_UNSUPPORTED; 4129 4126 4130 4127 if (kvm_pv_enable_async_pf(vcpu, data)) 4131 4128 return 1; 4132 4129 break; 4133 4130 case MSR_KVM_ASYNC_PF_INT: 4134 4131 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4135 - return 1; 4132 + return KVM_MSR_RET_UNSUPPORTED; 4136 4133 4137 4134 if (kvm_pv_enable_async_pf_int(vcpu, data)) 4138 4135 return 1; 4139 4136 break; 4140 4137 case MSR_KVM_ASYNC_PF_ACK: 4141 4138 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4142 - return 1; 
4139 + return KVM_MSR_RET_UNSUPPORTED; 4143 4140 if (data & 0x1) { 4144 4141 /* 4145 4142 * Pairs with the smp_mb__after_atomic() in ··· 4152 4149 break; 4153 4150 case MSR_KVM_STEAL_TIME: 4154 4151 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 4155 - return 1; 4152 + return KVM_MSR_RET_UNSUPPORTED; 4156 4153 4157 4154 if (unlikely(!sched_info_on())) 4158 4155 return 1; ··· 4170 4167 break; 4171 4168 case MSR_KVM_PV_EOI_EN: 4172 4169 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 4173 - return 1; 4170 + return KVM_MSR_RET_UNSUPPORTED; 4174 4171 4175 4172 if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) 4176 4173 return 1; ··· 4178 4175 4179 4176 case MSR_KVM_POLL_CONTROL: 4180 4177 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 4181 - return 1; 4178 + return KVM_MSR_RET_UNSUPPORTED; 4182 4179 4183 4180 /* only enable bit supported */ 4184 4181 if (data & (-1ULL << 1)) ··· 4479 4476 break; 4480 4477 case MSR_KVM_WALL_CLOCK: 4481 4478 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 4482 - return 1; 4479 + return KVM_MSR_RET_UNSUPPORTED; 4483 4480 4484 4481 msr_info->data = vcpu->kvm->arch.wall_clock; 4485 4482 break; 4486 4483 case MSR_KVM_WALL_CLOCK_NEW: 4487 4484 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 4488 - return 1; 4485 + return KVM_MSR_RET_UNSUPPORTED; 4489 4486 4490 4487 msr_info->data = vcpu->kvm->arch.wall_clock; 4491 4488 break; 4492 4489 case MSR_KVM_SYSTEM_TIME: 4493 4490 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) 4494 - return 1; 4491 + return KVM_MSR_RET_UNSUPPORTED; 4495 4492 4496 4493 msr_info->data = vcpu->arch.time; 4497 4494 break; 4498 4495 case MSR_KVM_SYSTEM_TIME_NEW: 4499 4496 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) 4500 - return 1; 4497 + return KVM_MSR_RET_UNSUPPORTED; 4501 4498 4502 4499 msr_info->data = vcpu->arch.time; 4503 4500 break; 4504 4501 case MSR_KVM_ASYNC_PF_EN: 4505 4502 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) 4506 - return 1; 4503 + return KVM_MSR_RET_UNSUPPORTED; 4507 4504 4508 4505 
msr_info->data = vcpu->arch.apf.msr_en_val; 4509 4506 break; 4510 4507 case MSR_KVM_ASYNC_PF_INT: 4511 4508 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4512 - return 1; 4509 + return KVM_MSR_RET_UNSUPPORTED; 4513 4510 4514 4511 msr_info->data = vcpu->arch.apf.msr_int_val; 4515 4512 break; 4516 4513 case MSR_KVM_ASYNC_PF_ACK: 4517 4514 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) 4518 - return 1; 4515 + return KVM_MSR_RET_UNSUPPORTED; 4519 4516 4520 4517 msr_info->data = 0; 4521 4518 break; 4522 4519 case MSR_KVM_STEAL_TIME: 4523 4520 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) 4524 - return 1; 4521 + return KVM_MSR_RET_UNSUPPORTED; 4525 4522 4526 4523 msr_info->data = vcpu->arch.st.msr_val; 4527 4524 break; 4528 4525 case MSR_KVM_PV_EOI_EN: 4529 4526 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) 4530 - return 1; 4527 + return KVM_MSR_RET_UNSUPPORTED; 4531 4528 4532 4529 msr_info->data = vcpu->arch.pv_eoi.msr_val; 4533 4530 break; 4534 4531 case MSR_KVM_POLL_CONTROL: 4535 4532 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) 4536 - return 1; 4533 + return KVM_MSR_RET_UNSUPPORTED; 4537 4534 4538 4535 msr_info->data = vcpu->arch.msr_kvm_poll_control; 4539 4536 break; ··· 4934 4931 break; 4935 4932 case KVM_CAP_X2APIC_API: 4936 4933 r = KVM_X2APIC_API_VALID_FLAGS; 4934 + if (kvm && !irqchip_split(kvm)) 4935 + r &= ~KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST; 4937 4936 break; 4938 4937 case KVM_CAP_NESTED_STATE: 4939 4938 r = kvm_x86_ops.nested_ops->get_state ? 
··· 6753 6748 if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) 6754 6749 break; 6755 6750 6751 + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && 6752 + (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST)) 6753 + break; 6754 + 6755 + if ((cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) && 6756 + !irqchip_split(kvm)) 6757 + break; 6758 + 6756 6759 if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) 6757 6760 kvm->arch.x2apic_format = true; 6758 6761 if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) 6759 6762 kvm->arch.x2apic_broadcast_quirk_disabled = true; 6763 + 6764 + if (cap->args[0] & KVM_X2APIC_ENABLE_SUPPRESS_EOI_BROADCAST) 6765 + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_ENABLED; 6766 + if (cap->args[0] & KVM_X2APIC_DISABLE_SUPPRESS_EOI_BROADCAST) 6767 + kvm->arch.suppress_eoi_broadcast_mode = KVM_SUPPRESS_EOI_BROADCAST_DISABLED; 6760 6768 6761 6769 r = 0; 6762 6770 break; ··· 11627 11609 if (is_guest_mode(vcpu)) { 11628 11610 int r = kvm_check_nested_events(vcpu); 11629 11611 11630 - WARN_ON_ONCE(r == -EBUSY); 11631 - if (r < 0) 11612 + if (r < 0 && r != -EBUSY) 11632 11613 return 0; 11633 11614 } 11634 11615 ··· 12175 12158 return; 12176 12159 12177 12160 if (is_pae_paging(vcpu)) { 12161 + kvm_vcpu_srcu_read_lock(vcpu); 12178 12162 for (i = 0 ; i < 4 ; i++) 12179 12163 sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); 12180 12164 sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; 12165 + kvm_vcpu_srcu_read_unlock(vcpu); 12181 12166 } 12182 12167 } 12183 12168 ··· 13339 13320 #endif 13340 13321 13341 13322 kvm_mmu_pre_destroy_vm(kvm); 13342 - static_call_cond(kvm_x86_vm_pre_destroy)(kvm); 13323 + kvm_x86_call(vm_pre_destroy)(kvm); 13343 13324 } 13344 13325 13345 13326 void kvm_arch_destroy_vm(struct kvm *kvm)
+13 -2
arch/x86/kvm/x86.h
··· 172 172 indirect_branch_prediction_barrier(); 173 173 } 174 174 175 - static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) 175 + /* 176 + * Disallow modifying CPUID and feature MSRs, which affect the core virtual CPU 177 + * model exposed to the guest and virtualized by KVM, if the vCPU has already 178 + * run or is in guest mode (L2). In both cases, KVM has already consumed the 179 + * current virtual CPU model, and doesn't support "unwinding" to react to the 180 + * new model. 181 + * 182 + * Note, the only way is_guest_mode() can be true with 'last_vmentry_cpu == -1' 183 + * is if userspace sets CPUID and feature MSRs (to enable VMX/SVM), then sets 184 + * nested state, and then attempts to set CPUID and/or feature MSRs *again*. 185 + */ 186 + static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) 176 187 { 177 - return vcpu->arch.last_vmentry_cpu != -1; 188 + return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); 178 189 } 179 190 180 191 static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)