Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-vmx-6.20' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.20

- Fix an SGX bug where KVM would incorrectly try to handle EPCM #PFs, by always
reflecting EPCM #PFs back into the guest. KVM doesn't shadow EPCM entries,
and so EPCM violations cannot be due to KVM interference, and can't be
resolved by KVM.

- Fix a bug where KVM would register its posted interrupt wakeup handler even
if loading kvm-intel.ko ultimately failed.

- Disallow access to vmcs12 fields that aren't fully supported, mostly to
avoid weirdness and complexity for FRED and other features, where KVM wants
to enable VMCS shadowing for fields that conditionally exist.

- Print out the "bad" offsets and values if kvm-intel.ko refuses to load (or
refuses to online a CPU) due to a VMCS config mismatch.

+171 -41
+1 -1
arch/x86/kvm/vmx/hyperv_evmcs.c
··· 7 7 #include "hyperv_evmcs.h" 8 8 9 9 #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) 10 - #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ 10 + #define EVMCS1_FIELD(number, name, clean_field)[ENC_TO_VMCS12_IDX(number)] = \ 11 11 {EVMCS1_OFFSET(name), clean_field} 12 12 13 13 const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+1 -1
arch/x86/kvm/vmx/hyperv_evmcs.h
··· 130 130 u16 *clean_field) 131 131 { 132 132 const struct evmcs_field *evmcs_field; 133 - unsigned int index = ROL16(field, 6); 133 + unsigned int index = ENC_TO_VMCS12_IDX(field); 134 134 135 135 if (unlikely(index >= nr_evmcs_1_fields)) 136 136 return -ENOENT;
+18 -13
arch/x86/kvm/vmx/nested.c
··· 86 86 pr_err("Missing field from shadow_read_only_field %x\n", 87 87 field + 1); 88 88 89 + if (get_vmcs12_field_offset(field) < 0) 90 + continue; 91 + 89 92 clear_bit(field, vmx_vmread_bitmap); 90 93 if (field & 1) 91 94 #ifdef CONFIG_X86_64 ··· 114 111 field <= GUEST_TR_AR_BYTES, 115 112 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 116 113 114 + if (get_vmcs12_field_offset(field) < 0) 115 + continue; 116 + 117 117 /* 118 - * PML and the preemption timer can be emulated, but the 119 - * processor cannot vmwrite to fields that don't exist 120 - * on bare metal. 118 + * KVM emulates PML and the VMX preemption timer irrespective 119 + * of hardware support, but shadowing their related VMCS fields 120 + * requires hardware support as the CPU will reject VMWRITEs to 121 + * fields that don't exist. 121 122 */ 122 123 switch (field) { 123 124 case GUEST_PML_INDEX: ··· 130 123 break; 131 124 case VMX_PREEMPTION_TIMER_VALUE: 132 125 if (!cpu_has_vmx_preemption_timer()) 133 - continue; 134 - break; 135 - case GUEST_INTR_STATUS: 136 - if (!cpu_has_vmx_apicv()) 137 126 continue; 138 127 break; 139 128 default: ··· 7077 7074 } 7078 7075 } 7079 7076 7080 - /* 7081 - * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 7082 - * that madness to get the encoding for comparison. 7083 - */ 7084 - #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 7085 - 7086 7077 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7087 7078 { 7088 7079 /* ··· 7403 7406 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7404 7407 { 7405 7408 int i; 7409 + 7410 + /* 7411 + * Note! The set of supported vmcs12 fields is consumed by both VMX 7412 + * MSR and shadow VMCS setup. 7413 + */ 7414 + nested_vmx_setup_vmcs12_fields(); 7415 + 7416 + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7406 7417 7407 7418 if (!cpu_has_vmx_shadow_vmcs()) 7408 7419 enable_shadow_vmcs = 0;
+9
arch/x86/kvm/vmx/vmcs.h
··· 11 11 12 12 #include "capabilities.h" 13 13 14 + /* 15 + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6 as a very 16 + * rudimentary compression of the range of indices. The compression ratio is 17 + * good enough to allow KVM to use a (very sparsely populated) array without 18 + * wasting too much memory, while the "algorithm" is fast enough to be used to 19 + * lookup vmcs12 fields on-demand, e.g. for emulation. 20 + */ 14 21 #define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) 22 + #define VMCS12_IDX_TO_ENC(idx) ROL16(idx, 10) 23 + #define ENC_TO_VMCS12_IDX(enc) ROL16(enc, 6) 15 24 16 25 struct vmcs_hdr { 17 26 u32 revision_id:31;
+70 -4
arch/x86/kvm/vmx/vmcs12.c
··· 4 4 #include "vmcs12.h" 5 5 6 6 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) 7 - #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) 7 + #define FIELD(number, name) [ENC_TO_VMCS12_IDX(number)] = VMCS12_OFFSET(name) 8 8 #define FIELD64(number, name) \ 9 9 FIELD(number, name), \ 10 - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) 10 + [ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32) 11 11 12 - const unsigned short vmcs12_field_offsets[] = { 12 + static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = { 13 13 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 14 14 FIELD(POSTED_INTR_NV, posted_intr_nv), 15 15 FIELD(GUEST_ES_SELECTOR, guest_es_selector), ··· 158 158 FIELD(HOST_SSP, host_ssp), 159 159 FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), 160 160 }; 161 - const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); 161 + 162 + u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init; 163 + unsigned int nr_vmcs12_fields __ro_after_init; 164 + 165 + #define VMCS12_CASE64(enc) case enc##_HIGH: case enc 166 + 167 + static __init bool cpu_has_vmcs12_field(unsigned int idx) 168 + { 169 + switch (VMCS12_IDX_TO_ENC(idx)) { 170 + case VIRTUAL_PROCESSOR_ID: 171 + return cpu_has_vmx_vpid(); 172 + case POSTED_INTR_NV: 173 + return cpu_has_vmx_posted_intr(); 174 + VMCS12_CASE64(TSC_MULTIPLIER): 175 + return cpu_has_vmx_tsc_scaling(); 176 + case TPR_THRESHOLD: 177 + VMCS12_CASE64(VIRTUAL_APIC_PAGE_ADDR): 178 + return cpu_has_vmx_tpr_shadow(); 179 + VMCS12_CASE64(APIC_ACCESS_ADDR): 180 + return cpu_has_vmx_virtualize_apic_accesses(); 181 + VMCS12_CASE64(POSTED_INTR_DESC_ADDR): 182 + return cpu_has_vmx_posted_intr(); 183 + case GUEST_INTR_STATUS: 184 + return cpu_has_vmx_virtual_intr_delivery(); 185 + VMCS12_CASE64(VM_FUNCTION_CONTROL): 186 + VMCS12_CASE64(EPTP_LIST_ADDRESS): 187 + return cpu_has_vmx_vmfunc(); 188 + VMCS12_CASE64(EPT_POINTER): 189 + return 
cpu_has_vmx_ept(); 190 + VMCS12_CASE64(XSS_EXIT_BITMAP): 191 + return cpu_has_vmx_xsaves(); 192 + VMCS12_CASE64(ENCLS_EXITING_BITMAP): 193 + return cpu_has_vmx_encls_vmexit(); 194 + VMCS12_CASE64(GUEST_IA32_PERF_GLOBAL_CTRL): 195 + VMCS12_CASE64(HOST_IA32_PERF_GLOBAL_CTRL): 196 + return cpu_has_load_perf_global_ctrl(); 197 + case SECONDARY_VM_EXEC_CONTROL: 198 + return cpu_has_secondary_exec_ctrls(); 199 + case GUEST_S_CET: 200 + case GUEST_SSP: 201 + case GUEST_INTR_SSP_TABLE: 202 + case HOST_S_CET: 203 + case HOST_SSP: 204 + case HOST_INTR_SSP_TABLE: 205 + return cpu_has_load_cet_ctrl(); 206 + 207 + /* KVM always emulates PML and the VMX preemption timer in software. */ 208 + case GUEST_PML_INDEX: 209 + case VMX_PREEMPTION_TIMER_VALUE: 210 + default: 211 + return true; 212 + } 213 + } 214 + 215 + void __init nested_vmx_setup_vmcs12_fields(void) 216 + { 217 + unsigned int i; 218 + 219 + for (i = 0; i < ARRAY_SIZE(kvm_supported_vmcs12_field_offsets); i++) { 220 + if (!kvm_supported_vmcs12_field_offsets[i] || 221 + !cpu_has_vmcs12_field(i)) 222 + continue; 223 + 224 + vmcs12_field_offsets[i] = kvm_supported_vmcs12_field_offsets[i]; 225 + nr_vmcs12_fields = i + 1; 226 + } 227 + }
+5 -3
arch/x86/kvm/vmx/vmcs12.h
··· 374 374 CHECK_OFFSET(guest_pml_index, 996); 375 375 } 376 376 377 - extern const unsigned short vmcs12_field_offsets[]; 378 - extern const unsigned int nr_vmcs12_fields; 377 + extern u16 vmcs12_field_offsets[] __ro_after_init; 378 + extern unsigned int nr_vmcs12_fields __ro_after_init; 379 + 380 + void __init nested_vmx_setup_vmcs12_fields(void); 379 381 380 382 static inline short get_vmcs12_field_offset(unsigned long field) 381 383 { ··· 387 385 if (field >> 15) 388 386 return -ENOENT; 389 387 390 - index = ROL16(field, 6); 388 + index = ENC_TO_VMCS12_IDX(field); 391 389 if (index >= nr_vmcs12_fields) 392 390 return -ENOENT; 393 391
+67 -19
arch/x86/kvm/vmx/vmx.c
··· 2921 2921 } 2922 2922 if (nested) 2923 2923 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2924 + 2924 2925 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2925 - pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2926 + u32 *gold = (void *)&vmcs_config; 2927 + u32 *mine = (void *)&vmcs_conf; 2928 + int i; 2929 + 2930 + BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32)); 2931 + 2932 + pr_err("VMCS config on CPU %d doesn't match reference config:", cpu); 2933 + for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) { 2934 + if (gold[i] == mine[i]) 2935 + continue; 2936 + 2937 + pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x", 2938 + i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]); 2939 + } 2940 + pr_cont("\n"); 2926 2941 return -EIO; 2927 2942 } 2928 2943 return 0; ··· 5318 5303 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5319 5304 } 5320 5305 5306 + static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code) 5307 + { 5308 + unsigned long cr2 = vmx_get_exit_qual(vcpu); 5309 + 5310 + if (vcpu->arch.apf.host_apf_flags) 5311 + goto handle_pf; 5312 + 5313 + /* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */ 5314 + WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr); 5315 + 5316 + /* 5317 + * On SGX2 hardware, EPCM violations are delivered as #PF with the SGX 5318 + * flag set in the error code (SGX1 hardware generates #GP(0)). EPCM 5319 + * violations have nothing to do with shadow paging and can never be 5320 + * resolved by KVM; always reflect them into the guest. 
5321 + */ 5322 + if (error_code & PFERR_SGX_MASK) { 5323 + WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) || 5324 + !cpu_feature_enabled(X86_FEATURE_SGX2)); 5325 + 5326 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) 5327 + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5328 + else 5329 + kvm_inject_gp(vcpu, 0); 5330 + return 1; 5331 + } 5332 + 5333 + /* 5334 + * If EPT is enabled, fixup and inject the #PF. KVM intercepts #PFs 5335 + * only to set PFERR_RSVD as appropriate (hardware won't set RSVD due 5336 + * to the GPA being legal with respect to host.MAXPHYADDR). 5337 + */ 5338 + if (enable_ept) { 5339 + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5340 + return 1; 5341 + } 5342 + 5343 + handle_pf: 5344 + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5345 + } 5346 + 5321 5347 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5322 5348 { 5323 5349 struct vcpu_vmx *vmx = to_vmx(vcpu); 5324 5350 struct kvm_run *kvm_run = vcpu->run; 5325 5351 u32 intr_info, ex_no, error_code; 5326 - unsigned long cr2, dr6; 5352 + unsigned long dr6; 5327 5353 u32 vect_info; 5328 5354 5329 5355 vect_info = vmx->idt_vectoring_info; ··· 5439 5383 return 0; 5440 5384 } 5441 5385 5442 - if (is_page_fault(intr_info)) { 5443 - cr2 = vmx_get_exit_qual(vcpu); 5444 - if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5445 - /* 5446 - * EPT will cause page fault only if we need to 5447 - * detect illegal GPAs. 5448 - */ 5449 - WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5450 - kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5451 - return 1; 5452 - } else 5453 - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5454 - } 5386 + if (is_page_fault(intr_info)) 5387 + return vmx_handle_page_fault(vcpu, error_code); 5455 5388 5456 5389 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5457 5390 ··· 8717 8672 * can hide/show features based on kvm_cpu_cap_has(). 
8718 8673 */ 8719 8674 if (nested) { 8720 - nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8721 - 8722 8675 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8723 8676 if (r) 8724 8677 return r; 8725 8678 } 8726 8679 8727 8680 r = alloc_kvm_area(); 8728 - if (r && nested) 8729 - nested_vmx_hardware_unsetup(); 8681 + if (r) 8682 + goto err_kvm_area; 8730 8683 8731 8684 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8732 8685 ··· 8751 8708 8752 8709 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8753 8710 8711 + return 0; 8712 + 8713 + err_kvm_area: 8714 + if (nested) 8715 + nested_vmx_hardware_unsetup(); 8754 8716 return r; 8755 8717 } 8756 8718