Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
"arm64:

- Fix a couple of bugs affecting pKVM's PSCI relay implementation
when running in the hVHE mode, resulting in the host being entered
with the MMU in an unknown state, and EL2 being in the wrong mode

x86:

- Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow

- Ensure DEBUGCTL is context switched on AMD to avoid running the
guest with the host's value, which can lead to unexpected bus lock
#DBs

- Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't
properly emulate BTF. KVM's lack of context switching has meant BTF
has always been broken to some extent

- Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as
the guest can enable DebugSwap without KVM's knowledge

- Fix a bug in mmu_stress_tests where a vCPU could finish the "writes
to RO memory" phase without actually generating a write-protection
fault

- Fix a printf() goof in the SEV smoke test that causes build
failures with -Werror

- Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when
PERFMON_V2 isn't supported by KVM"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86: Explicitly zero EAX and EBX when PERFMON_V2 isn't supported by KVM
KVM: selftests: Fix printf() format goof in SEV smoke test
KVM: selftests: Ensure all vCPUs hit -EFAULT during initial RO stage
KVM: SVM: Don't rely on DebugSwap to restore host DR0..DR3
KVM: SVM: Save host DR masks on CPUs with DebugSwap
KVM: arm64: Initialize SCTLR_EL1 in __kvm_hyp_init_cpu()
KVM: arm64: Initialize HCR_EL2.E2H early
KVM: x86: Snapshot the host's DEBUGCTL after disabling IRQs
KVM: SVM: Manually context switch DEBUGCTL if LBR virtualization is disabled
KVM: x86: Snapshot the host's DEBUGCTL in common x86
KVM: SVM: Suppress DEBUGCTL.BTF on AMD
KVM: SVM: Drop DEBUGCTL[5:2] from guest's effective value
KVM: selftests: Assert that STI blocking isn't set after event injection
KVM: SVM: Set RFLAGS.IF=1 in C code, to get VMRUN out of the STI shadow

+130 -62
+26 -5
arch/arm64/include/asm/el2_setup.h
··· 16 16 #include <asm/sysreg.h> 17 17 #include <linux/irqchip/arm-gic-v3.h> 18 18 19 + .macro init_el2_hcr val 20 + mov_q x0, \val 21 + 22 + /* 23 + * Compliant CPUs advertise their VHE-onlyness with 24 + * ID_AA64MMFR4_EL1.E2H0 < 0. On such CPUs HCR_EL2.E2H is RES1, but it 25 + * can reset into an UNKNOWN state and might not read as 1 until it has 26 + * been initialized explicitly. 27 + * 28 + * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but 29 + * don't advertise it (they predate this relaxation). 30 + * 31 + * Initialize HCR_EL2.E2H so that later code can rely upon HCR_EL2.E2H 32 + * indicating whether the CPU is running in E2H mode. 33 + */ 34 + mrs_s x1, SYS_ID_AA64MMFR4_EL1 35 + sbfx x1, x1, #ID_AA64MMFR4_EL1_E2H0_SHIFT, #ID_AA64MMFR4_EL1_E2H0_WIDTH 36 + cmp x1, #0 37 + b.ge .LnVHE_\@ 38 + 39 + orr x0, x0, #HCR_E2H 40 + .LnVHE_\@: 41 + msr hcr_el2, x0 42 + isb 43 + .endm 44 + 19 45 .macro __init_el2_sctlr 20 46 mov_q x0, INIT_SCTLR_EL2_MMU_OFF 21 47 msr sctlr_el2, x0 ··· 268 242 msr_s SYS_GCSCR_EL1, xzr 269 243 msr_s SYS_GCSCRE0_EL1, xzr 270 244 .Lskip_gcs_\@: 271 - .endm 272 - 273 - .macro __init_el2_nvhe_prepare_eret 274 - mov x0, #INIT_PSTATE_EL1 275 - msr spsr_el2, x0 276 245 .endm 277 246 278 247 .macro __init_el2_mpam
+3 -19
arch/arm64/kernel/head.S
··· 298 298 msr sctlr_el2, x0 299 299 isb 300 300 0: 301 - mov_q x0, HCR_HOST_NVHE_FLAGS 302 301 303 - /* 304 - * Compliant CPUs advertise their VHE-onlyness with 305 - * ID_AA64MMFR4_EL1.E2H0 < 0. HCR_EL2.E2H can be 306 - * RES1 in that case. Publish the E2H bit early so that 307 - * it can be picked up by the init_el2_state macro. 308 - * 309 - * Fruity CPUs seem to have HCR_EL2.E2H set to RAO/WI, but 310 - * don't advertise it (they predate this relaxation). 311 - */ 312 - mrs_s x1, SYS_ID_AA64MMFR4_EL1 313 - tbz x1, #(ID_AA64MMFR4_EL1_E2H0_SHIFT + ID_AA64MMFR4_EL1_E2H0_WIDTH - 1), 1f 314 - 315 - orr x0, x0, #HCR_E2H 316 - 1: 317 - msr hcr_el2, x0 318 - isb 319 - 302 + init_el2_hcr HCR_HOST_NVHE_FLAGS 320 303 init_el2_state 321 304 322 305 /* Hypervisor stub */ ··· 322 339 msr sctlr_el1, x1 323 340 mov x2, xzr 324 341 3: 325 - __init_el2_nvhe_prepare_eret 342 + mov x0, #INIT_PSTATE_EL1 343 + msr spsr_el2, x0 326 344 327 345 mov w0, #BOOT_CPU_MODE_EL2 328 346 orr x0, x0, x2
+7 -3
arch/arm64/kvm/hyp/nvhe/hyp-init.S
··· 73 73 eret 74 74 SYM_CODE_END(__kvm_hyp_init) 75 75 76 + /* 77 + * Initialize EL2 CPU state to sane values. 78 + * 79 + * HCR_EL2.E2H must have been initialized already. 80 + */ 76 81 SYM_CODE_START_LOCAL(__kvm_init_el2_state) 77 - /* Initialize EL2 CPU state to sane values. */ 78 82 init_el2_state // Clobbers x0..x2 79 83 finalise_el2_state 80 84 ret ··· 210 206 211 207 2: msr SPsel, #1 // We want to use SP_EL{1,2} 212 208 213 - bl __kvm_init_el2_state 209 + init_el2_hcr 0 214 210 215 - __init_el2_nvhe_prepare_eret 211 + bl __kvm_init_el2_state 216 212 217 213 /* Enable MMU, set vectors and stack. */ 218 214 mov x0, x28
+3
arch/arm64/kvm/hyp/nvhe/psci-relay.c
··· 218 218 if (is_cpu_on) 219 219 release_boot_args(boot_args); 220 220 221 + write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR); 222 + write_sysreg(INIT_PSTATE_EL1, SPSR_EL2); 223 + 221 224 __host_enter(host_ctxt); 222 225 } 223 226
+1
arch/x86/include/asm/kvm_host.h
··· 780 780 u32 pkru; 781 781 u32 hflags; 782 782 u64 efer; 783 + u64 host_debugctl; 783 784 u64 apic_base; 784 785 struct kvm_lapic *apic; /* kernel irqchip context */ 785 786 bool load_eoi_exitmap_pending;
+1 -1
arch/x86/kvm/cpuid.c
··· 1763 1763 1764 1764 entry->ecx = entry->edx = 0; 1765 1765 if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { 1766 - entry->eax = entry->ebx; 1766 + entry->eax = entry->ebx = 0; 1767 1767 break; 1768 1768 } 1769 1769
+17 -7
arch/x86/kvm/svm/sev.c
··· 4590 4590 4591 4591 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) 4592 4592 { 4593 + struct kvm *kvm = svm->vcpu.kvm; 4594 + 4593 4595 /* 4594 4596 * All host state for SEV-ES guests is categorized into three swap types 4595 4597 * based on how it is handled by hardware during a world switch: ··· 4615 4613 4616 4614 /* 4617 4615 * If DebugSwap is enabled, debug registers are loaded but NOT saved by 4618 - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both 4619 - * saves and loads debug registers (Type-A). 4616 + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does 4617 + * not save or load debug registers. Sadly, KVM can't prevent SNP 4618 + * guests from lying about DebugSwap on secondary vCPUs, i.e. the 4619 + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what 4620 + * the guest has actually enabled (or not!) in the VMSA. 4621 + * 4622 + * If DebugSwap is *possible*, save the masks so that they're restored 4623 + * if the guest enables DebugSwap. But for the DRs themselves, do NOT 4624 + * rely on the CPU to restore the host values; KVM will restore them as 4625 + * needed in common code, via hw_breakpoint_restore(). Note, KVM does 4626 + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs 4627 + * don't need to be restored per se, KVM just needs to ensure they are 4628 + * loaded with the correct values *if* the CPU writes the MSRs. 
4620 4629 */ 4621 - if (sev_vcpu_has_debug_swap(svm)) { 4622 - hostsa->dr0 = native_get_debugreg(0); 4623 - hostsa->dr1 = native_get_debugreg(1); 4624 - hostsa->dr2 = native_get_debugreg(2); 4625 - hostsa->dr3 = native_get_debugreg(3); 4630 + if (sev_vcpu_has_debug_swap(svm) || 4631 + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { 4626 4632 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); 4627 4633 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); 4628 4634 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+49
arch/x86/kvm/svm/svm.c
··· 3165 3165 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3166 3166 break; 3167 3167 } 3168 + 3169 + /* 3170 + * AMD changed the architectural behavior of bits 5:2. On CPUs 3171 + * without BusLockTrap, bits 5:2 control "external pins", but 3172 + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap 3173 + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed 3174 + * the guest to set bits 5:2 despite not actually virtualizing 3175 + * Performance-Monitoring/Breakpoint external pins. Drop bits 3176 + * 5:2 for backwards compatibility. 3177 + */ 3178 + data &= ~GENMASK(5, 2); 3179 + 3180 + /* 3181 + * Suppress BTF as KVM doesn't virtualize BTF, but there's no 3182 + * way to communicate lack of support to the guest. 3183 + */ 3184 + if (data & DEBUGCTLMSR_BTF) { 3185 + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 3186 + data &= ~DEBUGCTLMSR_BTF; 3187 + } 3188 + 3168 3189 if (data & DEBUGCTL_RESERVED_BITS) 3169 3190 return 1; 3170 3191 ··· 4210 4189 4211 4190 guest_state_enter_irqoff(); 4212 4191 4192 + /* 4193 + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of 4194 + * VMRUN controls whether or not physical IRQs are masked (KVM always 4195 + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the 4196 + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow 4197 + * into guest state if delivery of an event during VMRUN triggers a 4198 + * #VMEXIT, and the guest_state transitions already tell lockdep that 4199 + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of 4200 + * this path, so IRQs aren't actually unmasked while running host code. 
4201 + */ 4202 + raw_local_irq_enable(); 4203 + 4213 4204 amd_clear_divider(); 4214 4205 4215 4206 if (sev_es_guest(vcpu->kvm)) ··· 4229 4196 sev_es_host_save_area(sd)); 4230 4197 else 4231 4198 __svm_vcpu_run(svm, spec_ctrl_intercepted); 4199 + 4200 + raw_local_irq_disable(); 4232 4201 4233 4202 guest_state_exit_irqoff(); 4234 4203 } ··· 4288 4253 clgi(); 4289 4254 kvm_load_guest_xsave_state(vcpu); 4290 4255 4256 + /* 4257 + * Hardware only context switches DEBUGCTL if LBR virtualization is 4258 + * enabled. Manually load DEBUGCTL if necessary (and restore it after 4259 + * VM-Exit), as running with the host's DEBUGCTL can negatively affect 4260 + * guest state and can even be fatal, e.g. due to Bus Lock Detect. 4261 + */ 4262 + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4263 + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4264 + update_debugctlmsr(svm->vmcb->save.dbgctl); 4265 + 4291 4266 kvm_wait_lapic_expire(vcpu); 4292 4267 4293 4268 /* ··· 4324 4279 4325 4280 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4326 4281 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 4282 + 4283 + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4284 + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4285 + update_debugctlmsr(vcpu->arch.host_debugctl); 4327 4286 4328 4287 kvm_load_host_xsave_state(vcpu); 4329 4288 stgi();
+1 -1
arch/x86/kvm/svm/svm.h
··· 584 584 /* svm.c */ 585 585 #define MSR_INVALID 0xffffffffU 586 586 587 - #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 587 + #define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) 588 588 589 589 extern bool dump_invalid_vmcb; 590 590
+1 -9
arch/x86/kvm/svm/vmenter.S
··· 170 170 mov VCPU_RDI(%_ASM_DI), %_ASM_DI 171 171 172 172 /* Enter guest mode */ 173 - sti 174 - 175 173 3: vmrun %_ASM_AX 176 174 4: 177 - cli 178 - 179 175 /* Pop @svm to RAX while it's the only available register. */ 180 176 pop %_ASM_AX 181 177 ··· 336 340 mov KVM_VMCB_pa(%rax), %rax 337 341 338 342 /* Enter guest mode */ 339 - sti 340 - 341 343 1: vmrun %rax 342 - 343 - 2: cli 344 - 344 + 2: 345 345 /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ 346 346 FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT 347 347
+2 -6
arch/x86/kvm/vmx/vmx.c
··· 1514 1514 */ 1515 1515 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1516 1516 { 1517 - struct vcpu_vmx *vmx = to_vmx(vcpu); 1518 - 1519 1517 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1520 1518 shrink_ple_window(vcpu); 1521 1519 1522 1520 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1523 1521 1524 1522 vmx_vcpu_pi_load(vcpu, cpu); 1525 - 1526 - vmx->host_debugctlmsr = get_debugctlmsr(); 1527 1523 } 1528 1524 1529 1525 void vmx_vcpu_put(struct kvm_vcpu *vcpu) ··· 7454 7458 } 7455 7459 7456 7460 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7457 - if (vmx->host_debugctlmsr) 7458 - update_debugctlmsr(vmx->host_debugctlmsr); 7461 + if (vcpu->arch.host_debugctl) 7462 + update_debugctlmsr(vcpu->arch.host_debugctl); 7459 7463 7460 7464 #ifndef CONFIG_X86_64 7461 7465 /*
-2
arch/x86/kvm/vmx/vmx.h
··· 340 340 /* apic deadline value in host tsc */ 341 341 u64 hv_deadline_tsc; 342 342 343 - unsigned long host_debugctlmsr; 344 - 345 343 /* 346 344 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in 347 345 * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+2
arch/x86/kvm/x86.c
··· 10968 10968 set_debugreg(0, 7); 10969 10969 } 10970 10970 10971 + vcpu->arch.host_debugctl = get_debugctlmsr(); 10972 + 10971 10973 guest_timing_enter_irqoff(); 10972 10974 10973 10975 for (;;) {
+13 -8
tools/testing/selftests/kvm/mmu_stress_test.c
··· 18 18 #include "ucall_common.h" 19 19 20 20 static bool mprotect_ro_done; 21 + static bool all_vcpus_hit_ro_fault; 21 22 22 23 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) 23 24 { ··· 37 36 38 37 /* 39 38 * Write to the region while mprotect(PROT_READ) is underway. Keep 40 - * looping until the memory is guaranteed to be read-only, otherwise 41 - * vCPUs may complete their writes and advance to the next stage 42 - * prematurely. 39 + * looping until the memory is guaranteed to be read-only and a fault 40 + * has occurred, otherwise vCPUs may complete their writes and advance 41 + * to the next stage prematurely. 43 42 * 44 43 * For architectures that support skipping the faulting instruction, 45 44 * generate the store via inline assembly to ensure the exact length ··· 57 56 #else 58 57 vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa); 59 58 #endif 60 - } while (!READ_ONCE(mprotect_ro_done)); 59 + } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault)); 61 60 62 61 /* 63 62 * Only architectures that write the entire range can explicitly sync, ··· 82 81 83 82 static int nr_vcpus; 84 83 static atomic_t rendezvous; 84 + static atomic_t nr_ro_faults; 85 85 86 86 static void rendezvous_with_boss(void) 87 87 { ··· 150 148 * be stuck on the faulting instruction for other architectures. 
Go to 151 149 * stage 3 without a rendezvous 152 150 */ 153 - do { 154 - r = _vcpu_run(vcpu); 155 - } while (!r); 151 + r = _vcpu_run(vcpu); 156 152 TEST_ASSERT(r == -1 && errno == EFAULT, 157 153 "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno); 154 + 155 + atomic_inc(&nr_ro_faults); 156 + if (atomic_read(&nr_ro_faults) == nr_vcpus) { 157 + WRITE_ONCE(all_vcpus_hit_ro_fault, true); 158 + sync_global_to_guest(vm, all_vcpus_hit_ro_fault); 159 + } 158 160 159 161 #if defined(__x86_64__) || defined(__aarch64__) 160 162 /* ··· 384 378 rendezvous_with_vcpus(&time_run2, "run 2"); 385 379 386 380 mprotect(mem, slot_size, PROT_READ); 387 - usleep(10); 388 381 mprotect_ro_done = true; 389 382 sync_global_to_guest(vm, mprotect_ro_done); 390 383
+2
tools/testing/selftests/kvm/x86/nested_exceptions_test.c
··· 85 85 86 86 GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); 87 87 GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); 88 + GUEST_ASSERT(!ctrl->int_state); 88 89 } 89 90 90 91 static void l1_svm_code(struct svm_test_data *svm) ··· 123 122 GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); 124 123 GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); 125 124 GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); 125 + GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO)); 126 126 } 127 127 128 128 static void l1_vmx_code(struct vmx_pages *vmx)
+2 -1
tools/testing/selftests/kvm/x86/sev_smoke_test.c
··· 52 52 bool bad = false; 53 53 for (i = 0; i < 4095; i++) { 54 54 if (from_host[i] != from_guest[i]) { 55 - printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); 55 + printf("mismatch at %u | %02hhx %02hhx\n", 56 + i, from_host[i], from_guest[i]); 56 57 bad = true; 57 58 } 58 59 }