Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-nested-7.1' of https://github.com/kvm-x86/linux into HEAD

KVM nested SVM changes for 7.1 (with one common x86 fix)

- To minimize the probability of corrupting guest state, defer KVM's
non-architectural delivery of exception payloads (e.g. CR2 and DR6) until
consumption of the payload is imminent, and force delivery of the payload
in all paths where userspace saves relevant state.

- Use vcpu->arch.cr2 when updating vmcb12's CR2 on nested #VMEXIT to fix a
bug where L2's CR2 can get corrupted after a save/restore, e.g. if the VM
is migrated while L2 is faulting in memory.

- Fix a class of nSVM bugs where some fields written by the CPU are not
synchronized from vmcb02 to cached vmcb12 after VMRUN, and so are not
up-to-date when saved by KVM_GET_NESTED_STATE.

- Fix a class of bugs where the ordering between KVM_SET_NESTED_STATE and
KVM_SET_{S}REGS could cause vmcb02 to be incorrectly initialized after
save+restore.

- Add a variety of missing nSVM consistency checks.

- Fix several bugs where KVM failed to correctly update VMCB fields on nested
#VMEXIT.

- Fix several bugs where KVM failed to correctly synthesize #UD or #GP for
SVM-related instructions.

- Add support for save+restore of virtualized LBRs (on SVM).

- Refactor various helpers and macros to improve clarity and (hopefully) make
the code easier to maintain.

- Aggressively sanitize fields when copying from vmcb12 to guard against
unintentionally allowing L1 to utilize yet-to-be-defined features.

- Fix several bugs where KVM botched rAX legality checks when emulating SVM
instructions. Note, KVM is still flawed in that KVM doesn't handle address-size
prefix overrides for 64-bit guests; this should probably be documented as a
KVM erratum.

- Fail emulation of VMRUN/VMLOAD/VMSAVE if mapping vmcb12 fails instead of
somewhat arbitrarily synthesizing #GP (i.e. don't bastardize AMD's already-
sketchy behavior of generating #GP for "unsupported" addresses).

- Cache all used vmcb12 fields to further harden against TOCTOU bugs.

+1234 -494
+15
arch/x86/include/asm/kvm_host.h
··· 1098 1098 */ 1099 1099 bool pdptrs_from_userspace; 1100 1100 1101 + /* 1102 + * Set if an emulated nested VM-Enter to L2 is pending completion. KVM 1103 + * must not synthesize a VM-Exit to L1 before entering L2, as VM-Exits 1104 + * can only occur at instruction boundaries. The only exception is 1105 + * VMX's "notify" exits, which exist in large part to break the CPU out 1106 + * of infinite ucode loops, but can corrupt vCPU state in the process! 1107 + * 1108 + * For all intents and purposes, this is a boolean, but it's tracked as 1109 + * a u8 so that KVM can detect when userspace may have stuffed vCPU 1110 + * state and generated an architecturally-impossible VM-Exit. 1111 + */ 1112 + #define KVM_NESTED_RUN_PENDING 1 1113 + #define KVM_NESTED_RUN_PENDING_UNTRUSTED 2 1114 + u8 nested_run_pending; 1115 + 1101 1116 #if IS_ENABLED(CONFIG_HYPERV) 1102 1117 hpa_t hv_root_tdp; 1103 1118 #endif
+13 -7
arch/x86/include/asm/svm.h
··· 142 142 u64 exit_info_2; 143 143 u32 exit_int_info; 144 144 u32 exit_int_info_err; 145 - u64 nested_ctl; 145 + u64 misc_ctl; 146 146 u64 avic_vapic_bar; 147 147 u64 ghcb_gpa; 148 148 u32 event_inj; 149 149 u32 event_inj_err; 150 150 u64 nested_cr3; 151 - u64 virt_ext; 151 + u64 misc_ctl2; 152 152 u32 clean; 153 153 u32 reserved_5; 154 154 u64 next_rip; ··· 181 181 #define TLB_CONTROL_FLUSH_ALL_ASID 1 182 182 #define TLB_CONTROL_FLUSH_ASID 3 183 183 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 184 + 185 + #define TLB_CONTROL_MASK GENMASK(2, 0) 184 186 185 187 #define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0) 186 188 #define ERAP_CONTROL_CLEAR_RAP BIT(1) ··· 224 222 #define X2APIC_MODE_SHIFT 30 225 223 #define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) 226 224 227 - #define LBR_CTL_ENABLE_MASK BIT_ULL(0) 228 - #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) 225 + #define SVM_INT_VECTOR_MASK GENMASK(7, 0) 229 226 230 227 #define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0) 231 228 #define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1) ··· 240 239 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) 241 240 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) 242 241 243 - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) 244 - #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) 245 - #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) 242 + #define SVM_MISC_ENABLE_NP BIT(0) 243 + #define SVM_MISC_ENABLE_SEV BIT(1) 244 + #define SVM_MISC_ENABLE_SEV_ES BIT(2) 246 245 246 + #define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0) 247 + #define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1) 247 248 248 249 #define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL 249 250 #define SVM_TSC_RATIO_MIN 0x0000000000000001ULL ··· 638 635 639 636 #define SVM_EVTINJ_VALID (1 << 31) 640 637 #define SVM_EVTINJ_VALID_ERR (1 << 11) 638 + 639 + #define SVM_EVTINJ_RESERVED_BITS ~(SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | \ 640 + SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_VALID) 641 641 642 642 #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK 643 643 #define 
SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
+1 -2
arch/x86/kvm/emulate.c
··· 3887 3887 { 3888 3888 u64 rax = reg_read(ctxt, VCPU_REGS_RAX); 3889 3889 3890 - /* Valid physical address? */ 3891 - if (rax & 0xffff000000000000ULL) 3890 + if (!ctxt->ops->page_address_valid(ctxt, rax)) 3892 3891 return emulate_gp(ctxt, 0); 3893 3892 3894 3893 return check_svme(ctxt);
-8
arch/x86/kvm/hyperv.h
··· 305 305 { 306 306 return false; 307 307 } 308 - static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) 309 - { 310 - return false; 311 - } 312 - static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) 313 - { 314 - return false; 315 - } 316 308 static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) 317 309 { 318 310 return 0;
+2
arch/x86/kvm/kvm_emulate.h
··· 245 245 246 246 bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, 247 247 unsigned int flags); 248 + 249 + bool (*page_address_valid)(struct x86_emulate_ctxt *ctxt, gpa_t gpa); 248 250 }; 249 251 250 252 /* Type, address-of, and value of an instruction's operand. */
+8 -1
arch/x86/kvm/svm/hyperv.h
··· 41 41 return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; 42 42 } 43 43 44 + static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu) 45 + { 46 + return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 47 + nested_svm_l2_tlb_flush_enabled(vcpu) && 48 + kvm_hv_is_tlb_flush_hcall(vcpu); 49 + } 50 + 44 51 void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); 45 52 #else /* CONFIG_KVM_HYPERV */ 46 53 static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {} 47 - static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) 54 + static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu) 48 55 { 49 56 return false; 50 57 }
+363 -260
arch/x86/kvm/svm/nested.c
··· 116 116 if (!nested_npt_enabled(svm)) 117 117 return true; 118 118 119 - if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) 119 + if (!(svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE)) 120 120 return true; 121 121 122 122 return false; 123 123 } 124 124 125 - void recalc_intercepts(struct vcpu_svm *svm) 125 + void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm) 126 126 { 127 - struct vmcb_control_area *c, *h; 128 - struct vmcb_ctrl_area_cached *g; 127 + struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl; 128 + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; 129 + struct vmcb *vmcb01 = svm->vmcb01.ptr; 129 130 unsigned int i; 130 131 131 - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 132 - 133 - if (!is_guest_mode(&svm->vcpu)) 132 + if (WARN_ON_ONCE(svm->vmcb != vmcb02)) 134 133 return; 135 134 136 - c = &svm->vmcb->control; 137 - h = &svm->vmcb01.ptr->control; 138 - g = &svm->nested.ctl; 135 + vmcb_mark_dirty(vmcb02, VMCB_INTERCEPTS); 139 136 140 137 for (i = 0; i < MAX_INTERCEPT; i++) 141 - c->intercepts[i] = h->intercepts[i]; 138 + vmcb02->control.intercepts[i] = vmcb01->control.intercepts[i]; 142 139 143 - if (g->int_ctl & V_INTR_MASKING_MASK) { 140 + if (vmcb12_ctrl->int_ctl & V_INTR_MASKING_MASK) { 144 141 /* 145 142 * If L2 is active and V_INTR_MASKING is enabled in vmcb12, 146 143 * disable intercept of CR8 writes as L2's CR8 does not affect ··· 148 151 * the effective RFLAGS.IF for L1 interrupts will never be set 149 152 * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs). 
150 153 */ 151 - vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); 152 - if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)) 153 - vmcb_clr_intercept(c, INTERCEPT_VINTR); 154 + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_CR8_WRITE); 155 + if (!(vmcb01->save.rflags & X86_EFLAGS_IF)) 156 + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_VINTR); 154 157 } 155 158 156 - /* 157 - * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB 158 - * flush feature is enabled. 159 - */ 160 - if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu)) 161 - vmcb_clr_intercept(c, INTERCEPT_VMMCALL); 162 - 163 159 for (i = 0; i < MAX_INTERCEPT; i++) 164 - c->intercepts[i] |= g->intercepts[i]; 160 + vmcb02->control.intercepts[i] |= vmcb12_ctrl->intercepts[i]; 165 161 166 162 /* If SMI is not intercepted, ignore guest SMI intercept as well */ 167 163 if (!intercept_smi) 168 - vmcb_clr_intercept(c, INTERCEPT_SMI); 164 + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI); 169 165 170 166 if (nested_vmcb_needs_vls_intercept(svm)) { 171 167 /* ··· 166 176 * we must intercept these instructions to correctly 167 177 * emulate them in case L1 doesn't intercept them. 
168 178 */ 169 - vmcb_set_intercept(c, INTERCEPT_VMLOAD); 170 - vmcb_set_intercept(c, INTERCEPT_VMSAVE); 179 + vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMLOAD); 180 + vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMSAVE); 171 181 } else { 172 - WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); 182 + WARN_ON_ONCE(!(vmcb02->control.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE)); 173 183 } 174 184 } 175 185 ··· 329 339 kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); 330 340 } 331 341 332 - static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, 333 - struct vmcb_ctrl_area_cached *control) 342 + static bool nested_svm_event_inj_valid_exept(struct kvm_vcpu *vcpu, u8 vector) 343 + { 344 + /* 345 + * Vectors that do not correspond to a defined exception are invalid 346 + * (including #NMI and reserved vectors). In a best effort to define 347 + * valid exceptions based on the virtual CPU, make all exceptions always 348 + * valid except those obviously tied to a CPU feature. 349 + */ 350 + switch (vector) { 351 + case DE_VECTOR: case DB_VECTOR: case BP_VECTOR: case OF_VECTOR: 352 + case BR_VECTOR: case UD_VECTOR: case NM_VECTOR: case DF_VECTOR: 353 + case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR: 354 + case PF_VECTOR: case MF_VECTOR: case AC_VECTOR: case MC_VECTOR: 355 + case XM_VECTOR: case HV_VECTOR: case SX_VECTOR: 356 + return true; 357 + case CP_VECTOR: 358 + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 359 + case VC_VECTOR: 360 + return guest_cpu_cap_has(vcpu, X86_FEATURE_SEV_ES); 361 + } 362 + return false; 363 + } 364 + 365 + /* 366 + * According to the APM, VMRUN exits with SVM_EXIT_ERR if SVM_EVTINJ_VALID is 367 + * set and: 368 + * - The type of event_inj is not one of the defined values. 369 + * - The type is SVM_EVTINJ_TYPE_EXEPT, but the vector is not a valid exception. 
370 + */ 371 + static bool nested_svm_check_event_inj(struct kvm_vcpu *vcpu, u32 event_inj) 372 + { 373 + u32 type = event_inj & SVM_EVTINJ_TYPE_MASK; 374 + u8 vector = event_inj & SVM_EVTINJ_VEC_MASK; 375 + 376 + if (!(event_inj & SVM_EVTINJ_VALID)) 377 + return true; 378 + 379 + if (type != SVM_EVTINJ_TYPE_INTR && type != SVM_EVTINJ_TYPE_NMI && 380 + type != SVM_EVTINJ_TYPE_EXEPT && type != SVM_EVTINJ_TYPE_SOFT) 381 + return false; 382 + 383 + if (type == SVM_EVTINJ_TYPE_EXEPT && 384 + !nested_svm_event_inj_valid_exept(vcpu, vector)) 385 + return false; 386 + 387 + return true; 388 + } 389 + 390 + static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, 391 + struct vmcb_ctrl_area_cached *control) 334 392 { 335 393 if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) 336 394 return false; ··· 386 348 if (CC(control->asid == 0)) 387 349 return false; 388 350 389 - if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) 351 + if (CC((control->misc_ctl & SVM_MISC_ENABLE_NP) && 352 + !kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3))) 390 353 return false; 391 354 392 355 if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa, ··· 402 363 return false; 403 364 } 404 365 366 + if (CC(!nested_svm_check_event_inj(vcpu, control->event_inj))) 367 + return false; 368 + 405 369 return true; 406 370 } 407 371 408 372 /* Common checks that apply to both L1 and L2 state. 
*/ 409 - static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, 410 - struct vmcb_save_area_cached *save) 373 + static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, 374 + struct vmcb_save_area_cached *save) 411 375 { 412 376 if (CC(!(save->efer & EFER_SVME))) 413 377 return false; ··· 432 390 CC(!(save->cr0 & X86_CR0_PE)) || 433 391 CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3))) 434 392 return false; 393 + 394 + if (CC((save->cs.attrib & SVM_SELECTOR_L_MASK) && 395 + (save->cs.attrib & SVM_SELECTOR_DB_MASK))) 396 + return false; 435 397 } 436 398 437 399 /* Note, SVM doesn't have any additional restrictions on CR4. */ ··· 448 402 return true; 449 403 } 450 404 451 - static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) 452 - { 453 - struct vcpu_svm *svm = to_svm(vcpu); 454 - struct vmcb_save_area_cached *save = &svm->nested.save; 455 - 456 - return __nested_vmcb_check_save(vcpu, save); 457 - } 458 - 459 - static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) 460 - { 461 - struct vcpu_svm *svm = to_svm(vcpu); 462 - struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; 463 - 464 - return __nested_vmcb_check_controls(vcpu, ctl); 465 - } 466 - 467 405 int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu) 468 406 { 469 - if (!nested_vmcb_check_save(vcpu) || 470 - !nested_vmcb_check_controls(vcpu)) 407 + struct vcpu_svm *svm = to_svm(vcpu); 408 + 409 + if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || 410 + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) 471 411 return -EINVAL; 472 412 473 413 return 0; ··· 488 456 nested_svm_sanitize_intercept(vcpu, to, SKINIT); 489 457 nested_svm_sanitize_intercept(vcpu, to, RDPRU); 490 458 491 - to->iopm_base_pa = from->iopm_base_pa; 492 - to->msrpm_base_pa = from->msrpm_base_pa; 459 + /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */ 460 + to->misc_ctl = from->misc_ctl; 461 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT)) 462 + to->misc_ctl &= ~SVM_MISC_ENABLE_NP; 463 + 464 + 
to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK; 465 + to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK; 493 466 to->tsc_offset = from->tsc_offset; 494 - to->tlb_ctl = from->tlb_ctl; 467 + to->tlb_ctl = from->tlb_ctl & TLB_CONTROL_MASK; 495 468 to->erap_ctl = from->erap_ctl; 496 469 to->int_ctl = from->int_ctl; 497 - to->int_vector = from->int_vector; 498 - to->int_state = from->int_state; 470 + to->int_vector = from->int_vector & SVM_INT_VECTOR_MASK; 471 + to->int_state = from->int_state & SVM_INTERRUPT_SHADOW_MASK; 499 472 to->exit_code = from->exit_code; 500 473 to->exit_info_1 = from->exit_info_1; 501 474 to->exit_info_2 = from->exit_info_2; 502 475 to->exit_int_info = from->exit_int_info; 503 476 to->exit_int_info_err = from->exit_int_info_err; 504 - to->nested_ctl = from->nested_ctl; 505 - to->event_inj = from->event_inj; 477 + to->event_inj = from->event_inj & ~SVM_EVTINJ_RESERVED_BITS; 506 478 to->event_inj_err = from->event_inj_err; 507 479 to->next_rip = from->next_rip; 508 480 to->nested_cr3 = from->nested_cr3; 509 - to->virt_ext = from->virt_ext; 481 + to->misc_ctl2 = from->misc_ctl2; 510 482 to->pause_filter_count = from->pause_filter_count; 511 483 to->pause_filter_thresh = from->pause_filter_thresh; 512 484 513 - /* Copy asid here because nested_vmcb_check_controls will check it. 
*/ 485 + /* Copy asid here because nested_vmcb_check_controls() will check it */ 514 486 to->asid = from->asid; 515 - to->msrpm_base_pa &= ~0x0fffULL; 516 - to->iopm_base_pa &= ~0x0fffULL; 487 + to->clean = from->clean; 517 488 518 489 #ifdef CONFIG_KVM_HYPERV 519 490 /* Hyper-V extensions (Enlightened VMCB) */ 520 491 if (kvm_hv_hypercall_enabled(vcpu)) { 521 - to->clean = from->clean; 522 492 memcpy(&to->hv_enlightenments, &from->hv_enlightenments, 523 493 sizeof(to->hv_enlightenments)); 524 494 } ··· 536 502 static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, 537 503 struct vmcb_save_area *from) 538 504 { 539 - /* 540 - * Copy only fields that are validated, as we need them 541 - * to avoid TOC/TOU races. 542 - */ 543 - to->efer = from->efer; 544 - to->cr0 = from->cr0; 545 - to->cr3 = from->cr3; 546 - to->cr4 = from->cr4; 505 + to->es = from->es; 506 + to->cs = from->cs; 507 + to->ss = from->ss; 508 + to->ds = from->ds; 509 + to->gdtr = from->gdtr; 510 + to->idtr = from->idtr; 547 511 548 - to->dr6 = from->dr6; 512 + to->cpl = from->cpl; 513 + 514 + to->efer = from->efer; 515 + to->cr4 = from->cr4; 516 + to->cr3 = from->cr3; 517 + to->cr0 = from->cr0; 549 518 to->dr7 = from->dr7; 519 + to->dr6 = from->dr6; 520 + 521 + to->rflags = from->rflags; 522 + to->rip = from->rip; 523 + to->rsp = from->rsp; 524 + 525 + to->s_cet = from->s_cet; 526 + to->ssp = from->ssp; 527 + to->isst_addr = from->isst_addr; 528 + 529 + to->rax = from->rax; 530 + to->cr2 = from->cr2; 531 + 532 + svm_copy_lbrs(to, from); 550 533 } 551 534 552 535 void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, ··· 581 530 u32 mask; 582 531 svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; 583 532 svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; 533 + svm->nested.ctl.int_state = svm->vmcb->control.int_state; 584 534 585 535 /* Only a few fields of int_ctl are written by the processor. 
*/ 586 536 mask = V_IRQ_MASK | V_TPR_MASK; ··· 594 542 * int_ctl (because it was never recognized while L2 was running). 595 543 */ 596 544 if (svm_is_intercept(svm, INTERCEPT_VINTR) && 597 - !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts)) 545 + !vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_VINTR)) 598 546 mask &= ~V_IRQ_MASK; 599 547 600 548 if (nested_vgif_enabled(svm)) ··· 700 648 svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; 701 649 } 702 650 703 - static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) 651 + static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu) 704 652 { 653 + return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 654 + (to_svm(vcpu)->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR); 655 + } 656 + 657 + static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) 658 + { 659 + struct vmcb_ctrl_area_cached *control = &svm->nested.ctl; 660 + struct vmcb_save_area_cached *save = &svm->nested.save; 705 661 bool new_vmcb12 = false; 706 662 struct vmcb *vmcb01 = svm->vmcb01.ptr; 707 663 struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; ··· 725 665 svm->nested.force_msr_bitmap_recalc = true; 726 666 } 727 667 728 - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { 729 - vmcb02->save.es = vmcb12->save.es; 730 - vmcb02->save.cs = vmcb12->save.cs; 731 - vmcb02->save.ss = vmcb12->save.ss; 732 - vmcb02->save.ds = vmcb12->save.ds; 733 - vmcb02->save.cpl = vmcb12->save.cpl; 668 + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_SEG))) { 669 + vmcb02->save.es = save->es; 670 + vmcb02->save.cs = save->cs; 671 + vmcb02->save.ss = save->ss; 672 + vmcb02->save.ds = save->ds; 673 + vmcb02->save.cpl = save->cpl; 734 674 vmcb_mark_dirty(vmcb02, VMCB_SEG); 735 675 } 736 676 737 - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { 738 - vmcb02->save.gdtr = vmcb12->save.gdtr; 739 - vmcb02->save.idtr = vmcb12->save.idtr; 677 + if (unlikely(new_vmcb12 || 
vmcb12_is_dirty(control, VMCB_DT))) { 678 + vmcb02->save.gdtr = save->gdtr; 679 + vmcb02->save.idtr = save->idtr; 740 680 vmcb_mark_dirty(vmcb02, VMCB_DT); 741 681 } 742 682 743 683 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && 744 - (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) { 745 - vmcb02->save.s_cet = vmcb12->save.s_cet; 746 - vmcb02->save.isst_addr = vmcb12->save.isst_addr; 747 - vmcb02->save.ssp = vmcb12->save.ssp; 684 + (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_CET)))) { 685 + vmcb02->save.s_cet = save->s_cet; 686 + vmcb02->save.isst_addr = save->isst_addr; 687 + vmcb02->save.ssp = save->ssp; 748 688 vmcb_mark_dirty(vmcb02, VMCB_CET); 749 689 } 750 690 751 - kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); 691 + kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED); 752 692 753 693 svm_set_efer(vcpu, svm->nested.save.efer); 754 694 755 695 svm_set_cr0(vcpu, svm->nested.save.cr0); 756 696 svm_set_cr4(vcpu, svm->nested.save.cr4); 757 697 758 - svm->vcpu.arch.cr2 = vmcb12->save.cr2; 698 + svm->vcpu.arch.cr2 = save->cr2; 759 699 760 - kvm_rax_write(vcpu, vmcb12->save.rax); 761 - kvm_rsp_write(vcpu, vmcb12->save.rsp); 762 - kvm_rip_write(vcpu, vmcb12->save.rip); 700 + kvm_rax_write(vcpu, save->rax); 701 + kvm_rsp_write(vcpu, save->rsp); 702 + kvm_rip_write(vcpu, save->rip); 763 703 764 704 /* In case we don't even reach vcpu_run, the fields are not updated */ 765 - vmcb02->save.rax = vmcb12->save.rax; 766 - vmcb02->save.rsp = vmcb12->save.rsp; 767 - vmcb02->save.rip = vmcb12->save.rip; 705 + vmcb02->save.rax = save->rax; 706 + vmcb02->save.rsp = save->rsp; 707 + vmcb02->save.rip = save->rip; 768 708 769 - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { 709 + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DR))) { 770 710 vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; 771 711 svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; 772 712 vmcb_mark_dirty(vmcb02, VMCB_DR); 773 713 } 774 
714 775 - if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 776 - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { 715 + if (nested_vmcb12_has_lbrv(vcpu)) { 777 716 /* 778 717 * Reserved bits of DEBUGCTL are ignored. Be consistent with 779 718 * svm_set_msr's definition of reserved bits. 780 719 */ 781 - svm_copy_lbrs(vmcb02, vmcb12); 720 + svm_copy_lbrs(&vmcb02->save, save); 782 721 vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; 783 722 } else { 784 - svm_copy_lbrs(vmcb02, vmcb01); 723 + svm_copy_lbrs(&vmcb02->save, &vmcb01->save); 785 724 } 725 + vmcb_mark_dirty(vmcb02, VMCB_LBR); 786 726 svm_update_lbrv(&svm->vcpu); 787 727 } 788 728 ··· 810 750 return type == SVM_EVTINJ_TYPE_NMI; 811 751 } 812 752 813 - static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, 814 - unsigned long vmcb12_rip, 815 - unsigned long vmcb12_csbase) 753 + static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) 816 754 { 817 755 u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; 818 756 u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; 819 757 820 - struct kvm_vcpu *vcpu = &svm->vcpu; 821 - struct vmcb *vmcb01 = svm->vmcb01.ptr; 758 + struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl; 822 759 struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; 823 - u32 pause_count12; 824 - u32 pause_thresh12; 760 + struct vmcb *vmcb01 = svm->vmcb01.ptr; 761 + struct kvm_vcpu *vcpu = &svm->vcpu; 762 + u32 pause_count12, pause_thresh12; 825 763 826 764 nested_svm_transition_tlb_flush(vcpu); 827 765 ··· 832 774 */ 833 775 834 776 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && 835 - (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) 777 + (vmcb12_ctrl->int_ctl & V_GIF_ENABLE_MASK)) 836 778 int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); 837 779 else 838 780 int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); ··· 848 790 V_NMI_BLOCKING_MASK); 849 791 } 850 792 851 - /* Copied from vmcb01. msrpm_base can be overwritten later. 
*/ 852 - vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; 793 + /* 794 + * Copied from vmcb01. msrpm_base can be overwritten later. 795 + * 796 + * SVM_MISC_ENABLE_NP in vmcb12 is only used for consistency checks. If 797 + * L1 enables NPTs, KVM shadows L1's NPTs and uses those to run L2. If 798 + * L1 disables NPT, KVM runs L2 with the same NPTs used to run L1. For 799 + * the latter, L1 runs L2 with shadow page tables that translate L2 GVAs 800 + * to L1 GPAs, so the same NPTs can be used for L1 and L2. 801 + */ 802 + vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP; 853 803 vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; 854 804 vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; 855 805 vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); ··· 884 818 * L1 re-enters L2, the same instruction will trigger a VM-Exit and the 885 819 * entire cycle start over. 886 820 */ 887 - if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) 821 + if (vmcb02->save.rip && (svm->nested.last_bus_lock_rip == vmcb02->save.rip)) 888 822 vmcb02->control.bus_lock_counter = 1; 889 823 else 890 824 vmcb02->control.bus_lock_counter = 0; ··· 898 832 if (nested_npt_enabled(svm)) 899 833 nested_svm_init_mmu_context(vcpu); 900 834 901 - vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 902 - vcpu->arch.l1_tsc_offset, 903 - svm->nested.ctl.tsc_offset, 904 - svm->tsc_ratio_msr); 835 + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset, 836 + vmcb12_ctrl->tsc_offset, 837 + svm->tsc_ratio_msr); 905 838 906 839 vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; 907 840 ··· 909 844 nested_svm_update_tsc_ratio_msr(vcpu); 910 845 911 846 vmcb02->control.int_ctl = 912 - (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | 847 + (vmcb12_ctrl->int_ctl & int_ctl_vmcb12_bits) | 913 848 (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); 914 849 915 - vmcb02->control.int_vector = svm->nested.ctl.int_vector; 916 - 
vmcb02->control.int_state = svm->nested.ctl.int_state; 917 - vmcb02->control.event_inj = svm->nested.ctl.event_inj; 918 - vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; 850 + vmcb02->control.int_vector = vmcb12_ctrl->int_vector; 851 + vmcb02->control.int_state = vmcb12_ctrl->int_state; 852 + vmcb02->control.event_inj = vmcb12_ctrl->event_inj; 853 + vmcb02->control.event_inj_err = vmcb12_ctrl->event_inj_err; 919 854 920 855 /* 921 - * next_rip is consumed on VMRUN as the return address pushed on the 922 - * stack for injected soft exceptions/interrupts. If nrips is exposed 923 - * to L1, take it verbatim from vmcb12. If nrips is supported in 924 - * hardware but not exposed to L1, stuff the actual L2 RIP to emulate 925 - * what a nrips=0 CPU would do (L1 is responsible for advancing RIP 926 - * prior to injecting the event). 856 + * If nrips is exposed to L1, take NextRIP as-is. Otherwise, L1 857 + * advances L2's RIP before VMRUN instead of using NextRIP. KVM will 858 + * stuff the current RIP as vmcb02's NextRIP before L2 is run. After 859 + * the first run of L2 (e.g. after save+restore), NextRIP is updated by 860 + * the CPU and/or KVM and should be used regardless of L1's support. 927 861 */ 928 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) 929 - vmcb02->control.next_rip = svm->nested.ctl.next_rip; 930 - else if (boot_cpu_has(X86_FEATURE_NRIPS)) 931 - vmcb02->control.next_rip = vmcb12_rip; 862 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || 863 + !vcpu->arch.nested_run_pending) 864 + vmcb02->control.next_rip = vmcb12_ctrl->next_rip; 932 865 933 866 svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); 867 + 868 + /* 869 + * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1 870 + * doesn't have NRIPS) are initialized later, before the vCPU is run. 
871 + */ 934 872 if (is_evtinj_soft(vmcb02->control.event_inj)) { 935 873 svm->soft_int_injected = true; 936 - svm->soft_int_csbase = vmcb12_csbase; 937 - svm->soft_int_old_rip = vmcb12_rip; 938 - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) 939 - svm->soft_int_next_rip = svm->nested.ctl.next_rip; 940 - else 941 - svm->soft_int_next_rip = vmcb12_rip; 874 + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || 875 + !vcpu->arch.nested_run_pending) 876 + svm->soft_int_next_rip = vmcb12_ctrl->next_rip; 942 877 } 943 878 944 - /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ 879 + /* SVM_MISC2_ENABLE_V_LBR is controlled by svm_update_lbrv() */ 945 880 946 881 if (!nested_vmcb_needs_vls_intercept(svm)) 947 - vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 882 + vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 948 883 949 884 if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) 950 - pause_count12 = svm->nested.ctl.pause_filter_count; 885 + pause_count12 = vmcb12_ctrl->pause_filter_count; 951 886 else 952 887 pause_count12 = 0; 953 888 if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) 954 - pause_thresh12 = svm->nested.ctl.pause_filter_thresh; 889 + pause_thresh12 = vmcb12_ctrl->pause_filter_thresh; 955 890 else 956 891 pause_thresh12 = 0; 957 892 if (kvm_pause_in_guest(svm->vcpu.kvm)) { ··· 965 900 vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; 966 901 967 902 /* ... but ensure filtering is disabled if so requested. */ 968 - if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { 903 + if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) { 969 904 if (!pause_count12) 970 905 vmcb02->control.pause_filter_count = 0; 971 906 if (!pause_thresh12) ··· 982 917 * L2 is the "guest"). 
983 918 */ 984 919 if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) 985 - vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl & 920 + vmcb02->control.erap_ctl = (vmcb12_ctrl->erap_ctl & 986 921 ERAP_CONTROL_ALLOW_LARGER_RAP) | 987 922 ERAP_CONTROL_CLEAR_RAP; 988 923 ··· 990 925 * Merge guest and host intercepts - must be called with vcpu in 991 926 * guest-mode to take effect. 992 927 */ 993 - recalc_intercepts(svm); 928 + nested_vmcb02_recalc_intercepts(svm); 994 929 } 995 930 996 931 static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) ··· 1005 940 to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl; 1006 941 } 1007 942 1008 - int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, 1009 - struct vmcb *vmcb12, bool from_vmrun) 943 + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun) 1010 944 { 1011 945 struct vcpu_svm *svm = to_svm(vcpu); 946 + struct vmcb_ctrl_area_cached *control = &svm->nested.ctl; 947 + struct vmcb_save_area_cached *save = &svm->nested.save; 1012 948 int ret; 1013 949 1014 950 trace_kvm_nested_vmenter(svm->vmcb->save.rip, 1015 951 vmcb12_gpa, 1016 - vmcb12->save.rip, 1017 - vmcb12->control.int_ctl, 1018 - vmcb12->control.event_inj, 1019 - vmcb12->control.nested_ctl, 1020 - vmcb12->control.nested_cr3, 1021 - vmcb12->save.cr3, 952 + save->rip, 953 + control->int_ctl, 954 + control->event_inj, 955 + control->misc_ctl, 956 + control->nested_cr3, 957 + save->cr3, 1022 958 KVM_ISA_SVM); 1023 959 1024 - trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, 1025 - vmcb12->control.intercepts[INTERCEPT_CR] >> 16, 1026 - vmcb12->control.intercepts[INTERCEPT_EXCEPTION], 1027 - vmcb12->control.intercepts[INTERCEPT_WORD3], 1028 - vmcb12->control.intercepts[INTERCEPT_WORD4], 1029 - vmcb12->control.intercepts[INTERCEPT_WORD5]); 960 + trace_kvm_nested_intercepts(control->intercepts[INTERCEPT_CR] & 0xffff, 961 + control->intercepts[INTERCEPT_CR] >> 16, 962 + 
control->intercepts[INTERCEPT_EXCEPTION], 963 + control->intercepts[INTERCEPT_WORD3], 964 + control->intercepts[INTERCEPT_WORD4], 965 + control->intercepts[INTERCEPT_WORD5]); 1030 966 1031 967 1032 968 svm->nested.vmcb12_gpa = vmcb12_gpa; ··· 1037 971 nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); 1038 972 1039 973 svm_switch_vmcb(svm, &svm->nested.vmcb02); 1040 - nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); 1041 - nested_vmcb02_prepare_save(svm, vmcb12); 974 + nested_vmcb02_prepare_control(svm); 975 + nested_vmcb02_prepare_save(svm); 1042 976 1043 977 ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, 1044 978 nested_npt_enabled(svm), from_vmrun); ··· 1058 992 return 0; 1059 993 } 1060 994 995 + static int nested_svm_copy_vmcb12_to_cache(struct kvm_vcpu *vcpu, u64 vmcb12_gpa) 996 + { 997 + struct vcpu_svm *svm = to_svm(vcpu); 998 + struct kvm_host_map map; 999 + struct vmcb *vmcb12; 1000 + int r = 0; 1001 + 1002 + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) 1003 + return -EFAULT; 1004 + 1005 + vmcb12 = map.hva; 1006 + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); 1007 + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); 1008 + 1009 + if (nested_svm_check_cached_vmcb12(vcpu) < 0) { 1010 + vmcb12->control.exit_code = SVM_EXIT_ERR; 1011 + vmcb12->control.exit_info_1 = 0; 1012 + vmcb12->control.exit_info_2 = 0; 1013 + vmcb12->control.event_inj = 0; 1014 + vmcb12->control.event_inj_err = 0; 1015 + svm_set_gif(svm, false); 1016 + r = -EINVAL; 1017 + } 1018 + 1019 + kvm_vcpu_unmap(vcpu, &map); 1020 + return r; 1021 + } 1022 + 1061 1023 int nested_svm_vmrun(struct kvm_vcpu *vcpu) 1062 1024 { 1063 1025 struct vcpu_svm *svm = to_svm(vcpu); 1064 1026 int ret; 1065 - struct vmcb *vmcb12; 1066 - struct kvm_host_map map; 1067 1027 u64 vmcb12_gpa; 1068 1028 struct vmcb *vmcb01 = svm->vmcb01.ptr; 1069 1029 ··· 1110 1018 return ret; 1111 1019 } 1112 1020 1113 - vmcb12_gpa = svm->vmcb->save.rax; 
1114 - ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); 1115 - if (ret == -EINVAL) { 1116 - kvm_inject_gp(vcpu, 0); 1117 - return 1; 1118 - } else if (ret) { 1119 - return kvm_skip_emulated_instruction(vcpu); 1120 - } 1121 - 1122 - ret = kvm_skip_emulated_instruction(vcpu); 1123 - 1124 - vmcb12 = map.hva; 1125 - 1126 1021 if (WARN_ON_ONCE(!svm->nested.initialized)) 1127 1022 return -EINVAL; 1128 1023 1129 - nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); 1130 - nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); 1131 - 1132 - if (nested_svm_check_cached_vmcb12(vcpu) < 0) { 1133 - vmcb12->control.exit_code = SVM_EXIT_ERR; 1134 - vmcb12->control.exit_info_1 = 0; 1135 - vmcb12->control.exit_info_2 = 0; 1136 - goto out; 1024 + vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX); 1025 + if (!page_address_valid(vcpu, vmcb12_gpa)) { 1026 + kvm_inject_gp(vcpu, 0); 1027 + return 1; 1137 1028 } 1029 + 1030 + ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa); 1031 + if (ret) { 1032 + if (ret == -EFAULT) 1033 + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 1034 + 1035 + /* Advance RIP past VMRUN as part of the nested #VMEXIT. */ 1036 + return kvm_skip_emulated_instruction(vcpu); 1037 + } 1038 + 1039 + /* At this point, VMRUN is guaranteed to not fault; advance RIP. 
*/ 1040 + ret = kvm_skip_emulated_instruction(vcpu); 1138 1041 1139 1042 /* 1140 1043 * Since vmcb01 is not in use, we can use it to store some of the L1 ··· 1144 1057 if (!npt_enabled) 1145 1058 vmcb01->save.cr3 = kvm_read_cr3(vcpu); 1146 1059 1147 - svm->nested.nested_run_pending = 1; 1060 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 1148 1061 1149 - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) 1150 - goto out_exit_err; 1062 + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) || 1063 + !nested_svm_merge_msrpm(vcpu)) { 1064 + vcpu->arch.nested_run_pending = 0; 1065 + svm->nmi_l1_to_l2 = false; 1066 + svm->soft_int_injected = false; 1151 1067 1152 - if (nested_svm_merge_msrpm(vcpu)) 1153 - goto out; 1068 + svm->vmcb->control.exit_code = SVM_EXIT_ERR; 1069 + svm->vmcb->control.exit_info_1 = 0; 1070 + svm->vmcb->control.exit_info_2 = 0; 1154 1071 1155 - out_exit_err: 1156 - svm->nested.nested_run_pending = 0; 1157 - svm->nmi_l1_to_l2 = false; 1158 - svm->soft_int_injected = false; 1159 - 1160 - svm->vmcb->control.exit_code = SVM_EXIT_ERR; 1161 - svm->vmcb->control.exit_info_1 = 0; 1162 - svm->vmcb->control.exit_info_2 = 0; 1163 - 1164 - nested_svm_vmexit(svm); 1165 - 1166 - out: 1167 - kvm_vcpu_unmap(vcpu, &map); 1072 + nested_svm_vmexit(svm); 1073 + } 1168 1074 1169 1075 return ret; 1170 1076 } ··· 1187 1107 to_save->isst_addr = from_save->isst_addr; 1188 1108 to_save->ssp = from_save->ssp; 1189 1109 } 1110 + 1111 + if (kvm_cpu_cap_has(X86_FEATURE_LBRV)) { 1112 + svm_copy_lbrs(to_save, from_save); 1113 + to_save->dbgctl &= ~DEBUGCTL_RESERVED_BITS; 1114 + } 1190 1115 } 1191 1116 1192 1117 void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) ··· 1210 1125 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1211 1126 } 1212 1127 1213 - int nested_svm_vmexit(struct vcpu_svm *svm) 1128 + static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu) 1214 1129 { 1215 - struct kvm_vcpu *vcpu = &svm->vcpu; 1216 - 
struct vmcb *vmcb01 = svm->vmcb01.ptr; 1130 + struct vcpu_svm *svm = to_svm(vcpu); 1217 1131 struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; 1218 - struct vmcb *vmcb12; 1219 1132 struct kvm_host_map map; 1133 + struct vmcb *vmcb12; 1220 1134 int rc; 1221 1135 1222 1136 rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); 1223 - if (rc) { 1224 - if (rc == -EINVAL) 1225 - kvm_inject_gp(vcpu, 0); 1226 - return 1; 1227 - } 1137 + if (rc) 1138 + return rc; 1228 1139 1229 1140 vmcb12 = map.hva; 1230 - 1231 - /* Exit Guest-Mode */ 1232 - leave_guest_mode(vcpu); 1233 - svm->nested.vmcb12_gpa = 0; 1234 - WARN_ON_ONCE(svm->nested.nested_run_pending); 1235 - 1236 - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 1237 - 1238 - /* in case we halted in L2 */ 1239 - kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 1240 - 1241 - /* Give the current vmcb to the guest */ 1242 1141 1243 1142 vmcb12->save.es = vmcb02->save.es; 1244 1143 vmcb12->save.cs = vmcb02->save.cs; ··· 1233 1164 vmcb12->save.efer = svm->vcpu.arch.efer; 1234 1165 vmcb12->save.cr0 = kvm_read_cr0(vcpu); 1235 1166 vmcb12->save.cr3 = kvm_read_cr3(vcpu); 1236 - vmcb12->save.cr2 = vmcb02->save.cr2; 1167 + vmcb12->save.cr2 = vcpu->arch.cr2; 1237 1168 vmcb12->save.cr4 = svm->vcpu.arch.cr4; 1238 1169 vmcb12->save.rflags = kvm_get_rflags(vcpu); 1239 1170 vmcb12->save.rip = kvm_rip_read(vcpu); ··· 1260 1191 if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) 1261 1192 vmcb12->control.next_rip = vmcb02->control.next_rip; 1262 1193 1194 + if (nested_vmcb12_has_lbrv(vcpu)) 1195 + svm_copy_lbrs(&vmcb12->save, &vmcb02->save); 1196 + 1197 + vmcb12->control.event_inj = 0; 1198 + vmcb12->control.event_inj_err = 0; 1263 1199 vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; 1264 - vmcb12->control.event_inj = svm->nested.ctl.event_inj; 1265 - vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; 1200 + 1201 + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, 1202 + vmcb12->control.exit_info_1, 1203 + 
vmcb12->control.exit_info_2, 1204 + vmcb12->control.exit_int_info, 1205 + vmcb12->control.exit_int_info_err, 1206 + KVM_ISA_SVM); 1207 + 1208 + kvm_vcpu_unmap(vcpu, &map); 1209 + return 0; 1210 + } 1211 + 1212 + void nested_svm_vmexit(struct vcpu_svm *svm) 1213 + { 1214 + struct kvm_vcpu *vcpu = &svm->vcpu; 1215 + struct vmcb *vmcb01 = svm->vmcb01.ptr; 1216 + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; 1217 + 1218 + if (nested_svm_vmexit_update_vmcb12(vcpu)) 1219 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 1220 + 1221 + /* Exit Guest-Mode */ 1222 + leave_guest_mode(vcpu); 1223 + svm->nested.vmcb12_gpa = 0; 1224 + 1225 + kvm_warn_on_nested_run_pending(vcpu); 1226 + 1227 + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 1228 + 1229 + /* in case we halted in L2 */ 1230 + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 1266 1231 1267 1232 if (!kvm_pause_in_guest(vcpu->kvm)) { 1268 1233 vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; ··· 1305 1202 } 1306 1203 1307 1204 /* 1308 - * Invalidate bus_lock_rip unless KVM is still waiting for the guest 1309 - * to make forward progress before re-enabling bus lock detection. 1205 + * Invalidate last_bus_lock_rip unless KVM is still waiting for the 1206 + * guest to make forward progress before re-enabling bus lock detection. 
1310 1207 */ 1311 1208 if (!vmcb02->control.bus_lock_counter) 1312 - svm->nested.ctl.bus_lock_rip = INVALID_GPA; 1209 + svm->nested.last_bus_lock_rip = INVALID_GPA; 1313 1210 1314 1211 nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); 1315 1212 ··· 1342 1239 if (!nested_exit_on_intr(svm)) 1343 1240 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 1344 1241 1345 - if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 1346 - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) 1347 - svm_copy_lbrs(vmcb12, vmcb02); 1348 - else 1349 - svm_copy_lbrs(vmcb01, vmcb02); 1242 + if (!nested_vmcb12_has_lbrv(vcpu)) { 1243 + svm_copy_lbrs(&vmcb01->save, &vmcb02->save); 1244 + vmcb_mark_dirty(vmcb01, VMCB_LBR); 1245 + } 1350 1246 1351 1247 svm_update_lbrv(vcpu); 1352 1248 ··· 1398 1296 svm->vcpu.arch.dr7 = DR7_FIXED_1; 1399 1297 kvm_update_dr7(&svm->vcpu); 1400 1298 1401 - trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, 1402 - vmcb12->control.exit_info_1, 1403 - vmcb12->control.exit_info_2, 1404 - vmcb12->control.exit_int_info, 1405 - vmcb12->control.exit_int_info_err, 1406 - KVM_ISA_SVM); 1407 - 1408 - kvm_vcpu_unmap(vcpu, &map); 1409 - 1410 1299 nested_svm_transition_tlb_flush(vcpu); 1411 1300 1412 1301 nested_svm_uninit_mmu_context(vcpu); 1413 1302 1414 - rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); 1415 - if (rc) 1416 - return 1; 1303 + if (nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true)) 1304 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 1305 + 1306 + /* Drop tracking for L1->L2 injected NMIs and soft IRQs */ 1307 + svm->nmi_l1_to_l2 = false; 1308 + svm->soft_int_injected = false; 1417 1309 1418 1310 /* 1419 1311 * Drop what we picked up for L2 via svm_complete_interrupts() so it ··· 1432 1336 */ 1433 1337 if (kvm_apicv_activated(vcpu->kvm)) 1434 1338 __kvm_vcpu_update_apicv(vcpu); 1435 - 1436 - return 0; 1437 1339 } 1438 1340 1439 1341 static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) ··· 1501 1407 struct 
vcpu_svm *svm = to_svm(vcpu); 1502 1408 1503 1409 if (is_guest_mode(vcpu)) { 1504 - svm->nested.nested_run_pending = 0; 1410 + vcpu->arch.nested_run_pending = 0; 1505 1411 svm->nested.vmcb12_gpa = INVALID_GPA; 1506 1412 1507 1413 leave_guest_mode(vcpu); ··· 1686 1592 * previously injected event, the pending exception occurred while said 1687 1593 * event was being delivered and thus needs to be handled. 1688 1594 */ 1689 - bool block_nested_exceptions = svm->nested.nested_run_pending; 1595 + bool block_nested_exceptions = vcpu->arch.nested_run_pending; 1690 1596 /* 1691 1597 * New events (not exceptions) are only recognized at instruction 1692 1598 * boundaries. If an event needs reinjection, then KVM is handling a ··· 1776 1682 } 1777 1683 case SVM_EXIT_VMMCALL: 1778 1684 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 1779 - if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 1780 - nested_svm_l2_tlb_flush_enabled(vcpu) && 1781 - kvm_hv_is_tlb_flush_hcall(vcpu)) 1685 + if (nested_svm_is_l2_tlb_flush_hcall(vcpu)) 1782 1686 return NESTED_EXIT_HOST; 1783 1687 break; 1784 1688 default: ··· 1821 1729 dst->exit_info_2 = from->exit_info_2; 1822 1730 dst->exit_int_info = from->exit_int_info; 1823 1731 dst->exit_int_info_err = from->exit_int_info_err; 1824 - dst->nested_ctl = from->nested_ctl; 1732 + dst->misc_ctl = from->misc_ctl; 1825 1733 dst->event_inj = from->event_inj; 1826 1734 dst->event_inj_err = from->event_inj_err; 1827 1735 dst->next_rip = from->next_rip; 1828 - dst->nested_cr3 = from->nested_cr3; 1829 - dst->virt_ext = from->virt_ext; 1736 + dst->nested_cr3 = from->nested_cr3; 1737 + dst->misc_ctl2 = from->misc_ctl2; 1830 1738 dst->pause_filter_count = from->pause_filter_count; 1831 1739 dst->pause_filter_thresh = from->pause_filter_thresh; 1832 1740 /* 'clean' and 'hv_enlightenments' are not changed by KVM */ ··· 1861 1769 kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; 1862 1770 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 1863 1771 1864 - if 
(svm->nested.nested_run_pending) 1772 + if (vcpu->arch.nested_run_pending) 1865 1773 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 1866 1774 } 1867 1775 ··· 1961 1869 1962 1870 ret = -EINVAL; 1963 1871 __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); 1964 - if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) 1872 + if (!nested_vmcb_check_controls(vcpu, &ctl_cached)) 1965 1873 goto out_free; 1966 1874 1967 1875 /* 1968 1876 * Processor state contains L2 state. Check that it is 1969 - * valid for guest mode (see nested_vmcb_check_save). 1877 + * valid for guest mode (see nested_vmcb_check_save()). 1970 1878 */ 1971 1879 cr0 = kvm_read_cr0(vcpu); 1972 1880 if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) ··· 1980 1888 if (!(save->cr0 & X86_CR0_PG) || 1981 1889 !(save->cr0 & X86_CR0_PE) || 1982 1890 (save->rflags & X86_EFLAGS_VM) || 1983 - !__nested_vmcb_check_save(vcpu, &save_cached)) 1891 + !nested_vmcb_check_save(vcpu, &save_cached)) 1984 1892 goto out_free; 1985 1893 1986 1894 ··· 1998 1906 1999 1907 svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); 2000 1908 2001 - svm->nested.nested_run_pending = 2002 - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 1909 + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 1910 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 1911 + else 1912 + vcpu->arch.nested_run_pending = 0; 2003 1913 2004 1914 svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; 2005 1915 ··· 2009 1915 nested_copy_vmcb_control_to_cache(svm, ctl); 2010 1916 2011 1917 svm_switch_vmcb(svm, &svm->nested.vmcb02); 2012 - nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); 1918 + nested_vmcb02_prepare_control(svm); 1919 + 1920 + /* 1921 + * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields 1922 + * dirty in vmcb01 instead of vmcb02, so mark all of vmcb02 dirty here. 
1923 + */ 1924 + vmcb_mark_all_dirty(svm->vmcb); 2013 1925 2014 1926 /* 2015 1927 * While the nested guest CR3 is already checked and set by ··· 2029 1929 goto out_free; 2030 1930 2031 1931 svm->nested.force_msr_bitmap_recalc = true; 1932 + 1933 + if (kvm_vcpu_apicv_active(vcpu)) 1934 + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 2032 1935 2033 1936 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 2034 1937 ret = 0;
+3 -3
arch/x86/kvm/svm/sev.c
··· 4591 4591 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4592 4592 struct vmcb *vmcb = svm->vmcb01.ptr; 4593 4593 4594 - svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; 4594 + svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES; 4595 4595 4596 4596 /* 4597 4597 * An SEV-ES guest requires a VMSA area that is a separate from the ··· 4631 4631 if (!sev_vcpu_has_debug_swap(svm)) { 4632 4632 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 4633 4633 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); 4634 - recalc_intercepts(svm); 4634 + svm_mark_intercepts_dirty(svm); 4635 4635 } else { 4636 4636 /* 4637 4637 * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't ··· 4662 4662 { 4663 4663 struct kvm_vcpu *vcpu = &svm->vcpu; 4664 4664 4665 - svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; 4665 + svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV; 4666 4666 clr_exception_intercept(svm, UD_VECTOR); 4667 4667 4668 4668 /*
+236 -117
arch/x86/kvm/svm/svm.c
··· 52 52 #include "svm.h" 53 53 #include "svm_ops.h" 54 54 55 + #include "hyperv.h" 55 56 #include "kvm_onhyperv.h" 56 57 #include "svm_onhyperv.h" 57 58 ··· 217 216 218 217 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { 219 218 if (!(efer & EFER_SVME)) { 219 + /* 220 + * Architecturally, clearing EFER.SVME while a guest is 221 + * running yields undefined behavior, i.e. KVM can do 222 + * literally anything. Force the vCPU back into L1 as 223 + * that is the safest option for KVM, but synthesize a 224 + * triple fault (for L1!) so that KVM at least doesn't 225 + * run random L2 code in the context of L1. Do so if 226 + * and only if the vCPU is actively running, e.g. to 227 + * avoid false positives if userspace is stuffing state. 228 + */ 229 + if (is_guest_mode(vcpu) && vcpu->wants_to_run) 230 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 231 + 220 232 svm_leave_nested(vcpu); 221 233 /* #GP intercept is still needed for vmware backdoor */ 222 234 if (!enable_vmware_backdoor) ··· 258 244 if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) 259 245 set_exception_intercept(svm, GP_VECTOR); 260 246 } 247 + 248 + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); 261 249 } 262 250 263 251 svm->vmcb->save.efer = efer | EFER_SVME; ··· 651 635 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 652 636 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); 653 637 654 - recalc_intercepts(svm); 638 + svm_mark_intercepts_dirty(svm); 655 639 } 656 640 657 641 static void clr_dr_intercepts(struct vcpu_svm *svm) ··· 660 644 661 645 vmcb->control.intercepts[INTERCEPT_DR] = 0; 662 646 663 - recalc_intercepts(svm); 647 + svm_mark_intercepts_dirty(svm); 664 648 } 665 649 666 650 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) ··· 726 710 static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) 727 711 { 728 712 struct vcpu_svm *svm = to_svm(vcpu); 729 - bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); 713 + bool intercept 
= !(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR); 730 714 731 715 if (intercept == svm->lbr_msrs_intercepted) 732 716 return; ··· 857 841 */ 858 842 } 859 843 860 - void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) 861 - { 862 - to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; 863 - to_vmcb->save.br_from = from_vmcb->save.br_from; 864 - to_vmcb->save.br_to = from_vmcb->save.br_to; 865 - to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; 866 - to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; 867 - 868 - vmcb_mark_dirty(to_vmcb, VMCB_LBR); 869 - } 870 - 871 844 static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) 872 845 { 873 - to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 846 + to_svm(vcpu)->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_LBR; 874 847 } 875 848 876 849 void svm_enable_lbrv(struct kvm_vcpu *vcpu) ··· 871 866 static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) 872 867 { 873 868 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 874 - to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 869 + to_svm(vcpu)->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR; 875 870 } 876 871 877 872 void svm_update_lbrv(struct kvm_vcpu *vcpu) 878 873 { 879 874 struct vcpu_svm *svm = to_svm(vcpu); 880 - bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; 875 + bool current_enable_lbrv = svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR; 881 876 bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || 882 877 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 883 - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); 878 + (svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR)); 884 879 885 880 if (enable_lbrv && !current_enable_lbrv) 886 881 __svm_enable_lbrv(vcpu); ··· 1014 1009 preempt_enable(); 1015 1010 } 1016 1011 1012 + static bool svm_has_pending_gif_event(struct vcpu_svm *svm) 1013 + { 1014 + return svm->vcpu.arch.smi_pending || 1015 + 
svm->vcpu.arch.nmi_pending || 1016 + kvm_cpu_has_injectable_intr(&svm->vcpu) || 1017 + kvm_apic_has_pending_init_or_sipi(&svm->vcpu); 1018 + } 1019 + 1017 1020 /* Evaluate instruction intercepts that depend on guest CPUID features. */ 1018 1021 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 1019 1022 { ··· 1047 1034 } 1048 1035 1049 1036 /* 1050 - * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is 1051 - * always set if vls is enabled. If the intercepts are set, the bit is 1052 - * meaningless anyway. 1037 + * Intercept instructions that #UD if EFER.SVME=0, as SVME must be set 1038 + * even when running the guest, i.e. hardware will only ever see 1039 + * EFER.SVME=1. 1040 + * 1041 + * No need to toggle any of the vgif/vls/etc. enable bits here, as they 1042 + * are set when the VMCB is initialized and never cleared (if the 1043 + * relevant intercepts are set, the enablements are meaningless anyway). 1044 + * 1045 + * FIXME: When #GP is not intercepted, a #GP on these instructions (e.g. 1046 + * due to CPL > 0) could be injected by hardware before the instruction 1047 + * is intercepted, leading to #GP taking precedence over #UD from the 1048 + * guest's perspective. 1053 1049 */ 1054 - if (guest_cpuid_is_intel_compatible(vcpu)) { 1050 + if (!(vcpu->arch.efer & EFER_SVME)) { 1055 1051 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1056 1052 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1053 + svm_set_intercept(svm, INTERCEPT_CLGI); 1054 + svm_set_intercept(svm, INTERCEPT_STGI); 1057 1055 } else { 1058 1056 /* 1059 1057 * If hardware supports Virtual VMLOAD VMSAVE then enable it 1060 1058 * in VMCB and clear intercepts to avoid #VMEXIT. 
1061 1059 */ 1062 - if (vls) { 1060 + if (guest_cpuid_is_intel_compatible(vcpu)) { 1061 + svm_set_intercept(svm, INTERCEPT_VMLOAD); 1062 + svm_set_intercept(svm, INTERCEPT_VMSAVE); 1063 + } else if (vls) { 1063 1064 svm_clr_intercept(svm, INTERCEPT_VMLOAD); 1064 1065 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1066 + } 1067 + 1068 + /* 1069 + * Process pending events when clearing STGI/CLGI intercepts if 1070 + * there's at least one pending event that is masked by GIF, so 1071 + * that KVM re-evaluates if the intercept needs to be set again 1072 + * to track when GIF is re-enabled (e.g. for NMI injection). 1073 + */ 1074 + if (vgif) { 1075 + svm_clr_intercept(svm, INTERCEPT_CLGI); 1076 + svm_clr_intercept(svm, INTERCEPT_STGI); 1077 + 1078 + if (svm_has_pending_gif_event(svm)) 1079 + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 1065 1080 } 1066 1081 } 1067 1082 ··· 1203 1162 1204 1163 if (npt_enabled) { 1205 1164 /* Setup VMCB for Nested Paging */ 1206 - control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 1165 + control->misc_ctl |= SVM_MISC_ENABLE_NP; 1207 1166 svm_clr_intercept(svm, INTERCEPT_INVLPG); 1208 1167 clr_exception_intercept(svm, PF_VECTOR); 1209 1168 svm_clr_intercept(svm, INTERCEPT_CR3_READ); ··· 1235 1194 if (vnmi) 1236 1195 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; 1237 1196 1238 - if (vgif) { 1239 - svm_clr_intercept(svm, INTERCEPT_STGI); 1240 - svm_clr_intercept(svm, INTERCEPT_CLGI); 1197 + if (vgif) 1241 1198 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1242 - } 1243 1199 1244 1200 if (vls) 1245 - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1201 + svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 1246 1202 1247 1203 if (vcpu->kvm->arch.bus_lock_detection_enabled) 1248 1204 svm_set_intercept(svm, INTERCEPT_BUSLOCK); ··· 2189 2151 2190 2152 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2191 2153 { 2154 + u64 vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX); 2192 2155 struct 
vcpu_svm *svm = to_svm(vcpu); 2193 2156 struct vmcb *vmcb12; 2194 2157 struct kvm_host_map map; ··· 2198 2159 if (nested_svm_check_permissions(vcpu)) 2199 2160 return 1; 2200 2161 2201 - ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2202 - if (ret) { 2203 - if (ret == -EINVAL) 2204 - kvm_inject_gp(vcpu, 0); 2162 + if (!page_address_valid(vcpu, vmcb12_gpa)) { 2163 + kvm_inject_gp(vcpu, 0); 2205 2164 return 1; 2206 2165 } 2166 + 2167 + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) 2168 + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 2207 2169 2208 2170 vmcb12 = map.hva; 2209 2171 ··· 2242 2202 return nested_svm_vmrun(vcpu); 2243 2203 } 2244 2204 2245 - enum { 2246 - NONE_SVM_INSTR, 2247 - SVM_INSTR_VMRUN, 2248 - SVM_INSTR_VMLOAD, 2249 - SVM_INSTR_VMSAVE, 2250 - }; 2251 - 2252 - /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2253 - static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2205 + /* Return 0 if not SVM instr, otherwise return associated exit_code */ 2206 + static u64 svm_get_decoded_instr_exit_code(struct kvm_vcpu *vcpu) 2254 2207 { 2255 2208 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2256 2209 2257 2210 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2258 - return NONE_SVM_INSTR; 2211 + return 0; 2212 + 2213 + BUILD_BUG_ON(!SVM_EXIT_VMRUN || !SVM_EXIT_VMLOAD || !SVM_EXIT_VMSAVE); 2259 2214 2260 2215 switch (ctxt->modrm) { 2261 2216 case 0xd8: /* VMRUN */ 2262 - return SVM_INSTR_VMRUN; 2217 + return SVM_EXIT_VMRUN; 2263 2218 case 0xda: /* VMLOAD */ 2264 - return SVM_INSTR_VMLOAD; 2219 + return SVM_EXIT_VMLOAD; 2265 2220 case 0xdb: /* VMSAVE */ 2266 - return SVM_INSTR_VMSAVE; 2221 + return SVM_EXIT_VMSAVE; 2267 2222 default: 2268 2223 break; 2269 2224 } 2270 2225 2271 - return NONE_SVM_INSTR; 2272 - } 2273 - 2274 - static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2275 - { 2276 - const int guest_mode_exit_codes[] = { 2277 - [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 
2278 - [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2279 - [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2280 - }; 2281 - int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2282 - [SVM_INSTR_VMRUN] = vmrun_interception, 2283 - [SVM_INSTR_VMLOAD] = vmload_interception, 2284 - [SVM_INSTR_VMSAVE] = vmsave_interception, 2285 - }; 2286 - struct vcpu_svm *svm = to_svm(vcpu); 2287 - int ret; 2288 - 2289 - if (is_guest_mode(vcpu)) { 2290 - /* Returns '1' or -errno on failure, '0' on success. */ 2291 - ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2292 - if (ret) 2293 - return ret; 2294 - return 1; 2295 - } 2296 - return svm_instr_handlers[opcode](vcpu); 2226 + return 0; 2297 2227 } 2298 2228 2299 2229 /* ··· 2278 2268 { 2279 2269 struct vcpu_svm *svm = to_svm(vcpu); 2280 2270 u32 error_code = svm->vmcb->control.exit_info_1; 2281 - int opcode; 2271 + u64 svm_exit_code; 2282 2272 2283 2273 /* Both #GP cases have zero error_code */ 2284 2274 if (error_code) ··· 2288 2278 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2289 2279 goto reinject; 2290 2280 2291 - opcode = svm_instr_opcode(vcpu); 2281 + /* FIXME: Handle SVM instructions through the emulator */ 2282 + svm_exit_code = svm_get_decoded_instr_exit_code(vcpu); 2283 + if (svm_exit_code) { 2284 + if (!is_guest_mode(vcpu)) 2285 + return svm_invoke_exit_handler(vcpu, svm_exit_code); 2292 2286 2293 - if (opcode == NONE_SVM_INSTR) { 2294 - if (!enable_vmware_backdoor) 2287 + if (nested_svm_check_permissions(vcpu)) 2288 + return 1; 2289 + 2290 + if (!page_address_valid(vcpu, kvm_register_read(vcpu, VCPU_REGS_RAX))) 2295 2291 goto reinject; 2296 2292 2297 2293 /* 2298 - * VMware backdoor emulation on #GP interception only handles 2299 - * IN{S}, OUT{S}, and RDPMC. 2294 + * FIXME: Only synthesize a #VMEXIT if L1 sets the intercept, 2295 + * but only after the VMLOAD/VMSAVE exit handlers can properly 2296 + * handle VMLOAD/VMSAVE from L2 with VLS enabled in L1 (i.e. 
2297 + * RAX is an L2 GPA that needs translation through L1's NPT). 2300 2298 */ 2301 - if (!is_guest_mode(vcpu)) 2302 - return kvm_emulate_instruction(vcpu, 2303 - EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2304 - } else { 2305 - /* All SVM instructions expect page aligned RAX */ 2306 - if (svm->vmcb->save.rax & ~PAGE_MASK) 2307 - goto reinject; 2308 - 2309 - return emulate_svm_instr(vcpu, opcode); 2299 + nested_svm_simple_vmexit(svm, svm_exit_code); 2300 + return 1; 2310 2301 } 2302 + 2303 + /* 2304 + * VMware backdoor emulation on #GP interception only handles 2305 + * IN{S}, OUT{S}, and RDPMC, and only for L1. 2306 + */ 2307 + if (!enable_vmware_backdoor || is_guest_mode(vcpu)) 2308 + goto reinject; 2309 + 2310 + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2311 2311 2312 2312 reinject: 2313 2313 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); ··· 2339 2319 svm_clear_vintr(svm); 2340 2320 2341 2321 enable_gif(svm); 2342 - if (svm->vcpu.arch.smi_pending || 2343 - svm->vcpu.arch.nmi_pending || 2344 - kvm_cpu_has_injectable_intr(&svm->vcpu) || 2345 - kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) 2322 + if (svm_has_pending_gif_event(svm)) 2346 2323 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2347 2324 } else { 2348 2325 disable_gif(svm); ··· 2382 2365 { 2383 2366 gva_t gva = kvm_rax_read(vcpu); 2384 2367 u32 asid = kvm_rcx_read(vcpu); 2368 + 2369 + if (nested_svm_check_permissions(vcpu)) 2370 + return 1; 2385 2371 2386 2372 /* FIXME: Handle an address size prefix. 
*/ 2387 2373 if (!is_long_mode(vcpu)) ··· 2743 2723 return 0; 2744 2724 } 2745 2725 2726 + static u64 *svm_vmcb_lbr(struct vcpu_svm *svm, u32 msr) 2727 + { 2728 + switch (msr) { 2729 + case MSR_IA32_LASTBRANCHFROMIP: 2730 + return &svm->vmcb->save.br_from; 2731 + case MSR_IA32_LASTBRANCHTOIP: 2732 + return &svm->vmcb->save.br_to; 2733 + case MSR_IA32_LASTINTFROMIP: 2734 + return &svm->vmcb->save.last_excp_from; 2735 + case MSR_IA32_LASTINTTOIP: 2736 + return &svm->vmcb->save.last_excp_to; 2737 + default: 2738 + break; 2739 + } 2740 + KVM_BUG_ON(1, svm->vcpu.kvm); 2741 + return &svm->vmcb->save.br_from; 2742 + } 2743 + 2746 2744 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2747 2745 struct msr_data *msr_info) 2748 2746 { ··· 2834 2796 msr_info->data = svm->tsc_aux; 2835 2797 break; 2836 2798 case MSR_IA32_DEBUGCTLMSR: 2837 - msr_info->data = svm->vmcb->save.dbgctl; 2799 + msr_info->data = lbrv ? svm->vmcb->save.dbgctl : 0; 2838 2800 break; 2839 2801 case MSR_IA32_LASTBRANCHFROMIP: 2840 - msr_info->data = svm->vmcb->save.br_from; 2841 - break; 2842 2802 case MSR_IA32_LASTBRANCHTOIP: 2843 - msr_info->data = svm->vmcb->save.br_to; 2844 - break; 2845 2803 case MSR_IA32_LASTINTFROMIP: 2846 - msr_info->data = svm->vmcb->save.last_excp_from; 2847 - break; 2848 2804 case MSR_IA32_LASTINTTOIP: 2849 - msr_info->data = svm->vmcb->save.last_excp_to; 2805 + msr_info->data = lbrv ? 
*svm_vmcb_lbr(svm, msr_info->index) : 0; 2850 2806 break; 2851 2807 case MSR_VM_HSAVE_PA: 2852 2808 msr_info->data = svm->nested.hsave_msr; ··· 3115 3083 vmcb_mark_dirty(svm->vmcb, VMCB_LBR); 3116 3084 svm_update_lbrv(vcpu); 3117 3085 break; 3086 + case MSR_IA32_LASTBRANCHFROMIP: 3087 + case MSR_IA32_LASTBRANCHTOIP: 3088 + case MSR_IA32_LASTINTFROMIP: 3089 + case MSR_IA32_LASTINTTOIP: 3090 + if (!lbrv) 3091 + return KVM_MSR_RET_UNSUPPORTED; 3092 + if (!msr->host_initiated) 3093 + return 1; 3094 + *svm_vmcb_lbr(svm, ecx) = data; 3095 + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); 3096 + break; 3118 3097 case MSR_VM_HSAVE_PA: 3119 3098 /* 3120 3099 * Old kernels did not validate the value written to ··· 3267 3224 vcpu->arch.complete_userspace_io = complete_userspace_buslock; 3268 3225 3269 3226 if (is_guest_mode(vcpu)) 3270 - svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; 3227 + svm->nested.last_bus_lock_rip = vcpu->arch.cui_linear_rip; 3271 3228 3272 3229 return 0; 3230 + } 3231 + 3232 + static int vmmcall_interception(struct kvm_vcpu *vcpu) 3233 + { 3234 + /* 3235 + * Inject a #UD if L2 is active and the VMMCALL isn't a Hyper-V TLB 3236 + * hypercall, as VMMCALL #UDs if it's not intercepted, and this path is 3237 + * reachable if and only if L1 doesn't want to intercept VMMCALL or has 3238 + * enabled L0 (KVM) handling of Hyper-V L2 TLB flush hypercalls. 
3239 + */ 3240 + if (is_guest_mode(vcpu) && !nested_svm_is_l2_tlb_flush_hcall(vcpu)) { 3241 + kvm_queue_exception(vcpu, UD_VECTOR); 3242 + return 1; 3243 + } 3244 + 3245 + return kvm_emulate_hypercall(vcpu); 3273 3246 } 3274 3247 3275 3248 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { ··· 3338 3279 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3339 3280 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3340 3281 [SVM_EXIT_VMRUN] = vmrun_interception, 3341 - [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3282 + [SVM_EXIT_VMMCALL] = vmmcall_interception, 3342 3283 [SVM_EXIT_VMLOAD] = vmload_interception, 3343 3284 [SVM_EXIT_VMSAVE] = vmsave_interception, 3344 3285 [SVM_EXIT_STGI] = stgi_interception, ··· 3413 3354 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3414 3355 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3415 3356 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3416 - pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3357 + pr_err("%-20s%lld\n", "misc_ctl:", control->misc_ctl); 3417 3358 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3418 3359 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3419 3360 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3420 3361 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3421 3362 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3422 - pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3363 + pr_err("%-20s%lld\n", "misc_ctl2:", control->misc_ctl2); 3423 3364 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3424 3365 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3425 3366 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); ··· 3697 3638 return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); 3698 3639 } 3699 3640 3641 + static void svm_set_nested_run_soft_int_state(struct kvm_vcpu *vcpu) 3642 + { 
3643 + struct vcpu_svm *svm = to_svm(vcpu); 3644 + 3645 + svm->soft_int_csbase = svm->vmcb->save.cs.base; 3646 + svm->soft_int_old_rip = kvm_rip_read(vcpu); 3647 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) 3648 + svm->soft_int_next_rip = kvm_rip_read(vcpu); 3649 + } 3650 + 3700 3651 static int pre_svm_run(struct kvm_vcpu *vcpu) 3701 3652 { 3702 3653 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); ··· 3806 3737 ++vcpu->stat.irq_injections; 3807 3738 3808 3739 svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; 3740 + } 3741 + 3742 + static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu) 3743 + { 3744 + struct vcpu_svm *svm = to_svm(vcpu); 3745 + 3746 + if (!is_guest_mode(vcpu) || !vcpu->arch.nested_run_pending) 3747 + return; 3748 + 3749 + /* 3750 + * If nrips is supported in hardware but not exposed to L1, stuff the 3751 + * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is 3752 + * responsible for advancing RIP prior to injecting the event). Once L2 3753 + * runs after L1 executes VMRUN, NextRIP is updated by the CPU and/or 3754 + * KVM, and this is no longer needed. 3755 + * 3756 + * This is done here (as opposed to when preparing vmcb02) to use the 3757 + * most up-to-date value of RIP regardless of the order of restoring 3758 + * registers and nested state in the vCPU save+restore path. 3759 + */ 3760 + if (boot_cpu_has(X86_FEATURE_NRIPS) && 3761 + !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) 3762 + svm->vmcb->control.next_rip = kvm_rip_read(vcpu); 3763 + 3764 + /* 3765 + * Similarly, initialize the soft int metadata here to use the most 3766 + * up-to-date values of RIP and CS base, regardless of restore order. 
3767 + */ 3768 + if (svm->soft_int_injected) 3769 + svm_set_nested_run_soft_int_state(vcpu); 3809 3770 } 3810 3771 3811 3772 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, ··· 3960 3861 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3961 3862 { 3962 3863 struct vcpu_svm *svm = to_svm(vcpu); 3963 - if (svm->nested.nested_run_pending) 3864 + if (vcpu->arch.nested_run_pending) 3964 3865 return -EBUSY; 3965 3866 3966 3867 if (svm_nmi_blocked(vcpu)) ··· 4002 3903 { 4003 3904 struct vcpu_svm *svm = to_svm(vcpu); 4004 3905 4005 - if (svm->nested.nested_run_pending) 3906 + if (vcpu->arch.nested_run_pending) 4006 3907 return -EBUSY; 4007 3908 4008 3909 if (svm_interrupt_blocked(vcpu)) ··· 4204 4105 bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); 4205 4106 bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); 4206 4107 struct vcpu_svm *svm = to_svm(vcpu); 4108 + 4109 + /* 4110 + * Initialize the soft int fields *before* reading them below if KVM 4111 + * aborted entry to the guest with a nested VMRUN pending. To ensure 4112 + * KVM uses up-to-date values for RIP and CS base across save/restore, 4113 + * regardless of restore order, KVM waits to set the soft int fields 4114 + * until VMRUN is imminent. But when canceling injection, KVM requeues 4115 + * the soft int and will reinject it via the standard injection flow, 4116 + * and so KVM needs to grab the state from the pending nested VMRUN. 
4117 + */ 4118 + if (is_guest_mode(vcpu) && vcpu->arch.nested_run_pending) 4119 + svm_set_nested_run_soft_int_state(vcpu); 4207 4120 4208 4121 /* 4209 4122 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's ··· 4446 4335 kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS)) 4447 4336 svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; 4448 4337 4338 + svm_fixup_nested_rips(vcpu); 4339 + 4449 4340 svm_hv_update_vp_id(svm->vmcb, vcpu); 4450 4341 4451 4342 /* ··· 4468 4355 * VM-Exit), as running with the host's DEBUGCTL can negatively affect 4469 4356 * guest state and can even be fatal, e.g. due to Bus Lock Detect. 4470 4357 */ 4471 - if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4358 + if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) && 4472 4359 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4473 4360 update_debugctlmsr(svm->vmcb->save.dbgctl); 4474 4361 ··· 4499 4386 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4500 4387 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 4501 4388 4502 - if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4389 + if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) && 4503 4390 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4504 4391 update_debugctlmsr(vcpu->arch.host_debugctl); 4505 4392 ··· 4517 4404 nested_sync_control_from_vmcb02(svm); 4518 4405 4519 4406 /* Track VMRUNs that have made past consistency checking */ 4520 - if (svm->nested.nested_run_pending && 4407 + if (vcpu->arch.nested_run_pending && 4521 4408 !svm_is_vmrun_failure(svm->vmcb->control.exit_code)) 4522 4409 ++vcpu->stat.nested_run; 4523 4410 4524 - svm->nested.nested_run_pending = 0; 4411 + vcpu->arch.nested_run_pending = 0; 4525 4412 } 4526 4413 4527 4414 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; ··· 4548 4435 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4549 4436 4550 4437 svm_complete_interrupts(vcpu); 4438 + 4439 + /* 4440 + * Update the cache after completing interrupts to get an 
accurate 4441 + * NextRIP, e.g. when re-injecting a soft interrupt. 4442 + * 4443 + * FIXME: Rework svm_get_nested_state() to not pull data from the 4444 + * cache (except for maybe int_ctl). 4445 + */ 4446 + if (is_guest_mode(vcpu)) 4447 + svm->nested.ctl.next_rip = svm->vmcb->control.next_rip; 4551 4448 4552 4449 return svm_exit_handlers_fastpath(vcpu); 4553 4450 } ··· 4890 4767 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4891 4768 { 4892 4769 struct vcpu_svm *svm = to_svm(vcpu); 4893 - if (svm->nested.nested_run_pending) 4770 + if (vcpu->arch.nested_run_pending) 4894 4771 return -EBUSY; 4895 4772 4896 4773 if (svm_smi_blocked(vcpu)) ··· 4907 4784 { 4908 4785 struct vcpu_svm *svm = to_svm(vcpu); 4909 4786 struct kvm_host_map map_save; 4910 - int ret; 4911 4787 4912 4788 if (!is_guest_mode(vcpu)) 4913 4789 return 0; ··· 4926 4804 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4927 4805 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4928 4806 4929 - ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); 4930 - if (ret) 4931 - return ret; 4807 + nested_svm_simple_vmexit(svm, SVM_EXIT_SW); 4932 4808 4933 4809 /* 4934 4810 * KVM uses VMCB01 to store L1 host state while L2 runs but ··· 5004 4884 if (nested_svm_check_cached_vmcb12(vcpu) < 0) 5005 4885 goto unmap_save; 5006 4886 5007 - if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, 5008 - vmcb12, false) != 0) 4887 + if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, false) != 0) 5009 4888 goto unmap_save; 5010 4889 5011 4890 ret = 0; 5012 - svm->nested.nested_run_pending = 1; 4891 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 5013 4892 5014 4893 unmap_save: 5015 4894 kvm_vcpu_unmap(vcpu, &map_save);
+59 -22
arch/x86/kvm/svm/svm.h
··· 140 140 }; 141 141 142 142 struct vmcb_save_area_cached { 143 + struct vmcb_seg es; 144 + struct vmcb_seg cs; 145 + struct vmcb_seg ss; 146 + struct vmcb_seg ds; 147 + struct vmcb_seg gdtr; 148 + struct vmcb_seg idtr; 149 + u8 cpl; 143 150 u64 efer; 144 151 u64 cr4; 145 152 u64 cr3; 146 153 u64 cr0; 147 154 u64 dr7; 148 155 u64 dr6; 156 + u64 rflags; 157 + u64 rip; 158 + u64 rsp; 159 + u64 s_cet; 160 + u64 ssp; 161 + u64 isst_addr; 162 + u64 rax; 163 + u64 cr2; 164 + u64 dbgctl; 165 + u64 br_from; 166 + u64 br_to; 167 + u64 last_excp_from; 168 + u64 last_excp_to; 149 169 }; 150 170 151 171 struct vmcb_ctrl_area_cached { ··· 186 166 u64 exit_info_2; 187 167 u32 exit_int_info; 188 168 u32 exit_int_info_err; 189 - u64 nested_ctl; 169 + u64 misc_ctl; 190 170 u32 event_inj; 191 171 u32 event_inj_err; 192 172 u64 next_rip; 193 173 u64 nested_cr3; 194 - u64 virt_ext; 174 + u64 misc_ctl2; 195 175 u32 clean; 196 - u64 bus_lock_rip; 197 176 union { 198 177 #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) 199 178 struct hv_vmcb_enlightenments hv_enlightenments; ··· 207 188 u64 vm_cr_msr; 208 189 u64 vmcb12_gpa; 209 190 u64 last_vmcb12_gpa; 191 + u64 last_bus_lock_rip; 210 192 211 193 /* 212 194 * The MSR permissions map used for vmcb02, which is the merge result 213 195 * of vmcb01 and vmcb12 214 196 */ 215 197 void *msrpm; 216 - 217 - /* A VMRUN has started but has not yet been performed, so 218 - * we cannot inject a nested vmexit yet. 
*/ 219 - bool nested_run_pending; 220 198 221 199 /* cache for control fields of the guest */ 222 200 struct vmcb_ctrl_area_cached ctl; ··· 373 357 374 358 DECLARE_PER_CPU(struct svm_cpu_data, svm_data); 375 359 376 - void recalc_intercepts(struct vcpu_svm *svm); 377 - 378 360 static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) 379 361 { 380 362 return container_of(kvm, struct kvm_svm, kvm); ··· 429 415 vmcb->control.clean &= ~(1 << bit); 430 416 } 431 417 432 - static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) 418 + static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bit) 433 419 { 434 - return !test_bit(bit, (unsigned long *)&vmcb->control.clean); 420 + return !test_bit(bit, (unsigned long *)&control->clean); 435 421 } 436 422 437 423 static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) ··· 500 486 return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); 501 487 } 502 488 489 + void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm); 490 + 491 + static inline void svm_mark_intercepts_dirty(struct vcpu_svm *svm) 492 + { 493 + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_INTERCEPTS); 494 + 495 + /* 496 + * If L2 is active, recalculate the intercepts for vmcb02 to account 497 + * for the changes made to vmcb01. All intercept configuration is done 498 + * for vmcb01 and then propagated to vmcb02 to combine KVM's intercepts 499 + * with L1's intercepts (from the vmcb12 snapshot). 
500 + */ 501 + if (is_guest_mode(&svm->vcpu)) 502 + nested_vmcb02_recalc_intercepts(svm); 503 + } 504 + 503 505 static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) 504 506 { 505 507 struct vmcb *vmcb = svm->vmcb01.ptr; ··· 523 493 WARN_ON_ONCE(bit >= 32); 524 494 vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); 525 495 526 - recalc_intercepts(svm); 496 + svm_mark_intercepts_dirty(svm); 527 497 } 528 498 529 499 static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) ··· 533 503 WARN_ON_ONCE(bit >= 32); 534 504 vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); 535 505 536 - recalc_intercepts(svm); 506 + svm_mark_intercepts_dirty(svm); 537 507 } 538 508 539 509 static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) ··· 542 512 543 513 vmcb_set_intercept(&vmcb->control, bit); 544 514 545 - recalc_intercepts(svm); 515 + svm_mark_intercepts_dirty(svm); 546 516 } 547 517 548 518 static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) ··· 551 521 552 522 vmcb_clr_intercept(&vmcb->control, bit); 553 523 554 - recalc_intercepts(svm); 524 + svm_mark_intercepts_dirty(svm); 555 525 } 556 526 557 527 static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) ··· 608 578 609 579 static inline bool nested_npt_enabled(struct vcpu_svm *svm) 610 580 { 611 - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; 581 + return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP; 612 582 } 613 583 614 584 static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) ··· 743 713 return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT); 744 714 } 745 715 716 + #define svm_copy_lbrs(to, from) \ 717 + do { \ 718 + (to)->dbgctl = (from)->dbgctl; \ 719 + (to)->br_from = (from)->br_from; \ 720 + (to)->br_to = (from)->br_to; \ 721 + (to)->last_excp_from = (from)->last_excp_from; \ 722 + (to)->last_excp_to = (from)->last_excp_to; \ 723 + } while (0) 724 + 746 725 void 
svm_vcpu_free_msrpm(void *msrpm); 747 - void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); 748 726 void svm_enable_lbrv(struct kvm_vcpu *vcpu); 749 727 void svm_update_lbrv(struct kvm_vcpu *vcpu); 750 728 ··· 814 776 815 777 int __init nested_svm_init_msrpm_merge_offsets(void); 816 778 817 - int enter_svm_guest_mode(struct kvm_vcpu *vcpu, 818 - u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); 779 + int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, bool from_vmrun); 819 780 void svm_leave_nested(struct kvm_vcpu *vcpu); 820 781 void svm_free_nested(struct vcpu_svm *svm); 821 782 int svm_allocate_nested(struct vcpu_svm *svm); ··· 822 785 void svm_copy_vmrun_state(struct vmcb_save_area *to_save, 823 786 struct vmcb_save_area *from_save); 824 787 void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb); 825 - int nested_svm_vmexit(struct vcpu_svm *svm); 788 + void nested_svm_vmexit(struct vcpu_svm *svm); 826 789 827 - static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) 790 + static inline void nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) 828 791 { 829 792 svm->vmcb->control.exit_code = exit_code; 830 793 svm->vmcb->control.exit_info_1 = 0; 831 794 svm->vmcb->control.exit_info_2 = 0; 832 - return nested_svm_vmexit(svm); 795 + nested_svm_vmexit(svm); 833 796 } 834 797 835 798 int nested_svm_exit_handled(struct vcpu_svm *svm);
+26 -24
arch/x86/kvm/vmx/nested.c
··· 2273 2273 2274 2274 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2275 2275 { 2276 - if (vmx->nested.nested_run_pending && 2276 + if (vmx->vcpu.arch.nested_run_pending && 2277 2277 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2278 2278 return vmcs12->guest_ia32_efer; 2279 2279 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ··· 2513 2513 /* 2514 2514 * Interrupt/Exception Fields 2515 2515 */ 2516 - if (vmx->nested.nested_run_pending) { 2516 + if (vmx->vcpu.arch.nested_run_pending) { 2517 2517 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2518 2518 vmcs12->vm_entry_intr_info_field); 2519 2519 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, ··· 2621 2621 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2622 2622 } 2623 2623 2624 - if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2624 + if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending && 2625 2625 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2626 2626 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2627 2627 } ··· 2718 2718 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2719 2719 } 2720 2720 2721 - if (vmx->nested.nested_run_pending && 2721 + if (vcpu->arch.nested_run_pending && 2722 2722 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2723 2723 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2724 2724 vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & ··· 2728 2728 vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); 2729 2729 } 2730 2730 2731 - if (!vmx->nested.nested_run_pending || 2731 + if (!vcpu->arch.nested_run_pending || 2732 2732 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 2733 2733 vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, 2734 2734 vmx->nested.pre_vmenter_ssp, 2735 2735 vmx->nested.pre_vmenter_ssp_tbl); 2736 2736 2737 - if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2737 + if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending 
|| 2738 2738 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2739 2739 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2740 2740 vmx_set_rflags(vcpu, vmcs12->guest_rflags); ··· 2747 2747 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2748 2748 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2749 2749 2750 - if (vmx->nested.nested_run_pending && 2750 + if (vcpu->arch.nested_run_pending && 2751 2751 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2752 2752 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2753 2753 vcpu->arch.pat = vmcs12->guest_ia32_pat; ··· 3349 3349 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3350 3350 * CR0.PG) is 1. 3351 3351 */ 3352 - if (to_vmx(vcpu)->nested.nested_run_pending && 3352 + if (vcpu->arch.nested_run_pending && 3353 3353 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3354 3354 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3355 3355 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || ··· 3627 3627 3628 3628 kvm_service_local_tlb_flush_requests(vcpu); 3629 3629 3630 - if (!vmx->nested.nested_run_pending || 3630 + if (!vcpu->arch.nested_run_pending || 3631 3631 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3632 3632 vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); 3633 3633 if (kvm_mpx_supported() && 3634 - (!vmx->nested.nested_run_pending || 3634 + (!vcpu->arch.nested_run_pending || 3635 3635 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3636 3636 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3637 3637 3638 - if (!vmx->nested.nested_run_pending || 3638 + if (!vcpu->arch.nested_run_pending || 3639 3639 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) 3640 3640 vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, 3641 3641 &vmx->nested.pre_vmenter_ssp, ··· 3844 3844 * We're finally done with prerequisite checking, and can start with 3845 3845 * the nested 
entry. 3846 3846 */ 3847 - vmx->nested.nested_run_pending = 1; 3847 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 3848 3848 vmx->nested.has_preemption_timer_deadline = false; 3849 3849 status = nested_vmx_enter_non_root_mode(vcpu, true); 3850 3850 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) ··· 3876 3876 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3877 3877 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3878 3878 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3879 - vmx->nested.nested_run_pending = 0; 3879 + vcpu->arch.nested_run_pending = 0; 3880 3880 return kvm_emulate_halt_noskip(vcpu); 3881 3881 } 3882 3882 break; 3883 3883 case GUEST_ACTIVITY_WAIT_SIPI: 3884 - vmx->nested.nested_run_pending = 0; 3884 + vcpu->arch.nested_run_pending = 0; 3885 3885 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3886 3886 break; 3887 3887 default: ··· 3891 3891 return 1; 3892 3892 3893 3893 vmentry_failed: 3894 - vmx->nested.nested_run_pending = 0; 3894 + vcpu->arch.nested_run_pending = 0; 3895 3895 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3896 3896 return 0; 3897 3897 if (status == NVMX_VMENTRY_VMEXIT) ··· 4288 4288 * previously injected event, the pending exception occurred while said 4289 4289 * event was being delivered and thus needs to be handled. 4290 4290 */ 4291 - bool block_nested_exceptions = vmx->nested.nested_run_pending; 4291 + bool block_nested_exceptions = vcpu->arch.nested_run_pending; 4292 4292 /* 4293 4293 * Events that don't require injection, i.e. 
that are virtualized by 4294 4294 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need ··· 4657 4657 4658 4658 if (nested_cpu_has_preemption_timer(vmcs12) && 4659 4659 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4660 - !vmx->nested.nested_run_pending) 4660 + !vcpu->arch.nested_run_pending) 4661 4661 vmcs12->vmx_preemption_timer_value = 4662 4662 vmx_get_preemption_timer_value(vcpu); 4663 4663 ··· 5056 5056 vmx->nested.mtf_pending = false; 5057 5057 5058 5058 /* trying to cancel vmlaunch/vmresume is a bug */ 5059 - WARN_ON_ONCE(vmx->nested.nested_run_pending); 5059 + kvm_warn_on_nested_run_pending(vcpu); 5060 5060 5061 5061 #ifdef CONFIG_KVM_HYPERV 5062 5062 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { ··· 6679 6679 unsigned long exit_qual; 6680 6680 u32 exit_intr_info; 6681 6681 6682 - WARN_ON_ONCE(vmx->nested.nested_run_pending); 6682 + kvm_warn_on_nested_run_pending(vcpu); 6683 6683 6684 6684 /* 6685 6685 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM ··· 6775 6775 if (is_guest_mode(vcpu)) { 6776 6776 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6777 6777 6778 - if (vmx->nested.nested_run_pending) 6778 + if (vcpu->arch.nested_run_pending) 6779 6779 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6780 6780 6781 6781 if (vmx->nested.mtf_pending) ··· 6850 6850 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6851 6851 { 6852 6852 if (is_guest_mode(vcpu)) { 6853 - to_vmx(vcpu)->nested.nested_run_pending = 0; 6853 + vcpu->arch.nested_run_pending = 0; 6854 6854 nested_vmx_vmexit(vcpu, -1, 0, 0); 6855 6855 } 6856 6856 free_nested(vcpu); ··· 7008 7008 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7009 7009 return 0; 7010 7010 7011 - vmx->nested.nested_run_pending = 7012 - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 7011 + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 7012 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 7013 + else 7014 + 
vcpu->arch.nested_run_pending = 0; 7013 7015 7014 7016 vmx->nested.mtf_pending = 7015 7017 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); ··· 7056 7054 return 0; 7057 7055 7058 7056 error_guest_mode: 7059 - vmx->nested.nested_run_pending = 0; 7057 + vcpu->arch.nested_run_pending = 0; 7060 7058 return ret; 7061 7059 } 7062 7060
+8 -8
arch/x86/kvm/vmx/vmx.c
··· 5279 5279 5280 5280 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5281 5281 { 5282 - if (to_vmx(vcpu)->nested.nested_run_pending) 5282 + if (vcpu->arch.nested_run_pending) 5283 5283 return -EBUSY; 5284 5284 5285 5285 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ ··· 5306 5306 5307 5307 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5308 5308 { 5309 - if (to_vmx(vcpu)->nested.nested_run_pending) 5309 + if (vcpu->arch.nested_run_pending) 5310 5310 return -EBUSY; 5311 5311 5312 5312 /* ··· 6118 6118 * only reachable if userspace modifies L2 guest state after KVM has 6119 6119 * performed the nested VM-Enter consistency checks. 6120 6120 */ 6121 - if (vmx->nested.nested_run_pending) 6121 + if (vcpu->arch.nested_run_pending) 6122 6122 return true; 6123 6123 6124 6124 /* ··· 6802 6802 * invalid guest state should never happen as that means KVM knowingly 6803 6803 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6804 6804 */ 6805 - if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 6805 + if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm)) 6806 6806 return -EIO; 6807 6807 6808 6808 if (is_guest_mode(vcpu)) { ··· 7730 7730 * Track VMLAUNCH/VMRESUME that have made past guest state 7731 7731 * checking. 
7732 7732 */ 7733 - if (vmx->nested.nested_run_pending && 7733 + if (vcpu->arch.nested_run_pending && 7734 7734 !vmx_get_exit_reason(vcpu).failed_vmentry) 7735 7735 ++vcpu->stat.nested_run; 7736 7736 7737 - vmx->nested.nested_run_pending = 0; 7737 + vcpu->arch.nested_run_pending = 0; 7738 7738 } 7739 7739 7740 7740 if (unlikely(vmx->fail)) ··· 8491 8491 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8492 8492 { 8493 8493 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8494 - if (to_vmx(vcpu)->nested.nested_run_pending) 8494 + if (vcpu->arch.nested_run_pending) 8495 8495 return -EBUSY; 8496 8496 return !is_smm(vcpu); 8497 8497 } ··· 8536 8536 if (ret != NVMX_VMENTRY_SUCCESS) 8537 8537 return 1; 8538 8538 8539 - vmx->nested.nested_run_pending = 1; 8539 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 8540 8540 vmx->nested.smm.guest_mode = false; 8541 8541 } 8542 8542 return 0;
-3
arch/x86/kvm/vmx/vmx.h
··· 138 138 */ 139 139 bool enlightened_vmcs_enabled; 140 140 141 - /* L2 must run next, and mustn't decide to exit to L1. */ 142 - bool nested_run_pending; 143 - 144 141 /* Pending MTF VM-exit into L1. */ 145 142 bool mtf_pending; 146 143
+55 -23
arch/x86/kvm/x86.c
··· 351 351 MSR_IA32_U_CET, MSR_IA32_S_CET, 352 352 MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, 353 353 MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, 354 + MSR_IA32_DEBUGCTLMSR, 355 + MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP, 356 + MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, 354 357 }; 355 358 356 359 static const u32 msrs_to_save_pmu[] = { ··· 867 864 vcpu->arch.exception.error_code = error_code; 868 865 vcpu->arch.exception.has_payload = has_payload; 869 866 vcpu->arch.exception.payload = payload; 870 - if (!is_guest_mode(vcpu)) 871 - kvm_deliver_exception_payload(vcpu, 872 - &vcpu->arch.exception); 873 867 return; 874 868 } 875 869 ··· 5531 5531 return 0; 5532 5532 } 5533 5533 5534 - static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 5535 - struct kvm_vcpu_events *events) 5534 + static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *vcpu) 5536 5535 { 5537 - struct kvm_queued_exception *ex; 5538 - 5539 - process_nmi(vcpu); 5540 - 5541 - #ifdef CONFIG_KVM_SMM 5542 - if (kvm_check_request(KVM_REQ_SMI, vcpu)) 5543 - process_smi(vcpu); 5544 - #endif 5545 - 5546 5536 /* 5547 5537 * KVM's ABI only allows for one exception to be migrated. Luckily, 5548 5538 * the only time there can be two queued exceptions is if there's a ··· 5543 5553 if (vcpu->arch.exception_vmexit.pending && 5544 5554 !vcpu->arch.exception.pending && 5545 5555 !vcpu->arch.exception.injected) 5546 - ex = &vcpu->arch.exception_vmexit; 5547 - else 5548 - ex = &vcpu->arch.exception; 5556 + return &vcpu->arch.exception_vmexit; 5557 + 5558 + return &vcpu->arch.exception; 5559 + } 5560 + 5561 + static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu) 5562 + { 5563 + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu); 5549 5564 5550 5565 /* 5551 - * In guest mode, payload delivery should be deferred if the exception 5552 - * will be intercepted by L1, e.g. 
KVM should not modifying CR2 if L1 5553 - * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, 5554 - * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not 5555 - * propagate the payload and so it cannot be safely deferred. Deliver 5556 - * the payload if the capability hasn't been requested. 5566 + * If KVM_CAP_EXCEPTION_PAYLOAD is disabled, then (prematurely) deliver 5567 + * the pending exception payload when userspace saves *any* vCPU state 5568 + * that interacts with exception payloads to avoid breaking userspace. 5569 + * 5570 + * Architecturally, KVM must not deliver an exception payload until the 5571 + * exception is actually injected, e.g. to avoid losing pending #DB 5572 + * information (which VMX tracks in the VMCS), and to avoid clobbering 5573 + * state if the exception is never injected for whatever reason. But 5574 + * if KVM_CAP_EXCEPTION_PAYLOAD isn't enabled, then userspace may or 5575 + * may not propagate the payload across save+restore, and so KVM can't 5576 + * safely defer delivery of the payload. 
5557 5577 */ 5558 5578 if (!vcpu->kvm->arch.exception_payload_enabled && 5559 5579 ex->pending && ex->has_payload) 5560 5580 kvm_deliver_exception_payload(vcpu, ex); 5581 + } 5582 + 5583 + static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 5584 + struct kvm_vcpu_events *events) 5585 + { 5586 + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu); 5587 + 5588 + process_nmi(vcpu); 5589 + 5590 + #ifdef CONFIG_KVM_SMM 5591 + if (kvm_check_request(KVM_REQ_SMI, vcpu)) 5592 + process_smi(vcpu); 5593 + #endif 5594 + 5595 + kvm_handle_exception_payload_quirk(vcpu); 5561 5596 5562 5597 memset(events, 0, sizeof(*events)); 5563 5598 ··· 5760 5745 if (vcpu->kvm->arch.has_protected_state && 5761 5746 vcpu->arch.guest_state_protected) 5762 5747 return -EINVAL; 5748 + 5749 + kvm_handle_exception_payload_quirk(vcpu); 5763 5750 5764 5751 memset(dbgregs, 0, sizeof(*dbgregs)); 5765 5752 ··· 8914 8897 return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags); 8915 8898 } 8916 8899 8900 + static bool emulator_page_address_valid(struct x86_emulate_ctxt *ctxt, gpa_t gpa) 8901 + { 8902 + return page_address_valid(emul_to_vcpu(ctxt), gpa); 8903 + } 8904 + 8917 8905 static const struct x86_emulate_ops emulate_ops = { 8918 8906 .vm_bugged = emulator_vm_bugged, 8919 8907 .read_gpr = emulator_read_gpr, ··· 8966 8944 .set_xcr = emulator_set_xcr, 8967 8945 .get_untagged_addr = emulator_get_untagged_addr, 8968 8946 .is_canonical_addr = emulator_is_canonical_addr, 8947 + .page_address_valid = emulator_page_address_valid, 8969 8948 }; 8970 8949 8971 8950 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) ··· 11940 11917 static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) 11941 11918 { 11942 11919 /* 11920 + * Userspace may have modified vCPU state, mark nested_run_pending as 11921 + * "untrusted" to avoid triggering false-positive WARNs. 
11922 + */ 11923 + if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING) 11924 + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 11925 + 11926 + /* 11943 11927 * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and 11944 11928 * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted 11945 11929 * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be ··· 12185 12155 12186 12156 if (vcpu->arch.guest_state_protected) 12187 12157 goto skip_protected_regs; 12158 + 12159 + kvm_handle_exception_payload_quirk(vcpu); 12188 12160 12189 12161 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 12190 12162 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+10
arch/x86/kvm/x86.h
··· 188 188 return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); 189 189 } 190 190 191 + /* 192 + * WARN if a nested VM-Enter is pending completion, and userspace hasn't gained 193 + * control since the nested VM-Enter was initiated (in which case, userspace 194 + * may have modified vCPU state to induce an architecturally invalid VM-Exit). 195 + */ 196 + static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu) 197 + { 198 + WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING); 199 + } 200 + 191 201 static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) 192 202 { 193 203 vcpu->arch.mp_state = mp_state;
+3
tools/testing/selftests/kvm/Makefile.kvm
··· 111 111 TEST_GEN_PROGS_x86 += x86/vmx_preemption_timer_test 112 112 TEST_GEN_PROGS_x86 += x86/svm_vmcall_test 113 113 TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test 114 + TEST_GEN_PROGS_x86 += x86/svm_nested_clear_efer_svme 114 115 TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test 115 116 TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test 117 + TEST_GEN_PROGS_x86 += x86/svm_nested_vmcb12_gpa 118 + TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state 116 119 TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync 117 120 TEST_GEN_PROGS_x86 += x86/sync_regs_test 118 121 TEST_GEN_PROGS_x86 += x86/ucna_injection_test
+5
tools/testing/selftests/kvm/include/x86/processor.h
··· 1390 1390 return get_kvm_param_bool("ignore_msrs"); 1391 1391 } 1392 1392 1393 + static inline bool kvm_is_lbrv_enabled(void) 1394 + { 1395 + return !!get_kvm_amd_param_integer("lbrv"); 1396 + } 1397 + 1393 1398 uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr); 1394 1399 1395 1400 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
+7 -7
tools/testing/selftests/kvm/include/x86/svm.h
··· 97 97 u64 exit_info_2; 98 98 u32 exit_int_info; 99 99 u32 exit_int_info_err; 100 - u64 nested_ctl; 100 + u64 misc_ctl; 101 101 u64 avic_vapic_bar; 102 102 u8 reserved_4[8]; 103 103 u32 event_inj; 104 104 u32 event_inj_err; 105 105 u64 nested_cr3; 106 - u64 virt_ext; 106 + u64 misc_ctl2; 107 107 u32 clean; 108 108 u32 reserved_5; 109 109 u64 next_rip; ··· 155 155 #define AVIC_ENABLE_SHIFT 31 156 156 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) 157 157 158 - #define LBR_CTL_ENABLE_MASK BIT_ULL(0) 159 - #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) 160 - 161 158 #define SVM_INTERRUPT_SHADOW_MASK 1 162 159 163 160 #define SVM_IOIO_STR_SHIFT 2 ··· 172 175 #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL 173 176 #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL 174 177 175 - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) 176 - #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) 178 + #define SVM_MISC_ENABLE_NP BIT(0) 179 + #define SVM_MISC_ENABLE_SEV BIT(1) 180 + 181 + #define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0) 182 + #define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1) 177 183 178 184 struct __attribute__ ((__packed__)) vmcb_seg { 179 185 u16 selector;
+1 -1
tools/testing/selftests/kvm/lib/x86/svm.c
··· 126 126 guest_regs.rdi = (u64)svm; 127 127 128 128 if (svm->ncr3_gpa) { 129 - ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 129 + ctrl->misc_ctl |= SVM_MISC_ENABLE_NP; 130 130 ctrl->nested_cr3 = svm->ncr3_gpa; 131 131 } 132 132 }
+8 -8
tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
··· 79 79 svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) | 80 80 BIT_ULL(INTERCEPT_VMLOAD)); 81 81 82 - /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */ 83 - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 82 + /* ..SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE cleared.. */ 83 + svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 84 84 85 85 svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; 86 86 run_guest(svm->vmcb, svm->vmcb_gpa); ··· 90 90 run_guest(svm->vmcb, svm->vmcb_gpa); 91 91 GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); 92 92 93 - /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */ 94 - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 93 + /* ..and SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE set */ 94 + svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 95 95 96 96 svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; 97 97 run_guest(svm->vmcb, svm->vmcb_gpa); ··· 106 106 BIT_ULL(INTERCEPT_VMLOAD)); 107 107 108 108 /* 109 - * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be 109 + * Without SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be 110 110 * interpreted as an L1 GPA, so VMCB0 should be used. 111 111 */ 112 112 svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0; 113 - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 113 + svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 114 114 run_guest(svm->vmcb, svm->vmcb_gpa); 115 115 GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); 116 116 117 117 /* 118 - * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as 118 + * With SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be interpreted as 119 119 * an L2 GPA, and translated through the NPT to VMCB1. 
120 120 */ 121 121 svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1; 122 - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 122 + svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; 123 123 run_guest(svm->vmcb, svm->vmcb_gpa); 124 124 GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); 125 125
+35
tools/testing/selftests/kvm/x86/state_test.c
··· 26 26 GUEST_SYNC(4); 27 27 /* Exit to L1 */ 28 28 vmcall(); 29 + clgi(); 29 30 GUEST_SYNC(6); 31 + stgi(); 30 32 /* Done, exit to L1 and never come back. */ 31 33 vmcall(); 32 34 } ··· 42 40 /* Prepare for L2 execution. */ 43 41 generic_svm_setup(svm, svm_l2_guest_code, 44 42 &l2_guest_stack[L2_GUEST_STACK_SIZE]); 43 + 44 + vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK); 45 45 46 46 GUEST_SYNC(3); 47 47 run_guest(vmcb, svm->vmcb_gpa); ··· 226 222 GUEST_DONE(); 227 223 } 228 224 225 + void svm_check_nested_state(int stage, struct kvm_x86_state *state) 226 + { 227 + struct vmcb *vmcb = (struct vmcb *)state->nested.data.svm; 228 + 229 + if (kvm_cpu_has(X86_FEATURE_VGIF)) { 230 + if (stage == 4) 231 + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 1); 232 + if (stage == 6) 233 + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 0); 234 + } 235 + 236 + if (kvm_cpu_has(X86_FEATURE_NRIPS)) { 237 + /* 238 + * GUEST_SYNC() causes IO emulation in KVM, in which case the 239 + * RIP is advanced before exiting to userspace. Hence, the RIP 240 + * in the saved state should be the same as nRIP saved by the 241 + * CPU in the VMCB. 242 + */ 243 + if (stage == 6) 244 + TEST_ASSERT_EQ(vmcb->control.next_rip, state->regs.rip); 245 + } 246 + } 247 + 248 + void check_nested_state(int stage, struct kvm_x86_state *state) 249 + { 250 + if (kvm_has_cap(KVM_CAP_NESTED_STATE) && kvm_cpu_has(X86_FEATURE_SVM)) 251 + svm_check_nested_state(stage, state); 252 + } 253 + 229 254 int main(int argc, char *argv[]) 230 255 { 231 256 uint64_t *xstate_bv, saved_xstate_bv; ··· 310 277 vcpu_regs_get(vcpu, &regs1); 311 278 312 279 kvm_vm_release(vm); 280 + 281 + check_nested_state(stage, state); 313 282 314 283 /* Restore state in a new VM. */ 315 284 vcpu = vm_recreate_with_one_vcpu(vm);
+145
tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2026, Google, Inc. 4 + */ 5 + 6 + #include "test_util.h" 7 + #include "kvm_util.h" 8 + #include "processor.h" 9 + #include "svm_util.h" 10 + 11 + 12 + #define L2_GUEST_STACK_SIZE 64 13 + 14 + #define DO_BRANCH() do { asm volatile("jmp 1f\n 1: nop"); } while (0) 15 + 16 + struct lbr_branch { 17 + u64 from, to; 18 + }; 19 + 20 + volatile struct lbr_branch l2_branch; 21 + 22 + #define RECORD_AND_CHECK_BRANCH(b) \ 23 + do { \ 24 + wrmsr(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR); \ 25 + DO_BRANCH(); \ 26 + (b)->from = rdmsr(MSR_IA32_LASTBRANCHFROMIP); \ 27 + (b)->to = rdmsr(MSR_IA32_LASTBRANCHTOIP); \ 28 + /* Disable LBR right after to avoid overriding the IPs */ \ 29 + wrmsr(MSR_IA32_DEBUGCTLMSR, 0); \ 30 + \ 31 + GUEST_ASSERT_NE((b)->from, 0); \ 32 + GUEST_ASSERT_NE((b)->to, 0); \ 33 + } while (0) 34 + 35 + #define CHECK_BRANCH_MSRS(b) \ 36 + do { \ 37 + GUEST_ASSERT_EQ((b)->from, rdmsr(MSR_IA32_LASTBRANCHFROMIP)); \ 38 + GUEST_ASSERT_EQ((b)->to, rdmsr(MSR_IA32_LASTBRANCHTOIP)); \ 39 + } while (0) 40 + 41 + #define CHECK_BRANCH_VMCB(b, vmcb) \ 42 + do { \ 43 + GUEST_ASSERT_EQ((b)->from, vmcb->save.br_from); \ 44 + GUEST_ASSERT_EQ((b)->to, vmcb->save.br_to); \ 45 + } while (0) 46 + 47 + static void l2_guest_code(struct svm_test_data *svm) 48 + { 49 + /* Record a branch, trigger save/restore, and make sure LBRs are intact */ 50 + RECORD_AND_CHECK_BRANCH(&l2_branch); 51 + GUEST_SYNC(true); 52 + CHECK_BRANCH_MSRS(&l2_branch); 53 + vmmcall(); 54 + } 55 + 56 + static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv) 57 + { 58 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 59 + struct vmcb *vmcb = svm->vmcb; 60 + struct lbr_branch l1_branch; 61 + 62 + /* Record a branch, trigger save/restore, and make sure LBRs are intact */ 63 + RECORD_AND_CHECK_BRANCH(&l1_branch); 64 + GUEST_SYNC(true); 65 + CHECK_BRANCH_MSRS(&l1_branch); 66 + 67 + /* Run L2, which will also do the same */ 68 + 
generic_svm_setup(svm, l2_guest_code, 69 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 70 + 71 + if (nested_lbrv) 72 + vmcb->control.misc_ctl2 = SVM_MISC2_ENABLE_V_LBR; 73 + else 74 + vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR; 75 + 76 + run_guest(vmcb, svm->vmcb_gpa); 77 + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); 78 + 79 + /* Trigger save/restore one more time before checking, just for kicks */ 80 + GUEST_SYNC(true); 81 + 82 + /* 83 + * If SVM_MISC2_ENABLE_V_LBR is set, L1 and L2 should have separate LBR MSRs, so 84 + * expect L1's LBRs to remain intact and L2 LBRs to be in the VMCB. 85 + * Otherwise, the MSRs are shared between L1 & L2 so expect L2's LBRs. 86 + */ 87 + if (nested_lbrv) { 88 + CHECK_BRANCH_MSRS(&l1_branch); 89 + CHECK_BRANCH_VMCB(&l2_branch, vmcb); 90 + } else { 91 + CHECK_BRANCH_MSRS(&l2_branch); 92 + } 93 + GUEST_DONE(); 94 + } 95 + 96 + void test_lbrv_nested_state(bool nested_lbrv) 97 + { 98 + struct kvm_x86_state *state = NULL; 99 + struct kvm_vcpu *vcpu; 100 + vm_vaddr_t svm_gva; 101 + struct kvm_vm *vm; 102 + struct ucall uc; 103 + 104 + pr_info("Testing with nested LBRV %s\n", nested_lbrv ? 
"enabled" : "disabled"); 105 + 106 + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); 107 + vcpu_alloc_svm(vm, &svm_gva); 108 + vcpu_args_set(vcpu, 2, svm_gva, nested_lbrv); 109 + 110 + for (;;) { 111 + vcpu_run(vcpu); 112 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); 113 + switch (get_ucall(vcpu, &uc)) { 114 + case UCALL_SYNC: 115 + /* Save the vCPU state and restore it in a new VM on sync */ 116 + pr_info("Guest triggered save/restore.\n"); 117 + state = vcpu_save_state(vcpu); 118 + kvm_vm_release(vm); 119 + vcpu = vm_recreate_with_one_vcpu(vm); 120 + vcpu_load_state(vcpu, state); 121 + kvm_x86_state_cleanup(state); 122 + break; 123 + case UCALL_ABORT: 124 + REPORT_GUEST_ASSERT(uc); 125 + /* NOT REACHED */ 126 + case UCALL_DONE: 127 + goto done; 128 + default: 129 + TEST_FAIL("Unknown ucall %lu", uc.cmd); 130 + } 131 + } 132 + done: 133 + kvm_vm_free(vm); 134 + } 135 + 136 + int main(int argc, char *argv[]) 137 + { 138 + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 139 + TEST_REQUIRE(kvm_is_lbrv_enabled()); 140 + 141 + test_lbrv_nested_state(/*nested_lbrv=*/false); 142 + test_lbrv_nested_state(/*nested_lbrv=*/true); 143 + 144 + return 0; 145 + }
+55
tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2026, Google LLC. 4 + */ 5 + #include "kvm_util.h" 6 + #include "vmx.h" 7 + #include "svm_util.h" 8 + #include "kselftest.h" 9 + 10 + 11 + #define L2_GUEST_STACK_SIZE 64 12 + 13 + static void l2_guest_code(void) 14 + { 15 + unsigned long efer = rdmsr(MSR_EFER); 16 + 17 + /* generic_svm_setup() initializes EFER_SVME set for L2 */ 18 + GUEST_ASSERT(efer & EFER_SVME); 19 + wrmsr(MSR_EFER, efer & ~EFER_SVME); 20 + 21 + /* Unreachable, L1 should be shutdown */ 22 + GUEST_ASSERT(0); 23 + } 24 + 25 + static void l1_guest_code(struct svm_test_data *svm) 26 + { 27 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 28 + 29 + generic_svm_setup(svm, l2_guest_code, 30 + &l2_guest_stack[L2_GUEST_STACK_SIZE]); 31 + run_guest(svm->vmcb, svm->vmcb_gpa); 32 + 33 + /* Unreachable, L1 should be shutdown */ 34 + GUEST_ASSERT(0); 35 + } 36 + 37 + int main(int argc, char *argv[]) 38 + { 39 + struct kvm_vcpu *vcpu; 40 + struct kvm_vm *vm; 41 + vm_vaddr_t nested_gva = 0; 42 + 43 + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 44 + 45 + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); 46 + 47 + vcpu_alloc_svm(vm, &nested_gva); 48 + vcpu_args_set(vcpu, 1, nested_gva); 49 + 50 + vcpu_run(vcpu); 51 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); 52 + 53 + kvm_vm_free(vm); 54 + return 0; 55 + }
+176
tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2026, Google LLC. 4 + */ 5 + #include "kvm_util.h" 6 + #include "vmx.h" 7 + #include "svm_util.h" 8 + #include "kselftest.h" 9 + #include "kvm_test_harness.h" 10 + #include "test_util.h" 11 + 12 + 13 + #define L2_GUEST_STACK_SIZE 64 14 + 15 + #define SYNC_GP 101 16 + #define SYNC_L2_STARTED 102 17 + 18 + static unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; 19 + 20 + static void guest_gp_handler(struct ex_regs *regs) 21 + { 22 + GUEST_SYNC(SYNC_GP); 23 + } 24 + 25 + static void l2_code(void) 26 + { 27 + GUEST_SYNC(SYNC_L2_STARTED); 28 + vmcall(); 29 + } 30 + 31 + static void l1_vmrun(struct svm_test_data *svm, u64 gpa) 32 + { 33 + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 34 + 35 + asm volatile ("vmrun %[gpa]" : : [gpa] "a" (gpa) : "memory"); 36 + } 37 + 38 + static void l1_vmload(struct svm_test_data *svm, u64 gpa) 39 + { 40 + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 41 + 42 + asm volatile ("vmload %[gpa]" : : [gpa] "a" (gpa) : "memory"); 43 + } 44 + 45 + static void l1_vmsave(struct svm_test_data *svm, u64 gpa) 46 + { 47 + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 48 + 49 + asm volatile ("vmsave %[gpa]" : : [gpa] "a" (gpa) : "memory"); 50 + } 51 + 52 + static void l1_vmexit(struct svm_test_data *svm, u64 gpa) 53 + { 54 + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); 55 + 56 + run_guest(svm->vmcb, svm->vmcb_gpa); 57 + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); 58 + GUEST_DONE(); 59 + } 60 + 61 + static u64 unmappable_gpa(struct kvm_vcpu *vcpu) 62 + { 63 + struct userspace_mem_region *region; 64 + u64 region_gpa_end, vm_gpa_end = 0; 65 + int i; 66 + 67 + hash_for_each(vcpu->vm->regions.slot_hash, i, region, slot_node) { 68 + region_gpa_end = region->region.guest_phys_addr + region->region.memory_size; 69 + vm_gpa_end = max(vm_gpa_end, region_gpa_end); 70 
+ } 71 + 72 + return vm_gpa_end; 73 + } 74 + 75 + static void test_invalid_vmcb12(struct kvm_vcpu *vcpu) 76 + { 77 + vm_vaddr_t nested_gva = 0; 78 + struct ucall uc; 79 + 80 + 81 + vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); 82 + vcpu_alloc_svm(vcpu->vm, &nested_gva); 83 + vcpu_args_set(vcpu, 2, nested_gva, -1ULL); 84 + vcpu_run(vcpu); 85 + 86 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); 87 + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); 88 + TEST_ASSERT_EQ(uc.args[1], SYNC_GP); 89 + } 90 + 91 + static void test_unmappable_vmcb12(struct kvm_vcpu *vcpu) 92 + { 93 + vm_vaddr_t nested_gva = 0; 94 + 95 + vcpu_alloc_svm(vcpu->vm, &nested_gva); 96 + vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu)); 97 + vcpu_run(vcpu); 98 + 99 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); 100 + TEST_ASSERT_EQ(vcpu->run->emulation_failure.suberror, KVM_INTERNAL_ERROR_EMULATION); 101 + } 102 + 103 + static void test_unmappable_vmcb12_vmexit(struct kvm_vcpu *vcpu) 104 + { 105 + struct kvm_x86_state *state; 106 + vm_vaddr_t nested_gva = 0; 107 + struct ucall uc; 108 + 109 + /* 110 + * Enter L2 (with a legit vmcb12 GPA), then overwrite vmcb12 GPA with an 111 + * unmappable GPA. KVM will fail to map vmcb12 on nested VM-Exit and 112 + * cause a shutdown. 
113 + */ 114 + vcpu_alloc_svm(vcpu->vm, &nested_gva); 115 + vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu)); 116 + vcpu_run(vcpu); 117 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); 118 + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); 119 + TEST_ASSERT_EQ(uc.args[1], SYNC_L2_STARTED); 120 + 121 + state = vcpu_save_state(vcpu); 122 + state->nested.hdr.svm.vmcb_pa = unmappable_gpa(vcpu); 123 + vcpu_load_state(vcpu, state); 124 + vcpu_run(vcpu); 125 + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); 126 + 127 + kvm_x86_state_cleanup(state); 128 + } 129 + 130 + KVM_ONE_VCPU_TEST_SUITE(vmcb12_gpa); 131 + 132 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_invalid, l1_vmrun) 133 + { 134 + test_invalid_vmcb12(vcpu); 135 + } 136 + 137 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_invalid, l1_vmload) 138 + { 139 + test_invalid_vmcb12(vcpu); 140 + } 141 + 142 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_invalid, l1_vmsave) 143 + { 144 + test_invalid_vmcb12(vcpu); 145 + } 146 + 147 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_unmappable, l1_vmrun) 148 + { 149 + test_unmappable_vmcb12(vcpu); 150 + } 151 + 152 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_unmappable, l1_vmload) 153 + { 154 + test_unmappable_vmcb12(vcpu); 155 + } 156 + 157 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_unmappable, l1_vmsave) 158 + { 159 + test_unmappable_vmcb12(vcpu); 160 + } 161 + 162 + /* 163 + * Invalid vmcb12_gpa cannot be tested for #VMEXIT as KVM_SET_NESTED_STATE will 164 + * reject it. 165 + */ 166 + KVM_ONE_VCPU_TEST(vmcb12_gpa, vmexit_unmappable, l1_vmexit) 167 + { 168 + test_unmappable_vmcb12_vmexit(vcpu); 169 + } 170 + 171 + int main(int argc, char *argv[]) 172 + { 173 + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); 174 + 175 + return test_harness_run(argc, argv); 176 + }