Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+10

arch/powerpc/kvm/book3s_64_mmu_radix.c

··· 646 646 */ 647 647 local_irq_disable(); 648 648 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 649 + /* 650 + * If the PTE disappeared temporarily due to a THP 651 + * collapse, just return and let the guest try again. 652 + */ 653 + if (!ptep) { 654 + local_irq_enable(); 655 + if (page) 656 + put_page(page); 657 + return RESUME_GUEST; 658 + } 649 659 pte = *ptep; 650 660 local_irq_enable(); 651 661

+20 -4

arch/x86/kvm/mmu.c

··· 249 249 */ 250 250 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; 251 251 252 + /* 253 + * In some cases, we need to preserve the GFN of a non-present or reserved 254 + * SPTE when we usurp the upper five bits of the physical address space to 255 + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll 256 + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask 257 + * left into the reserved bits, i.e. the GFN in the SPTE will be split into 258 + * high and low parts. This mask covers the lower bits of the GFN. 259 + */ 260 + static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; 261 + 262 + 252 263 static void mmu_spte_set(u64 *sptep, u64 spte); 253 264 static union kvm_mmu_page_role 254 265 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); ··· 368 357 369 358 static gfn_t get_mmio_spte_gfn(u64 spte) 370 359 { 371 - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | 372 - shadow_nonpresent_or_rsvd_mask; 373 - u64 gpa = spte & ~mask; 360 + u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; 374 361 375 362 gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) 376 363 & shadow_nonpresent_or_rsvd_mask; ··· 432 423 433 424 static void kvm_mmu_reset_all_pte_masks(void) 434 425 { 426 + u8 low_phys_bits; 427 + 435 428 shadow_user_mask = 0; 436 429 shadow_accessed_mask = 0; 437 430 shadow_dirty_mask = 0; ··· 448 437 * appropriate mask to guard against L1TF attacks. Otherwise, it is 449 438 * assumed that the CPU is not vulnerable to L1TF. 450 439 */ 440 + low_phys_bits = boot_cpu_data.x86_phys_bits; 451 441 if (boot_cpu_data.x86_phys_bits < 452 - 52 - shadow_nonpresent_or_rsvd_mask_len) 442 + 52 - shadow_nonpresent_or_rsvd_mask_len) { 453 443 shadow_nonpresent_or_rsvd_mask = 454 444 rsvd_bits(boot_cpu_data.x86_phys_bits - 455 445 shadow_nonpresent_or_rsvd_mask_len, 456 446 boot_cpu_data.x86_phys_bits - 1); 447 + low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; 448 + } 449 + shadow_nonpresent_or_rsvd_lower_gfn_mask = 450 + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); 457 451 } 458 452 459 453 static int is_cpuid_PSE36(void)

+75 -60

arch/x86/kvm/vmx.c

··· 121 121 122 122 #define MSR_BITMAP_MODE_X2APIC 1 123 123 #define MSR_BITMAP_MODE_X2APIC_APICV 2 124 - #define MSR_BITMAP_MODE_LM 4 125 124 126 125 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 127 126 ··· 856 857 857 858 /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ 858 859 u64 vmcs01_debugctl; 860 + u64 vmcs01_guest_bndcfgs; 859 861 860 862 u16 vpid02; 861 863 u16 last_vpid; ··· 2899 2899 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 2900 2900 } 2901 2901 2902 - if (is_long_mode(&vmx->vcpu)) 2903 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2902 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2904 2903 #else 2905 2904 savesegment(fs, fs_sel); 2906 2905 savesegment(gs, gs_sel); ··· 2950 2951 vmx->loaded_cpu_state = NULL; 2951 2952 2952 2953 #ifdef CONFIG_X86_64 2953 - if (is_long_mode(&vmx->vcpu)) 2954 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2954 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2955 2955 #endif 2956 2956 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 2957 2957 kvm_load_ldt(host_state->ldt_sel); ··· 2978 2980 #ifdef CONFIG_X86_64 2979 2981 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 2980 2982 { 2981 - if (is_long_mode(&vmx->vcpu)) { 2982 - preempt_disable(); 2983 - if (vmx->loaded_cpu_state) 2984 - rdmsrl(MSR_KERNEL_GS_BASE, 2985 - vmx->msr_guest_kernel_gs_base); 2986 - preempt_enable(); 2987 - } 2983 + preempt_disable(); 2984 + if (vmx->loaded_cpu_state) 2985 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2986 + preempt_enable(); 2988 2987 return vmx->msr_guest_kernel_gs_base; 2989 2988 } 2990 2989 2991 2990 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 2992 2991 { 2993 - if (is_long_mode(&vmx->vcpu)) { 2994 - preempt_disable(); 2995 - if (vmx->loaded_cpu_state) 2996 - wrmsrl(MSR_KERNEL_GS_BASE, data); 2997 - preempt_enable(); 2998 - } 2992 + preempt_disable(); 2993 + if (vmx->loaded_cpu_state) 2994 + wrmsrl(MSR_KERNEL_GS_BASE, data); 2995 + preempt_enable(); 2999 2996 vmx->msr_guest_kernel_gs_base = data; 3000 2997 } 3001 2998 #endif ··· 3526 3533 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 3527 3534 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 3528 3535 3529 - if (kvm_mpx_supported()) 3530 - msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 3531 - 3532 3536 /* We support free control of debug control saving. */ 3533 3537 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 3534 3538 ··· 3542 3552 VM_ENTRY_LOAD_IA32_PAT; 3543 3553 msrs->entry_ctls_high |= 3544 3554 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 3545 - if (kvm_mpx_supported()) 3546 - msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 3547 3555 3548 3556 /* We support free control of debug control loading. */ 3549 3557 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; ··· 3589 3601 msrs->secondary_ctls_high); 3590 3602 msrs->secondary_ctls_low = 0; 3591 3603 msrs->secondary_ctls_high &= 3592 - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 3593 3604 SECONDARY_EXEC_DESC | 3594 3605 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3595 3606 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3596 3607 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3597 3608 SECONDARY_EXEC_WBINVD_EXITING; 3609 + 3598 3610 /* 3599 3611 * We can emulate "VMCS shadowing," even if the hardware 3600 3612 * doesn't support it. ··· 3650 3662 if (enable_unrestricted_guest) 3651 3663 msrs->secondary_ctls_high |= 3652 3664 SECONDARY_EXEC_UNRESTRICTED_GUEST; 3665 + 3666 + if (flexpriority_enabled) 3667 + msrs->secondary_ctls_high |= 3668 + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 3653 3669 3654 3670 /* miscellaneous data */ 3655 3671 rdmsr(MSR_IA32_VMX_MISC, ··· 5065 5073 if (!msr) 5066 5074 return; 5067 5075 5068 - /* 5069 - * MSR_KERNEL_GS_BASE is not intercepted when the guest is in 5070 - * 64-bit mode as a 64-bit kernel may frequently access the 5071 - * MSR. This means we need to manually save/restore the MSR 5072 - * when switching between guest and host state, but only if 5073 - * the guest is in 64-bit mode. Sync our cached value if the 5074 - * guest is transitioning to 32-bit mode and the CPU contains 5075 - * guest state, i.e. the cache is stale. 5076 - */ 5077 - #ifdef CONFIG_X86_64 5078 - if (!(efer & EFER_LMA)) 5079 - (void)vmx_read_guest_kernel_gs_base(vmx); 5080 - #endif 5081 5076 vcpu->arch.efer = efer; 5082 5077 if (efer & EFER_LMA) { 5083 5078 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); ··· 6057 6078 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 6058 6079 } 6059 6080 6060 - if (is_long_mode(vcpu)) 6061 - mode |= MSR_BITMAP_MODE_LM; 6062 - 6063 6081 return mode; 6064 6082 } 6065 6083 ··· 6096 6120 6097 6121 if (!changed) 6098 6122 return; 6099 - 6100 - vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, 6101 - !(mode & MSR_BITMAP_MODE_LM)); 6102 6123 6103 6124 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) 6104 6125 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); ··· 6162 6189 nested_mark_vmcs12_pages_dirty(vcpu); 6163 6190 } 6164 6191 6192 + static u8 vmx_get_rvi(void) 6193 + { 6194 + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; 6195 + } 6196 + 6165 6197 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 6166 6198 { 6167 6199 struct vcpu_vmx *vmx = to_vmx(vcpu); ··· 6179 6201 WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) 6180 6202 return false; 6181 6203 6182 - rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; 6204 + rvi = vmx_get_rvi(); 6183 6205 6184 6206 vapic_page = kmap(vmx->nested.virtual_apic_page); 6185 6207 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); ··· 10223 10245 if (!lapic_in_kernel(vcpu)) 10224 10246 return; 10225 10247 10248 + if (!flexpriority_enabled && 10249 + !cpu_has_vmx_virtualize_x2apic_mode()) 10250 + return; 10251 + 10226 10252 /* Postpone execution until vmcs01 is the current VMCS. */ 10227 10253 if (is_guest_mode(vcpu)) { 10228 10254 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; 10229 10255 return; 10230 10256 } 10231 - 10232 - if (!cpu_need_tpr_shadow(vcpu)) 10233 - return; 10234 10257 10235 10258 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 10236 10259 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | ··· 10352 10373 } 10353 10374 vmx_hwapic_irr_update(vcpu, max_irr); 10354 10375 return max_irr; 10376 + } 10377 + 10378 + static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 10379 + { 10380 + u8 rvi = vmx_get_rvi(); 10381 + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 10382 + 10383 + return ((rvi & 0xf0) > (vppr & 0xf0)); 10355 10384 } 10356 10385 10357 10386 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) ··· 11251 11264 #undef cr4_fixed1_update 11252 11265 } 11253 11266 11267 + static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 11268 + { 11269 + struct vcpu_vmx *vmx = to_vmx(vcpu); 11270 + 11271 + if (kvm_mpx_supported()) { 11272 + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); 11273 + 11274 + if (mpx_enabled) { 11275 + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 11276 + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 11277 + } else { 11278 + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; 11279 + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; 11280 + } 11281 + } 11282 + } 11283 + 11254 11284 static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 11255 11285 { 11256 11286 struct vcpu_vmx *vmx = to_vmx(vcpu); ··· 11284 11280 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 11285 11281 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 11286 11282 11287 - if (nested_vmx_allowed(vcpu)) 11283 + if (nested_vmx_allowed(vcpu)) { 11288 11284 nested_vmx_cr_fixed1_bits_update(vcpu); 11285 + nested_vmx_entry_exit_ctls_update(vcpu); 11286 + } 11289 11287 } 11290 11288 11291 11289 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) ··· 12055 12049 12056 12050 set_cr4_guest_host_mask(vmx); 12057 12051 12058 - if (vmx_mpx_supported()) 12059 - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 12052 + if (kvm_mpx_supported()) { 12053 + if (vmx->nested.nested_run_pending && 12054 + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 12055 + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 12056 + else 12057 + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 12058 + } 12060 12059 12061 12060 if (enable_vpid) { 12062 12061 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) ··· 12606 12595 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 12607 12596 bool from_vmentry = !!exit_qual; 12608 12597 u32 dummy_exit_qual; 12609 - u32 vmcs01_cpu_exec_ctrl; 12598 + bool evaluate_pending_interrupts; 12610 12599 int r = 0; 12611 12600 12612 - vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 12601 + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 12602 + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 12603 + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 12604 + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 12613 12605 12614 12606 enter_guest_mode(vcpu); 12615 12607 12616 12608 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 12617 12609 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 12610 + if (kvm_mpx_supported() && 12611 + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 12612 + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 12618 12613 12619 12614 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 12620 12615 vmx_segment_cache_clear(vmx); ··· 12660 12643 * to L1 or delivered directly to L2 (e.g. In case L1 don't 12661 12644 * intercept EXTERNAL_INTERRUPT). 12662 12645 * 12663 - * Usually this would be handled by L0 requesting a 12664 - * IRQ/NMI window by setting VMCS accordingly. However, 12665 - * this setting was done on VMCS01 and now VMCS02 is active 12666 - * instead. Thus, we force L0 to perform pending event 12667 - * evaluation by requesting a KVM_REQ_EVENT. 12646 + * Usually this would be handled by the processor noticing an 12647 + * IRQ/NMI window request, or checking RVI during evaluation of 12648 + * pending virtual interrupts. However, this setting was done 12649 + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 12650 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 12668 12651 */ 12669 - if (vmcs01_cpu_exec_ctrl & 12670 - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { 12652 + if (unlikely(evaluate_pending_interrupts)) 12671 12653 kvm_make_request(KVM_REQ_EVENT, vcpu); 12672 - } 12673 12654 12674 12655 /* 12675 12656 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point

+1 -1

arch/x86/kvm/x86.c

··· 4698 4698 */ 4699 4699 switch (msrs_to_save[i]) { 4700 4700 case MSR_IA32_BNDCFGS: 4701 - if (!kvm_x86_ops->mpx_supported()) 4701 + if (!kvm_mpx_supported()) 4702 4702 continue; 4703 4703 break; 4704 4704 case MSR_TSC_AUX:

+1 -1

tools/kvm/kvm_stat/kvm_stat

··· 1325 1325 msg = '' 1326 1326 while True: 1327 1327 self.screen.erase() 1328 - self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' % 1328 + self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' % 1329 1329 DELAY_DEFAULT, curses.A_BOLD) 1330 1330 self.screen.addstr(4, 0, msg) 1331 1331 self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %

Configure Feed

Configure Feed