Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
"ARM:

- Plug race between enabling MTE and creating vcpus

- Fix off-by-one bug when checking whether an address range is RAM

x86:

- Fixes for the new MMU, especially a memory leak on hosts with <39
physical address bits

- Remove bogus EFER.NX checks on 32-bit non-PAE hosts

- WAITPKG fix"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock
KVM: x86/mmu: Don't step down in the TDP iterator when zapping all SPTEs
KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs
KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF
kvm: vmx: Sync all matching EPTPs when injecting nested EPT fault
KVM: x86: remove dead initialization
KVM: x86: Allow guest to set EFER.NX=1 on non-PAE 32-bit kernels
KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation
KVM: arm64: Fix race when enabling KVM_ARM_CAP_MTE
KVM: arm64: Fix off-by-one in range_is_memory

+118 -62
+4 -4
Documentation/virt/kvm/locking.rst
··· 25 25 26 26 - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock 27 27 28 - - kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is 29 - taken inside kvm->arch.mmu_lock, and cannot be taken without already 30 - holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise 31 - there's no need to take kvm->arch.tdp_mmu_pages_lock at all). 28 + - kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and 29 + kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and 30 + cannot be taken without already holding kvm->arch.mmu_lock (typically with 31 + ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). 32 32 33 33 Everything else is a leaf: no other lock is taken inside the critical 34 34 sections.
+8 -4
arch/arm64/kvm/arm.c
··· 94 94 kvm->arch.return_nisv_io_abort_to_user = true; 95 95 break; 96 96 case KVM_CAP_ARM_MTE: 97 - if (!system_supports_mte() || kvm->created_vcpus) 98 - return -EINVAL; 99 - r = 0; 100 - kvm->arch.mte_enabled = true; 97 + mutex_lock(&kvm->lock); 98 + if (!system_supports_mte() || kvm->created_vcpus) { 99 + r = -EINVAL; 100 + } else { 101 + r = 0; 102 + kvm->arch.mte_enabled = true; 103 + } 104 + mutex_unlock(&kvm->lock); 101 105 break; 102 106 default: 103 107 r = -EINVAL;
+1 -1
arch/arm64/kvm/hyp/nvhe/mem_protect.c
··· 193 193 { 194 194 struct kvm_mem_range r1, r2; 195 195 196 - if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2)) 196 + if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) 197 197 return false; 198 198 if (r1.start != r2.start) 199 199 return false;
+7
arch/x86/include/asm/kvm_host.h
··· 1038 1038 struct list_head lpage_disallowed_mmu_pages; 1039 1039 struct kvm_page_track_notifier_node mmu_sp_tracker; 1040 1040 struct kvm_page_track_notifier_head track_notifier_head; 1041 + /* 1042 + * Protects marking pages unsync during page faults, as TDP MMU page 1043 + * faults only take mmu_lock for read. For simplicity, the unsync 1044 + * pages lock is always taken when marking pages unsync regardless of 1045 + * whether mmu_lock is held for read or write. 1046 + */ 1047 + spinlock_t mmu_unsync_pages_lock; 1041 1048 1042 1049 struct list_head assigned_dev_head; 1043 1050 struct iommu_domain *iommu_domain;
+1 -27
arch/x86/kvm/cpuid.c
··· 208 208 kvm_mmu_after_set_cpuid(vcpu); 209 209 } 210 210 211 - static int is_efer_nx(void) 212 - { 213 - return host_efer & EFER_NX; 214 - } 215 - 216 - static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) 217 - { 218 - int i; 219 - struct kvm_cpuid_entry2 *e, *entry; 220 - 221 - entry = NULL; 222 - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 223 - e = &vcpu->arch.cpuid_entries[i]; 224 - if (e->function == 0x80000001) { 225 - entry = e; 226 - break; 227 - } 228 - } 229 - if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { 230 - cpuid_entry_clear(entry, X86_FEATURE_NX); 231 - printk(KERN_INFO "kvm: guest NX capability removed\n"); 232 - } 233 - } 234 - 235 211 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) 236 212 { 237 213 struct kvm_cpuid_entry2 *best; ··· 278 302 vcpu->arch.cpuid_entries = e2; 279 303 vcpu->arch.cpuid_nent = cpuid->nent; 280 304 281 - cpuid_fix_nx_cap(vcpu); 282 305 kvm_update_cpuid_runtime(vcpu); 283 306 kvm_vcpu_after_set_cpuid(vcpu); 284 307 ··· 376 401 377 402 void kvm_set_cpu_caps(void) 378 403 { 379 - unsigned int f_nx = is_efer_nx() ? F(NX) : 0; 380 404 #ifdef CONFIG_X86_64 381 405 unsigned int f_gbpages = F(GBPAGES); 382 406 unsigned int f_lm = F(LM); ··· 489 515 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | 490 516 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 491 517 F(PAT) | F(PSE36) | 0 /* Reserved */ | 492 - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 518 + F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 493 519 F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 494 520 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) 495 521 );
+1 -1
arch/x86/kvm/hyperv.c
··· 1933 1933 void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) 1934 1934 { 1935 1935 struct kvm_cpuid_entry2 *entry; 1936 - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 1936 + struct kvm_vcpu_hv *hv_vcpu; 1937 1937 1938 1938 entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); 1939 1939 if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
+28
arch/x86/kvm/mmu/mmu.c
··· 2535 2535 int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) 2536 2536 { 2537 2537 struct kvm_mmu_page *sp; 2538 + bool locked = false; 2538 2539 2539 2540 /* 2540 2541 * Force write-protection if the page is being tracked. Note, the page ··· 2558 2557 if (sp->unsync) 2559 2558 continue; 2560 2559 2560 + /* 2561 + * TDP MMU page faults require an additional spinlock as they 2562 + * run with mmu_lock held for read, not write, and the unsync 2563 + * logic is not thread safe. Take the spinlock regardless of 2564 + * the MMU type to avoid extra conditionals/parameters, there's 2565 + * no meaningful penalty if mmu_lock is held for write. 2566 + */ 2567 + if (!locked) { 2568 + locked = true; 2569 + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); 2570 + 2571 + /* 2572 + * Recheck after taking the spinlock, a different vCPU 2573 + * may have since marked the page unsync. A false 2574 + * positive on the unprotected check above is not 2575 + * possible as clearing sp->unsync _must_ hold mmu_lock 2576 + * for write, i.e. unsync cannot transition from 1->0 2577 + * while this CPU holds mmu_lock for read (or write). 2578 + */ 2579 + if (READ_ONCE(sp->unsync)) 2580 + continue; 2581 + } 2582 + 2561 2583 WARN_ON(sp->role.level != PG_LEVEL_4K); 2562 2584 kvm_unsync_page(vcpu, sp); 2563 2585 } 2586 + if (locked) 2587 + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); 2564 2588 2565 2589 /* 2566 2590 * We need to ensure that the marking of unsync pages is visible ··· 5562 5536 void kvm_mmu_init_vm(struct kvm *kvm) 5563 5537 { 5564 5538 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5539 + 5540 + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); 5565 5541 5566 5542 if (!kvm_mmu_init_tdp_mmu(kvm)) 5567 5543 /*
+24 -11
arch/x86/kvm/mmu/tdp_mmu.c
··· 43 43 if (!kvm->arch.tdp_mmu_enabled) 44 44 return; 45 45 46 + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); 46 47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); 47 48 48 49 /* ··· 82 81 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, 83 82 bool shared) 84 83 { 85 - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 86 - 87 84 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 88 85 89 86 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) ··· 93 94 list_del_rcu(&root->link); 94 95 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 95 96 96 - zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); 97 + zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); 97 98 98 99 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 99 100 } ··· 723 724 gfn_t start, gfn_t end, bool can_yield, bool flush, 724 725 bool shared) 725 726 { 727 + gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 728 + bool zap_all = (start == 0 && end >= max_gfn_host); 726 729 struct tdp_iter iter; 730 + 731 + /* 732 + * No need to try to step down in the iterator when zapping all SPTEs, 733 + * zapping the top-level non-leaf SPTEs will recurse on their children. 734 + */ 735 + int min_level = zap_all ? root->role.level : PG_LEVEL_4K; 736 + 737 + /* 738 + * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will 739 + * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, 740 + * and so KVM will never install a SPTE for such addresses. 
741 + */ 742 + end = min(end, max_gfn_host); 727 743 728 744 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 729 745 730 746 rcu_read_lock(); 731 747 732 - tdp_root_for_each_pte(iter, root, start, end) { 748 + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, 749 + min_level, start, end) { 733 750 retry: 734 751 if (can_yield && 735 752 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { ··· 759 744 /* 760 745 * If this is a non-last-level SPTE that covers a larger range 761 746 * than should be zapped, continue, and zap the mappings at a 762 - * lower level. 747 + * lower level, except when zapping all SPTEs. 763 748 */ 764 - if ((iter.gfn < start || 749 + if (!zap_all && 750 + (iter.gfn < start || 765 751 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && 766 752 !is_last_spte(iter.old_spte, iter.level)) 767 753 continue; ··· 810 794 811 795 void kvm_tdp_mmu_zap_all(struct kvm *kvm) 812 796 { 813 - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 814 797 bool flush = false; 815 798 int i; 816 799 817 800 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 818 - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, 801 + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, 819 802 flush, false); 820 803 821 804 if (flush) ··· 853 838 */ 854 839 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 855 840 { 856 - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); 857 841 struct kvm_mmu_page *next_root; 858 842 struct kvm_mmu_page *root; 859 843 bool flush = false; ··· 868 854 869 855 rcu_read_unlock(); 870 856 871 - flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, 872 - true); 857 + flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); 873 858 874 859 /* 875 860 * Put the reference acquired in
+43 -13
arch/x86/kvm/vmx/nested.c
··· 330 330 vcpu_put(vcpu); 331 331 } 332 332 333 + #define EPTP_PA_MASK GENMASK_ULL(51, 12) 334 + 335 + static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 336 + { 337 + return VALID_PAGE(root_hpa) && 338 + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 339 + } 340 + 341 + static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 342 + gpa_t addr) 343 + { 344 + uint i; 345 + struct kvm_mmu_root_info *cached_root; 346 + 347 + WARN_ON_ONCE(!mmu_is_nested(vcpu)); 348 + 349 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 350 + cached_root = &vcpu->arch.mmu->prev_roots[i]; 351 + 352 + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 353 + eptp)) 354 + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); 355 + } 356 + } 357 + 333 358 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 334 359 struct x86_exception *fault) 335 360 { ··· 367 342 vm_exit_reason = EXIT_REASON_PML_FULL; 368 343 vmx->nested.pml_full = false; 369 344 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 370 - } else if (fault->error_code & PFERR_RSVD_MASK) 371 - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 372 - else 373 - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 345 + } else { 346 + if (fault->error_code & PFERR_RSVD_MASK) 347 + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 348 + else 349 + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 350 + 351 + /* 352 + * Although the caller (kvm_inject_emulated_page_fault) would 353 + * have already synced the faulting address in the shadow EPT 354 + * tables for the current EPTP12, we also need to sync it for 355 + * any other cached EPTP02s based on the same EP4TA, since the 356 + * TLB associates mappings to the EP4TA rather than the full EPTP. 
357 + */ 358 + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 359 + fault->address); 360 + } 374 361 375 362 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 376 363 vmcs12->guest_physical_address = fault->address; ··· 5362 5325 return nested_vmx_succeed(vcpu); 5363 5326 } 5364 5327 5365 - #define EPTP_PA_MASK GENMASK_ULL(51, 12) 5366 - 5367 - static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 5368 - { 5369 - return VALID_PAGE(root_hpa) && 5370 - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 5371 - } 5372 - 5373 5328 /* Emulate the INVEPT instruction */ 5374 5329 static int handle_invept(struct kvm_vcpu *vcpu) 5375 5330 { ··· 5855 5826 if (is_nmi(intr_info)) 5856 5827 return true; 5857 5828 else if (is_page_fault(intr_info)) 5858 - return vcpu->arch.apf.host_apf_flags || !enable_ept; 5829 + return vcpu->arch.apf.host_apf_flags || 5830 + vmx_need_pf_intercept(vcpu); 5859 5831 else if (is_debug(intr_info) && 5860 5832 vcpu->guest_debug & 5861 5833 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+1 -1
arch/x86/kvm/vmx/vmx.h
··· 522 522 523 523 static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) 524 524 { 525 - return vmx->secondary_exec_control & 525 + return secondary_exec_controls_get(vmx) & 526 526 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 527 527 } 528 528