Merge tag 'kvm-x86-mmu-6.17' of https://github.com/kvm-x86/linux into HEAD

+3 -3

arch/x86/include/asm/kvm_host.h

··· 1358 1358 bool has_private_mem; 1359 1359 bool has_protected_state; 1360 1360 bool pre_fault_allowed; 1361 - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 1361 + struct hlist_head *mmu_page_hash; 1362 1362 struct list_head active_mmu_pages; 1363 1363 /* 1364 1364 * A list of kvm_mmu_page structs that, if zapped, could possibly be ··· 1985 1985 #define __KVM_HAVE_ARCH_VM_ALLOC 1986 1986 static inline struct kvm *kvm_arch_alloc_vm(void) 1987 1987 { 1988 - return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 1988 + return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT); 1989 1989 } 1990 1990 1991 1991 #define __KVM_HAVE_ARCH_VM_FREE ··· 2030 2030 2031 2031 void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 2032 2032 int kvm_mmu_create(struct kvm_vcpu *vcpu); 2033 - void kvm_mmu_init_vm(struct kvm *kvm); 2033 + int kvm_mmu_init_vm(struct kvm *kvm); 2034 2034 void kvm_mmu_uninit_vm(struct kvm *kvm); 2035 2035 2036 2036 void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,

+69 -6

arch/x86/kvm/mmu/mmu.c

··· 1983 1983 return true; 1984 1984 } 1985 1985 1986 + static __ro_after_init HLIST_HEAD(empty_page_hash); 1987 + 1988 + static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn) 1989 + { 1990 + /* 1991 + * Ensure the load of the hash table pointer itself is ordered before 1992 + * loads to walk the table. The pointer is set at runtime outside of 1993 + * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of 1994 + * shadow pages becomes necessary only when KVM needs to shadow L1's 1995 + * TDP for an L2 guest. Pairs with the smp_store_release() in 1996 + * kvm_mmu_alloc_page_hash(). 1997 + */ 1998 + struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash); 1999 + 2000 + lockdep_assert_held(&kvm->mmu_lock); 2001 + 2002 + if (!page_hash) 2003 + return &empty_page_hash; 2004 + 2005 + return &page_hash[kvm_page_table_hashfn(gfn)]; 2006 + } 2007 + 1986 2008 #define for_each_valid_sp(_kvm, _sp, _list) \ 1987 2009 hlist_for_each_entry(_sp, _list, hash_link) \ 1988 2010 if (is_obsolete_sp((_kvm), (_sp))) { \ 1989 2011 } else 1990 2012 1991 2013 #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ 1992 - for_each_valid_sp(_kvm, _sp, \ 1993 - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ 2014 + for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \ 1994 2015 if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else 1995 2016 1996 2017 static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) ··· 2379 2358 struct kvm_mmu_page *sp; 2380 2359 bool created = false; 2381 2360 2361 + /* 2362 + * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as 2363 + * mmu_page_hash must be set prior to creating the first shadow root, 2364 + * i.e. reaching this point is fully serialized by slots_arch_lock. 2365 + */ 2366 + BUG_ON(!kvm->arch.mmu_page_hash); 2382 2367 sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; 2383 2368 2384 2369 sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); ··· 3909 3882 return r; 3910 3883 } 3911 3884 3885 + static int kvm_mmu_alloc_page_hash(struct kvm *kvm) 3886 + { 3887 + struct hlist_head *h; 3888 + 3889 + if (kvm->arch.mmu_page_hash) 3890 + return 0; 3891 + 3892 + h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT); 3893 + if (!h) 3894 + return -ENOMEM; 3895 + 3896 + /* 3897 + * Ensure the hash table pointer is set only after all stores to zero 3898 + * the memory are retired. Pairs with the smp_load_acquire() in 3899 + * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to 3900 + * add (or remove) shadow pages, and so readers are guaranteed to see 3901 + * an empty list for their current mmu_lock critical section. 3902 + */ 3903 + smp_store_release(&kvm->arch.mmu_page_hash, h); 3904 + return 0; 3905 + } 3906 + 3912 3907 static int mmu_first_shadow_root_alloc(struct kvm *kvm) 3913 3908 { 3914 3909 struct kvm_memslots *slots; ··· 3950 3901 if (kvm_shadow_root_allocated(kvm)) 3951 3902 goto out_unlock; 3952 3903 3904 + r = kvm_mmu_alloc_page_hash(kvm); 3905 + if (r) 3906 + goto out_unlock; 3907 + 3953 3908 /* 3954 - * Check if anything actually needs to be allocated, e.g. all metadata 3955 - * will be allocated upfront if TDP is disabled. 3909 + * Check if memslot metadata actually needs to be allocated, e.g. all 3910 + * metadata will be allocated upfront if TDP is disabled. 3956 3911 */ 3957 3912 if (kvm_memslots_have_rmaps(kvm) && 3958 3913 kvm_page_track_write_tracking_enabled(kvm)) ··· 6735 6682 kvm_tdp_mmu_zap_invalidated_roots(kvm, true); 6736 6683 } 6737 6684 6738 - void kvm_mmu_init_vm(struct kvm *kvm) 6685 + int kvm_mmu_init_vm(struct kvm *kvm) 6739 6686 { 6687 + int r; 6688 + 6740 6689 kvm->arch.shadow_mmio_value = shadow_mmio_value; 6741 6690 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6742 6691 INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); 6743 6692 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); 6744 6693 6745 - if (tdp_mmu_enabled) 6694 + if (tdp_mmu_enabled) { 6746 6695 kvm_mmu_init_tdp_mmu(kvm); 6696 + } else { 6697 + r = kvm_mmu_alloc_page_hash(kvm); 6698 + if (r) 6699 + return r; 6700 + } 6747 6701 6748 6702 kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; 6749 6703 kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; ··· 6759 6699 6760 6700 kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; 6761 6701 kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; 6702 + return 0; 6762 6703 } 6763 6704 6764 6705 static void mmu_free_vm_memory_caches(struct kvm *kvm) ··· 6771 6710 6772 6711 void kvm_mmu_uninit_vm(struct kvm *kvm) 6773 6712 { 6713 + kvfree(kvm->arch.mmu_page_hash); 6714 + 6774 6715 if (tdp_mmu_enabled) 6775 6716 kvm_mmu_uninit_tdp_mmu(kvm); 6776 6717

+6 -2

arch/x86/kvm/mmu/paging_tmpl.h

··· 804 804 if (r != RET_PF_CONTINUE) 805 805 return r; 806 806 807 + #if PTTYPE != PTTYPE_EPT 807 808 /* 808 - * Do not change pte_access if the pfn is a mmio page, otherwise 809 - * we will cache the incorrect access into mmio spte. 809 + * Treat the guest PTE protections as writable, supervisor-only if this 810 + * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore 811 + * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO, 812 + * otherwise KVM will cache incorrect access information in the SPTE. 810 813 */ 811 814 if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && 812 815 !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { ··· 825 822 if (is_cr4_smep(vcpu->arch.mmu)) 826 823 walker.pte_access &= ~ACC_EXEC_MASK; 827 824 } 825 + #endif 828 826 829 827 r = RET_PF_RETRY; 830 828 write_lock(&vcpu->kvm->mmu_lock);

+2

arch/x86/kvm/svm/svm.c

··· 5494 5494 { 5495 5495 int r; 5496 5496 5497 + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); 5498 + 5497 5499 __unused_size_checks(); 5498 5500 5499 5501 if (!kvm_is_svm_supported())

+2 -34

arch/x86/kvm/vmx/main.c

··· 29 29 if (ret) 30 30 return ret; 31 31 32 - /* 33 - * Update vt_x86_ops::vm_size here so it is ready before 34 - * kvm_ops_update() is called in kvm_x86_vendor_init(). 35 - * 36 - * Note, the actual bringing up of TDX must be done after 37 - * kvm_ops_update() because enabling TDX requires enabling 38 - * hardware virtualization first, i.e., all online CPUs must 39 - * be in post-VMXON state. This means the @vm_size here 40 - * may be updated to TDX's size but TDX may fail to enable 41 - * at later time. 42 - * 43 - * The VMX/VT code could update kvm_x86_ops::vm_size again 44 - * after bringing up TDX, but this would require exporting 45 - * either kvm_x86_ops or kvm_ops_update() from the base KVM 46 - * module, which looks overkill. Anyway, the worst case here 47 - * is KVM may allocate couple of more bytes than needed for 48 - * each VM. 49 - */ 50 - if (enable_tdx) { 51 - vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, 52 - sizeof(struct kvm_tdx)); 53 - /* 54 - * Note, TDX may fail to initialize in a later time in 55 - * vt_init(), in which case it is not necessary to setup 56 - * those callbacks. But making them valid here even 57 - * when TDX fails to init later is fine because those 58 - * callbacks won't be called if the VM isn't TDX guest. 59 - */ 60 - vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; 61 - vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; 62 - vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; 63 - vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; 64 - vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; 65 - } 32 + if (enable_tdx) 33 + tdx_hardware_setup(); 66 34 67 35 return 0; 68 36 }

+34 -13

arch/x86/kvm/vmx/tdx.c

··· 743 743 !to_tdx(vcpu)->vp_enter_args.r12; 744 744 } 745 745 746 - bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 746 + static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) 747 747 { 748 748 u64 vcpu_state_details; 749 749 ··· 1638 1638 return 0; 1639 1639 } 1640 1640 1641 - int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1642 - enum pg_level level, kvm_pfn_t pfn) 1641 + static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1642 + enum pg_level level, kvm_pfn_t pfn) 1643 1643 { 1644 1644 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1645 1645 struct page *page = pfn_to_page(pfn); ··· 1719 1719 return 0; 1720 1720 } 1721 1721 1722 - int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 1723 - enum pg_level level, void *private_spt) 1722 + static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 1723 + enum pg_level level, void *private_spt) 1724 1724 { 1725 1725 int tdx_level = pg_level_to_tdx_sept_level(level); 1726 1726 gpa_t gpa = gfn_to_gpa(gfn); ··· 1855 1855 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1856 1856 } 1857 1857 1858 - int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1859 - enum pg_level level, void *private_spt) 1858 + static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 1859 + enum pg_level level, void *private_spt) 1860 1860 { 1861 1861 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1862 1862 ··· 1878 1878 return tdx_reclaim_page(virt_to_page(private_spt)); 1879 1879 } 1880 1880 1881 - int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1882 - enum pg_level level, kvm_pfn_t pfn) 1881 + static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1882 + enum pg_level level, kvm_pfn_t pfn) 1883 1883 { 1884 1884 struct page *page = pfn_to_page(pfn); 1885 1885 int ret; ··· 3603 3603 r = __tdx_bringup(); 3604 3604 if (r) { 3605 3605 /* 3606 - * Disable TDX only but don't fail to load module if 3607 - * the TDX module could not be loaded. No need to print 3608 - * message saying "module is not loaded" because it was 3609 - * printed when the first SEAMCALL failed. 3606 + * Disable TDX only but don't fail to load module if the TDX 3607 + * module could not be loaded. No need to print message saying 3608 + * "module is not loaded" because it was printed when the first 3609 + * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or 3610 + * vm_size, as kvm_x86_ops have already been finalized (and are 3611 + * intentionally not exported). The S-EPT code is unreachable, 3612 + * and allocating a few more bytes per VM in a should-be-rare 3613 + * failure scenario is a non-issue. 3610 3614 */ 3611 3615 if (r == -ENODEV) 3612 3616 goto success_disable_tdx; ··· 3623 3619 success_disable_tdx: 3624 3620 enable_tdx = 0; 3625 3621 return 0; 3622 + } 3623 + 3624 + void __init tdx_hardware_setup(void) 3625 + { 3626 + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx); 3627 + 3628 + /* 3629 + * Note, if the TDX module can't be loaded, KVM TDX support will be 3630 + * disabled but KVM will continue loading (see tdx_bringup()). 3631 + */ 3632 + vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); 3633 + 3634 + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; 3635 + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; 3636 + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; 3637 + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; 3638 + vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; 3626 3639 }

+1

arch/x86/kvm/vmx/tdx.h

··· 8 8 #ifdef CONFIG_KVM_INTEL_TDX 9 9 #include "common.h" 10 10 11 + void tdx_hardware_setup(void); 11 12 int tdx_bringup(void); 12 13 void tdx_cleanup(void); 13 14

+2

arch/x86/kvm/vmx/vmx.c

··· 8552 8552 { 8553 8553 int r, cpu; 8554 8554 8555 + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); 8556 + 8555 8557 if (!kvm_is_vmx_supported()) 8556 8558 return -EOPNOTSUPP; 8557 8559

-10

arch/x86/kvm/vmx/x86_ops.h

··· 136 136 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags); 137 137 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 138 138 void tdx_vcpu_put(struct kvm_vcpu *vcpu); 139 - bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu); 140 139 int tdx_handle_exit(struct kvm_vcpu *vcpu, 141 140 enum exit_fastpath_completion fastpath); 142 141 ··· 149 150 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 150 151 151 152 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); 152 - 153 - int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, 154 - enum pg_level level, void *private_spt); 155 - int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, 156 - enum pg_level level, void *private_spt); 157 - int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 158 - enum pg_level level, kvm_pfn_t pfn); 159 - int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 160 - enum pg_level level, kvm_pfn_t pfn); 161 153 162 154 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); 163 155 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);

+4 -1

arch/x86/kvm/x86.c

··· 12699 12699 if (ret) 12700 12700 goto out; 12701 12701 12702 - kvm_mmu_init_vm(kvm); 12702 + ret = kvm_mmu_init_vm(kvm); 12703 + if (ret) 12704 + goto out_cleanup_page_track; 12703 12705 12704 12706 ret = kvm_x86_call(vm_init)(kvm); 12705 12707 if (ret) ··· 12747 12745 12748 12746 out_uninit_mmu: 12749 12747 kvm_mmu_uninit_vm(kvm); 12748 + out_cleanup_page_track: 12750 12749 kvm_page_track_cleanup(kvm); 12751 12750 out: 12752 12751 return ret;

+22

arch/x86/kvm/x86.h

··· 55 55 56 56 void kvm_spurious_fault(void); 57 57 58 + #define SIZE_OF_MEMSLOTS_HASHTABLE \ 59 + (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES) 60 + 61 + /* Sanity check the size of the memslot hash tables. */ 62 + static_assert(SIZE_OF_MEMSLOTS_HASHTABLE == 63 + (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM)))); 64 + 65 + /* 66 + * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation. 67 + * Spilling over to an order-2 allocation isn't fundamentally problematic, but 68 + * isn't expected to happen in the foreseeable future (O(years)). Assert that 69 + * the size is an order-0 allocation when ignoring the memslot hash tables, to 70 + * help detect and debug unexpected size increases. 71 + */ 72 + #define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \ 73 + do { \ 74 + BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \ 75 + !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \ 76 + BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \ 77 + !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \ 78 + } while (0) 79 + 58 80 #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ 59 81 ({ \ 60 82 bool failed = (consistency_check); \

Configure Feed

Configure Feed