Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kvm-x86-tdx-6.19' of https://github.com/kvm-x86/linux into HEAD

KVM TDX changes for 6.19:

- Overhaul the TDX code to address systemic races where KVM (acting on behalf
of userspace) could inadvertently trigger lock contention in the TDX-Module,
which KVM was either working around in weird, ugly ways, or was simply
oblivious to (as proven by Yan tripping several KVM_BUG_ON()s with clever
selftests).

- Fix a bug where KVM could corrupt a vCPU's cpu_list when freeing a vCPU if
creating said vCPU failed partway through.

- Fix a few sparse warnings (bad annotation, 0 != NULL).

- Use struct_size() to simplify copying capabilities to userspace.

+493 -446
+6
arch/arm64/kvm/arm.c
··· 1835 1835 return r; 1836 1836 } 1837 1837 1838 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 1839 + unsigned long arg) 1840 + { 1841 + return -ENOIOCTLCMD; 1842 + } 1843 + 1838 1844 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 1839 1845 { 1840 1846
-1
arch/loongarch/kvm/Kconfig
··· 25 25 select HAVE_KVM_IRQCHIP 26 26 select HAVE_KVM_MSI 27 27 select HAVE_KVM_READONLY_MEM 28 - select HAVE_KVM_VCPU_ASYNC_IOCTL 29 28 select KVM_COMMON 30 29 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 31 30 select KVM_GENERIC_HARDWARE_ENABLING
+2 -2
arch/loongarch/kvm/vcpu.c
··· 1473 1473 return 0; 1474 1474 } 1475 1475 1476 - long kvm_arch_vcpu_async_ioctl(struct file *filp, 1477 - unsigned int ioctl, unsigned long arg) 1476 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 1477 + unsigned long arg) 1478 1478 { 1479 1479 void __user *argp = (void __user *)arg; 1480 1480 struct kvm_vcpu *vcpu = filp->private_data;
-1
arch/mips/kvm/Kconfig
··· 22 22 select EXPORT_UASM 23 23 select KVM_COMMON 24 24 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 25 - select HAVE_KVM_VCPU_ASYNC_IOCTL 26 25 select KVM_MMIO 27 26 select KVM_GENERIC_MMU_NOTIFIER 28 27 select KVM_GENERIC_HARDWARE_ENABLING
+2 -2
arch/mips/kvm/mips.c
··· 895 895 return r; 896 896 } 897 897 898 - long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, 899 - unsigned long arg) 898 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 899 + unsigned long arg) 900 900 { 901 901 struct kvm_vcpu *vcpu = filp->private_data; 902 902 void __user *argp = (void __user *)arg;
-1
arch/powerpc/kvm/Kconfig
··· 20 20 config KVM 21 21 bool 22 22 select KVM_COMMON 23 - select HAVE_KVM_VCPU_ASYNC_IOCTL 24 23 select KVM_VFIO 25 24 select HAVE_KVM_IRQ_BYPASS 26 25
+2 -2
arch/powerpc/kvm/powerpc.c
··· 2028 2028 return -EINVAL; 2029 2029 } 2030 2030 2031 - long kvm_arch_vcpu_async_ioctl(struct file *filp, 2032 - unsigned int ioctl, unsigned long arg) 2031 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 2032 + unsigned long arg) 2033 2033 { 2034 2034 struct kvm_vcpu *vcpu = filp->private_data; 2035 2035 void __user *argp = (void __user *)arg;
-1
arch/riscv/kvm/Kconfig
··· 23 23 select HAVE_KVM_IRQCHIP 24 24 select HAVE_KVM_IRQ_ROUTING 25 25 select HAVE_KVM_MSI 26 - select HAVE_KVM_VCPU_ASYNC_IOCTL 27 26 select HAVE_KVM_READONLY_MEM 28 27 select HAVE_KVM_DIRTY_RING_ACQ_REL 29 28 select KVM_COMMON
+2 -2
arch/riscv/kvm/vcpu.c
··· 238 238 return VM_FAULT_SIGBUS; 239 239 } 240 240 241 - long kvm_arch_vcpu_async_ioctl(struct file *filp, 242 - unsigned int ioctl, unsigned long arg) 241 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 242 + unsigned long arg) 243 243 { 244 244 struct kvm_vcpu *vcpu = filp->private_data; 245 245 void __user *argp = (void __user *)arg;
-1
arch/s390/kvm/Kconfig
··· 20 20 def_tristate y 21 21 prompt "Kernel-based Virtual Machine (KVM) support" 22 22 select HAVE_KVM_CPU_RELAX_INTERCEPT 23 - select HAVE_KVM_VCPU_ASYNC_IOCTL 24 23 select KVM_ASYNC_PF 25 24 select KVM_ASYNC_PF_SYNC 26 25 select KVM_COMMON
+2 -2
arch/s390/kvm/kvm-s390.c
··· 5730 5730 return r; 5731 5731 } 5732 5732 5733 - long kvm_arch_vcpu_async_ioctl(struct file *filp, 5734 - unsigned int ioctl, unsigned long arg) 5733 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 5734 + unsigned long arg) 5735 5735 { 5736 5736 struct kvm_vcpu *vcpu = filp->private_data; 5737 5737 void __user *argp = (void __user *)arg;
+1
arch/x86/include/asm/kvm-x86-ops.h
··· 128 128 KVM_X86_OP_OPTIONAL(dev_get_attr) 129 129 KVM_X86_OP_OPTIONAL(mem_enc_ioctl) 130 130 KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) 131 + KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl) 131 132 KVM_X86_OP_OPTIONAL(mem_enc_register_region) 132 133 KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) 133 134 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
+4 -3
arch/x86/include/asm/kvm_host.h
··· 1843 1843 void *external_spt); 1844 1844 /* Update the external page table from spte getting set. */ 1845 1845 int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1846 - kvm_pfn_t pfn_for_gfn); 1846 + u64 mirror_spte); 1847 1847 1848 1848 /* Update external page tables for page table about to be freed. */ 1849 1849 int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1850 1850 void *external_spt); 1851 1851 1852 1852 /* Update external page table from spte getting removed, and flush TLB. */ 1853 - int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1854 - kvm_pfn_t pfn_for_gfn); 1853 + void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1854 + u64 mirror_spte); 1855 1855 1856 1856 bool (*has_wbinvd_exit)(void); 1857 1857 ··· 1909 1909 int (*dev_get_attr)(u32 group, u64 attr, u64 *val); 1910 1910 int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); 1911 1911 int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); 1912 + int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); 1912 1913 int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); 1913 1914 int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); 1914 1915 int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+1 -2
arch/x86/kvm/mmu.h
··· 255 255 #define tdp_mmu_enabled false 256 256 #endif 257 257 258 - bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); 259 - int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); 258 + int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn); 260 259 261 260 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) 262 261 {
+83 -4
arch/x86/kvm/mmu/mmu.c
··· 4924 4924 return direct_page_fault(vcpu, fault); 4925 4925 } 4926 4926 4927 - int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) 4927 + static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa, 4928 + u64 error_code, u8 *level) 4928 4929 { 4929 4930 int r; 4930 4931 ··· 4967 4966 return -EIO; 4968 4967 } 4969 4968 } 4970 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); 4971 4969 4972 4970 long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, 4973 4971 struct kvm_pre_fault_memory *range) ··· 5002 5002 * Shadow paging uses GVA for kvm page fault, so restrict to 5003 5003 * two-dimensional paging. 5004 5004 */ 5005 - r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); 5005 + r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level); 5006 5006 if (r < 0) 5007 5007 return r; 5008 5008 ··· 5013 5013 end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); 5014 5014 return min(range->size, end - range->gpa); 5015 5015 } 5016 + 5017 + #ifdef CONFIG_KVM_GUEST_MEMFD 5018 + static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot) 5019 + { 5020 + #ifdef CONFIG_PROVE_LOCKING 5021 + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) || 5022 + WARN_ON_ONCE(!slot->gmem.file) || 5023 + WARN_ON_ONCE(!file_count(slot->gmem.file))) 5024 + return; 5025 + 5026 + lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock); 5027 + #endif 5028 + } 5029 + 5030 + int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) 5031 + { 5032 + struct kvm_page_fault fault = { 5033 + .addr = gfn_to_gpa(gfn), 5034 + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS, 5035 + .prefetch = true, 5036 + .is_tdp = true, 5037 + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm), 5038 + 5039 + .max_level = PG_LEVEL_4K, 5040 + .req_level = PG_LEVEL_4K, 5041 + .goal_level = PG_LEVEL_4K, 5042 + .is_private = true, 5043 + 5044 + 
.gfn = gfn, 5045 + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn), 5046 + .pfn = pfn, 5047 + .map_writable = true, 5048 + }; 5049 + struct kvm *kvm = vcpu->kvm; 5050 + int r; 5051 + 5052 + lockdep_assert_held(&kvm->slots_lock); 5053 + 5054 + /* 5055 + * Mapping a pre-determined private pfn is intended only for use when 5056 + * populating a guest_memfd instance. Assert that the slot is backed 5057 + * by guest_memfd and that the gmem instance's invalidate_lock is held. 5058 + */ 5059 + kvm_assert_gmem_invalidate_lock_held(fault.slot); 5060 + 5061 + if (KVM_BUG_ON(!tdp_mmu_enabled, kvm)) 5062 + return -EIO; 5063 + 5064 + if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn)) 5065 + return -EPERM; 5066 + 5067 + r = kvm_mmu_reload(vcpu); 5068 + if (r) 5069 + return r; 5070 + 5071 + r = mmu_topup_memory_caches(vcpu, false); 5072 + if (r) 5073 + return r; 5074 + 5075 + do { 5076 + if (signal_pending(current)) 5077 + return -EINTR; 5078 + 5079 + if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) 5080 + return -EIO; 5081 + 5082 + cond_resched(); 5083 + 5084 + guard(read_lock)(&kvm->mmu_lock); 5085 + 5086 + r = kvm_tdp_mmu_map(vcpu, &fault); 5087 + } while (r == RET_PF_RETRY); 5088 + 5089 + if (r != RET_PF_FIXED) 5090 + return -EIO; 5091 + 5092 + return 0; 5093 + } 5094 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn); 5095 + #endif 5016 5096 5017 5097 static void nonpaging_init_context(struct kvm_mmu *context) 5018 5098 { ··· 6077 5997 out: 6078 5998 return r; 6079 5999 } 6080 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); 6081 6000 6082 6001 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 6083 6002 {
+10 -40
arch/x86/kvm/mmu/tdp_mmu.c
··· 362 362 static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, 363 363 int level) 364 364 { 365 - kvm_pfn_t old_pfn = spte_to_pfn(old_spte); 366 - int ret; 367 - 368 365 /* 369 366 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external 370 367 * PTs are removed in a special order, involving free_external_spt(). ··· 374 377 375 378 /* Zapping leaf spte is allowed only when write lock is held. */ 376 379 lockdep_assert_held_write(&kvm->mmu_lock); 377 - /* Because write lock is held, operation should success. */ 378 - ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); 379 - KVM_BUG_ON(ret, kvm); 380 + 381 + kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte); 380 382 } 381 383 382 384 /** ··· 515 519 bool was_present = is_shadow_present_pte(old_spte); 516 520 bool is_present = is_shadow_present_pte(new_spte); 517 521 bool is_leaf = is_present && is_last_spte(new_spte, level); 518 - kvm_pfn_t new_pfn = spte_to_pfn(new_spte); 519 522 int ret = 0; 520 523 521 524 KVM_BUG_ON(was_present, kvm); ··· 533 538 * external page table, or leaf. 534 539 */ 535 540 if (is_leaf) { 536 - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); 541 + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte); 537 542 } else { 538 543 void *external_spt = get_external_spt(gfn, new_spte, level); 539 544 ··· 1268 1273 struct kvm_mmu_page *sp; 1269 1274 int ret = RET_PF_RETRY; 1270 1275 1276 + KVM_MMU_WARN_ON(!root || root->role.invalid); 1277 + 1271 1278 kvm_mmu_hugepage_adjust(vcpu, fault); 1272 1279 1273 1280 trace_kvm_mmu_spte_requested(fault); ··· 1936 1939 * 1937 1940 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
1938 1941 */ 1939 - static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1940 - struct kvm_mmu_page *root) 1942 + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1943 + int *root_level) 1941 1944 { 1945 + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); 1942 1946 struct tdp_iter iter; 1943 1947 gfn_t gfn = addr >> PAGE_SHIFT; 1944 1948 int leaf = -1; 1949 + 1950 + *root_level = vcpu->arch.mmu->root_role.level; 1945 1951 1946 1952 for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { 1947 1953 leaf = iter.level; ··· 1953 1953 1954 1954 return leaf; 1955 1955 } 1956 - 1957 - int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1958 - int *root_level) 1959 - { 1960 - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); 1961 - *root_level = vcpu->arch.mmu->root_role.level; 1962 - 1963 - return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); 1964 - } 1965 - 1966 - bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) 1967 - { 1968 - struct kvm *kvm = vcpu->kvm; 1969 - bool is_direct = kvm_is_addr_direct(kvm, gpa); 1970 - hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : 1971 - vcpu->arch.mmu->mirror_root_hpa; 1972 - u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; 1973 - int leaf; 1974 - 1975 - lockdep_assert_held(&kvm->mmu_lock); 1976 - rcu_read_lock(); 1977 - leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); 1978 - rcu_read_unlock(); 1979 - if (leaf < 0) 1980 - return false; 1981 - 1982 - spte = sptes[leaf]; 1983 - return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); 1984 - } 1985 - EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); 1986 1956 1987 1957 /* 1988 1958 * Returns the last level spte pointer of the shadow page walk for the given
+9
arch/x86/kvm/vmx/main.c
··· 831 831 return tdx_vcpu_ioctl(vcpu, argp); 832 832 } 833 833 834 + static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 835 + { 836 + if (!is_td_vcpu(vcpu)) 837 + return -EINVAL; 838 + 839 + return tdx_vcpu_unlocked_ioctl(vcpu, argp); 840 + } 841 + 834 842 static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, 835 843 bool is_private) 836 844 { ··· 1013 1005 1014 1006 .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), 1015 1007 .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), 1008 + .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl), 1016 1009 1017 1010 .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) 1018 1011 };
+344 -362
arch/x86/kvm/vmx/tdx.c
··· 24 24 #undef pr_fmt 25 25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 26 27 - #define pr_tdx_error(__fn, __err) \ 28 - pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) 27 + #define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ 28 + ({ \ 29 + struct kvm *_kvm = (__kvm); \ 30 + bool __ret = !!(__err); \ 31 + \ 32 + if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ 33 + if (_kvm) \ 34 + kvm_vm_bugged(_kvm); \ 35 + pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ 36 + __err, __args); \ 37 + } \ 38 + unlikely(__ret); \ 39 + }) 29 40 30 - #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ 31 - pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) 41 + #define TDX_BUG_ON(__err, __fn, __kvm) \ 42 + __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") 32 43 33 - #define pr_tdx_error_1(__fn, __err, __rcx) \ 34 - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) 44 + #define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \ 45 + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1) 35 46 36 - #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ 37 - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) 47 + #define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \ 48 + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2) 38 49 39 - #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ 40 - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) 50 + #define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \ 51 + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \ 52 + a1, a2, a3) 53 + 41 54 42 55 bool enable_tdx __ro_after_init; 43 56 module_param_named(tdx, enable_tdx, bool, 0444); ··· 294 281 vcpu->cpu = -1; 295 282 } 296 283 297 - static void tdx_no_vcpus_enter_start(struct kvm *kvm) 298 - { 299 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 300 - 301 - 
lockdep_assert_held_write(&kvm->mmu_lock); 302 - 303 - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); 304 - 305 - kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 306 - } 307 - 308 - static void tdx_no_vcpus_enter_stop(struct kvm *kvm) 309 - { 310 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 311 - 312 - lockdep_assert_held_write(&kvm->mmu_lock); 313 - 314 - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); 315 - } 284 + /* 285 + * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single 286 + * retry (if necessary) after forcing vCPUs to exit and wait for the operation 287 + * to complete. All flows that remove/block S-EPT entries run with mmu_lock 288 + * held for write, i.e. are mutually exclusive with each other, but they aren't 289 + * mutually exclusive with running vCPUs, and so can fail with "operand busy" 290 + * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL. 291 + * 292 + * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs. 293 + */ 294 + #define tdh_do_no_vcpus(tdh_func, kvm, args...) \ 295 + ({ \ 296 + struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \ 297 + u64 __err; \ 298 + \ 299 + lockdep_assert_held_write(&kvm->mmu_lock); \ 300 + \ 301 + __err = tdh_func(args); \ 302 + if (unlikely(tdx_operand_busy(__err))) { \ 303 + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \ 304 + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \ 305 + \ 306 + __err = tdh_func(args); \ 307 + \ 308 + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \ 309 + } \ 310 + __err; \ 311 + }) 316 312 317 313 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ 318 314 static int __tdx_reclaim_page(struct page *page) ··· 335 313 * before the HKID is released and control pages have also been 336 314 * released at this point, so there is no possibility of contention. 
337 315 */ 338 - if (WARN_ON_ONCE(err)) { 339 - pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); 316 + if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL)) 340 317 return -EIO; 341 - } 318 + 342 319 return 0; 343 320 } 344 321 ··· 425 404 return; 426 405 427 406 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); 428 - if (KVM_BUG_ON(arg.err, vcpu->kvm)) 429 - pr_tdx_error(TDH_VP_FLUSH, arg.err); 407 + 408 + TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm); 430 409 } 431 410 432 411 void tdx_disable_virtualization_cpu(void) ··· 485 464 } 486 465 487 466 out: 488 - if (WARN_ON_ONCE(err)) 489 - pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); 467 + TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL); 490 468 } 491 469 492 470 void tdx_mmu_release_hkid(struct kvm *kvm) ··· 524 504 err = tdh_mng_vpflushdone(&kvm_tdx->td); 525 505 if (err == TDX_FLUSHVP_NOT_DONE) 526 506 goto out; 527 - if (KVM_BUG_ON(err, kvm)) { 528 - pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); 507 + if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) { 529 508 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", 530 509 kvm_tdx->hkid); 531 510 goto out; ··· 547 528 * tdh_mng_key_freeid() will fail. 548 529 */ 549 530 err = tdh_mng_key_freeid(&kvm_tdx->td); 550 - if (KVM_BUG_ON(err, kvm)) { 551 - pr_tdx_error(TDH_MNG_KEY_FREEID, err); 531 + if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) { 552 532 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", 553 533 kvm_tdx->hkid); 554 534 } else { ··· 598 580 * when it is reclaiming TDCS). 
599 581 */ 600 582 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); 601 - if (KVM_BUG_ON(err, kvm)) { 602 - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 583 + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) 603 584 return; 604 - } 585 + 605 586 tdx_quirk_reset_page(kvm_tdx->td.tdr_page); 606 587 607 588 __free_page(kvm_tdx->td.tdr_page); ··· 623 606 624 607 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ 625 608 err = tdh_mng_key_config(&kvm_tdx->td); 626 - 627 - if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { 628 - pr_tdx_error(TDH_MNG_KEY_CONFIG, err); 609 + if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm)) 629 610 return -EIO; 630 - } 631 611 632 612 return 0; 633 613 } ··· 838 824 tdx_prepare_switch_to_host(vcpu); 839 825 } 840 826 827 + /* 828 + * Life cycles for a TD and a vCPU: 829 + * 1. KVM_CREATE_VM ioctl. 830 + * TD state is TD_STATE_UNINITIALIZED. 831 + * hkid is not assigned at this stage. 832 + * 2. KVM_TDX_INIT_VM ioctl. 833 + * TD transitions to TD_STATE_INITIALIZED. 834 + * hkid is assigned after this stage. 835 + * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED). 836 + * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED. 837 + * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create(). 838 + * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create() 839 + * kvm_arch_vcpu_destroy() --> tdx_vcpu_free(). 840 + * 4. KVM_TDX_INIT_VCPU ioctl. 841 + * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED. 842 + * vCPU control structures are allocated at this stage. 843 + * 5. kvm_destroy_vm(). 844 + * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs. 845 + * (2) puts hkid to !assigned state. 846 + * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free(): 847 + * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state. 848 + * 5.3 tdx_vm_destroy() 849 + * transitions TD to TD_STATE_UNINITIALIZED state. 
850 + * 851 + * tdx_vcpu_free() can be invoked only at 3.3 or 5.2. 852 + * - If at 3.3, hkid is still assigned, but the vCPU must be in 853 + * VCPU_TD_STATE_UNINITIALIZED state. 854 + * - if at 5.2, hkid must be !assigned and all vCPUs must be in 855 + * VCPU_TD_STATE_INITIALIZED state and have been dissociated. 856 + */ 841 857 void tdx_vcpu_free(struct kvm_vcpu *vcpu) 842 858 { 843 859 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); 844 860 struct vcpu_tdx *tdx = to_tdx(vcpu); 845 861 int i; 846 862 863 + if (vcpu->cpu != -1) { 864 + KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm); 865 + tdx_flush_vp_on_cpu(vcpu); 866 + return; 867 + } 868 + 847 869 /* 848 870 * It is not possible to reclaim pages while hkid is assigned. It might 849 - * be assigned if: 850 - * 1. the TD VM is being destroyed but freeing hkid failed, in which 851 - * case the pages are leaked 852 - * 2. TD VCPU creation failed and this on the error path, in which case 853 - * there is nothing to do anyway 871 + * be assigned if the TD VM is being destroyed but freeing hkid failed, 872 + * in which case the pages are leaked. 
854 873 */ 855 874 if (is_hkid_assigned(kvm_tdx)) 856 875 return; ··· 898 851 } 899 852 if (tdx->vp.tdvpr_page) { 900 853 tdx_reclaim_control_page(tdx->vp.tdvpr_page); 901 - tdx->vp.tdvpr_page = 0; 854 + tdx->vp.tdvpr_page = NULL; 902 855 tdx->vp.tdvpr_pa = 0; 903 856 } 904 857 ··· 1621 1574 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); 1622 1575 } 1623 1576 1624 - static void tdx_unpin(struct kvm *kvm, struct page *page) 1577 + static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, 1578 + kvm_pfn_t pfn) 1625 1579 { 1626 - put_page(page); 1580 + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1581 + u64 err, entry, level_state; 1582 + gpa_t gpa = gfn_to_gpa(gfn); 1583 + 1584 + lockdep_assert_held(&kvm->slots_lock); 1585 + 1586 + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) || 1587 + KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) 1588 + return -EIO; 1589 + 1590 + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 1591 + kvm_tdx->page_add_src, &entry, &level_state); 1592 + if (unlikely(tdx_operand_busy(err))) 1593 + return -EBUSY; 1594 + 1595 + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm)) 1596 + return -EIO; 1597 + 1598 + return 0; 1627 1599 } 1628 1600 1629 1601 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, 1630 - enum pg_level level, struct page *page) 1602 + enum pg_level level, kvm_pfn_t pfn) 1631 1603 { 1632 1604 int tdx_level = pg_level_to_tdx_sept_level(level); 1633 1605 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1606 + struct page *page = pfn_to_page(pfn); 1634 1607 gpa_t gpa = gfn_to_gpa(gfn); 1635 1608 u64 entry, level_state; 1636 1609 u64 err; 1637 1610 1638 1611 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); 1639 - if (unlikely(tdx_operand_busy(err))) { 1640 - tdx_unpin(kvm, page); 1612 + if (unlikely(tdx_operand_busy(err))) 1641 1613 return -EBUSY; 1642 - } 1643 1614 1644 - if (KVM_BUG_ON(err, kvm)) { 1645 - pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, 
entry, level_state); 1646 - tdx_unpin(kvm, page); 1615 + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm)) 1647 1616 return -EIO; 1648 - } 1649 1617 1650 - return 0; 1651 - } 1652 - 1653 - /* 1654 - * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the 1655 - * callback tdx_gmem_post_populate() then maps pages into private memory. 1656 - * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the 1657 - * private EPT structures for the page to have been built before, which is 1658 - * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that 1659 - * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). 1660 - * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there 1661 - * are no half-initialized shared EPT pages. 1662 - */ 1663 - static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, 1664 - enum pg_level level, kvm_pfn_t pfn) 1665 - { 1666 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1667 - 1668 - if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) 1669 - return -EINVAL; 1670 - 1671 - /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ 1672 - atomic64_inc(&kvm_tdx->nr_premapped); 1673 1618 return 0; 1674 1619 } 1675 1620 1676 1621 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, 1677 - enum pg_level level, kvm_pfn_t pfn) 1622 + enum pg_level level, u64 mirror_spte) 1678 1623 { 1679 1624 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1680 - struct page *page = pfn_to_page(pfn); 1625 + kvm_pfn_t pfn = spte_to_pfn(mirror_spte); 1681 1626 1682 1627 /* TODO: handle large pages. 
*/ 1683 1628 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1684 - return -EINVAL; 1629 + return -EIO; 1630 + 1631 + WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || 1632 + (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); 1685 1633 1686 1634 /* 1687 - * Because guest_memfd doesn't support page migration with 1688 - * a_ops->migrate_folio (yet), no callback is triggered for KVM on page 1689 - * migration. Until guest_memfd supports page migration, prevent page 1690 - * migration. 1691 - * TODO: Once guest_memfd introduces callback on page migration, 1692 - * implement it and remove get_page/put_page(). 1693 - */ 1694 - get_page(page); 1695 - 1696 - /* 1697 - * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching 1698 - * barrier in tdx_td_finalize(). 1635 + * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() 1636 + * before kvm_tdx->state. Userspace must not be allowed to pre-fault 1637 + * arbitrary memory until the initial memory image is finalized. Pairs 1638 + * with the smp_wmb() in tdx_td_finalize(). 1699 1639 */ 1700 1640 smp_rmb(); 1701 - if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) 1702 - return tdx_mem_page_aug(kvm, gfn, level, page); 1703 - 1704 - return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); 1705 - } 1706 - 1707 - static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, 1708 - enum pg_level level, struct page *page) 1709 - { 1710 - int tdx_level = pg_level_to_tdx_sept_level(level); 1711 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1712 - gpa_t gpa = gfn_to_gpa(gfn); 1713 - u64 err, entry, level_state; 1714 - 1715 - /* TODO: handle large pages. */ 1716 - if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1717 - return -EINVAL; 1718 - 1719 - if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) 1720 - return -EINVAL; 1721 1641 1722 1642 /* 1723 - * When zapping private page, write lock is held. So no race condition 1724 - * with other vcpu sept operation. 
1725 - * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 1643 + * If the TD isn't finalized/runnable, then userspace is initializing 1644 + * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD. 1726 1645 */ 1727 - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1728 - &level_state); 1646 + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1647 + return tdx_mem_page_add(kvm, gfn, level, pfn); 1729 1648 1730 - if (unlikely(tdx_operand_busy(err))) { 1731 - /* 1732 - * The second retry is expected to succeed after kicking off all 1733 - * other vCPUs and prevent them from invoking TDH.VP.ENTER. 1734 - */ 1735 - tdx_no_vcpus_enter_start(kvm); 1736 - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, 1737 - &level_state); 1738 - tdx_no_vcpus_enter_stop(kvm); 1739 - } 1740 - 1741 - if (KVM_BUG_ON(err, kvm)) { 1742 - pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); 1743 - return -EIO; 1744 - } 1745 - 1746 - err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1747 - 1748 - if (KVM_BUG_ON(err, kvm)) { 1749 - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); 1750 - return -EIO; 1751 - } 1752 - tdx_quirk_reset_page(page); 1753 - tdx_unpin(kvm, page); 1754 - return 0; 1649 + return tdx_mem_page_aug(kvm, gfn, level, pfn); 1755 1650 } 1756 1651 1757 1652 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, ··· 1709 1720 if (unlikely(tdx_operand_busy(err))) 1710 1721 return -EBUSY; 1711 1722 1712 - if (KVM_BUG_ON(err, kvm)) { 1713 - pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); 1723 + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) 1714 1724 return -EIO; 1715 - } 1716 1725 1717 1726 return 0; 1718 - } 1719 - 1720 - /* 1721 - * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is 1722 - * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called 1723 - * successfully. 
1724 - * 1725 - * Since tdh_mem_sept_add() must have been invoked successfully before a 1726 - * non-leaf entry present in the mirrored page table, the SEPT ZAP related 1727 - * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead 1728 - * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the 1729 - * SEPT. 1730 - * 1731 - * Further check if the returned entry from SEPT walking is with RWX permissions 1732 - * to filter out anything unexpected. 1733 - * 1734 - * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from 1735 - * level_state returned from a SEAMCALL error is the same as that passed into 1736 - * the SEAMCALL. 1737 - */ 1738 - static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, 1739 - u64 entry, int level) 1740 - { 1741 - if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) 1742 - return false; 1743 - 1744 - if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) 1745 - return false; 1746 - 1747 - if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) 1748 - return false; 1749 - 1750 - return true; 1751 - } 1752 - 1753 - static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, 1754 - enum pg_level level, struct page *page) 1755 - { 1756 - int tdx_level = pg_level_to_tdx_sept_level(level); 1757 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1758 - gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); 1759 - u64 err, entry, level_state; 1760 - 1761 - /* For now large page isn't supported yet. 
*/ 1762 - WARN_ON_ONCE(level != PG_LEVEL_4K); 1763 - 1764 - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1765 - 1766 - if (unlikely(tdx_operand_busy(err))) { 1767 - /* After no vCPUs enter, the second retry is expected to succeed */ 1768 - tdx_no_vcpus_enter_start(kvm); 1769 - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); 1770 - tdx_no_vcpus_enter_stop(kvm); 1771 - } 1772 - if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && 1773 - !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { 1774 - atomic64_dec(&kvm_tdx->nr_premapped); 1775 - tdx_unpin(kvm, page); 1776 - return 0; 1777 - } 1778 - 1779 - if (KVM_BUG_ON(err, kvm)) { 1780 - pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); 1781 - return -EIO; 1782 - } 1783 - return 1; 1784 1727 } 1785 1728 1786 1729 /* ··· 1748 1827 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) 1749 1828 return; 1750 1829 1830 + /* 1831 + * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest 1832 + * mode must be serialized, as TDH.MEM.TRACK will fail if the previous 1833 + * tracking epoch hasn't completed. 1834 + */ 1751 1835 lockdep_assert_held_write(&kvm->mmu_lock); 1752 1836 1753 - err = tdh_mem_track(&kvm_tdx->td); 1754 - if (unlikely(tdx_operand_busy(err))) { 1755 - /* After no vCPUs enter, the second retry is expected to succeed */ 1756 - tdx_no_vcpus_enter_start(kvm); 1757 - err = tdh_mem_track(&kvm_tdx->td); 1758 - tdx_no_vcpus_enter_stop(kvm); 1759 - } 1760 - 1761 - if (KVM_BUG_ON(err, kvm)) 1762 - pr_tdx_error(TDH_MEM_TRACK, err); 1837 + err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); 1838 + TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); 1763 1839 1764 1840 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); 1765 1841 } ··· 1775 1857 * and slot move/deletion. 
1776 1858 */ 1777 1859 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) 1778 - return -EINVAL; 1860 + return -EIO; 1779 1861 1780 1862 /* 1781 1863 * The HKID assigned to this TD was already freed and cache was ··· 1784 1866 return tdx_reclaim_page(virt_to_page(private_spt)); 1785 1867 } 1786 1868 1787 - static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1788 - enum pg_level level, kvm_pfn_t pfn) 1869 + static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, 1870 + enum pg_level level, u64 mirror_spte) 1789 1871 { 1790 - struct page *page = pfn_to_page(pfn); 1791 - int ret; 1872 + struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); 1873 + int tdx_level = pg_level_to_tdx_sept_level(level); 1874 + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 1875 + gpa_t gpa = gfn_to_gpa(gfn); 1876 + u64 err, entry, level_state; 1877 + 1878 + lockdep_assert_held_write(&kvm->mmu_lock); 1792 1879 1793 1880 /* 1794 1881 * HKID is released after all private pages have been removed, and set ··· 1801 1878 * there can't be anything populated in the private EPT. 1802 1879 */ 1803 1880 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) 1804 - return -EINVAL; 1881 + return; 1805 1882 1806 - ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); 1807 - if (ret <= 0) 1808 - return ret; 1883 + /* TODO: handle large pages. */ 1884 + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) 1885 + return; 1886 + 1887 + err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, 1888 + tdx_level, &entry, &level_state); 1889 + if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) 1890 + return; 1809 1891 1810 1892 /* 1811 1893 * TDX requires TLB tracking before dropping private page. Do ··· 1818 1890 */ 1819 1891 tdx_track(kvm); 1820 1892 1821 - return tdx_sept_drop_private_spte(kvm, gfn, level, page); 1893 + /* 1894 + * When zapping private page, write lock is held. So no race condition 1895 + * with other vcpu sept operation. 
1896 + * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 1897 + */ 1898 + err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, 1899 + tdx_level, &entry, &level_state); 1900 + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) 1901 + return; 1902 + 1903 + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); 1904 + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) 1905 + return; 1906 + 1907 + tdx_quirk_reset_page(page); 1822 1908 } 1823 1909 1824 1910 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, ··· 2211 2269 if (cmd->flags) 2212 2270 return -EINVAL; 2213 2271 2214 - caps = kzalloc(sizeof(*caps) + 2215 - sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, 2216 - GFP_KERNEL); 2272 + user_caps = u64_to_user_ptr(cmd->data); 2273 + if (get_user(nr_user_entries, &user_caps->cpuid.nent)) 2274 + return -EFAULT; 2275 + 2276 + if (nr_user_entries < td_conf->num_cpuid_config) 2277 + return -E2BIG; 2278 + 2279 + caps = kzalloc(struct_size(caps, cpuid.entries, 2280 + td_conf->num_cpuid_config), GFP_KERNEL); 2217 2281 if (!caps) 2218 2282 return -ENOMEM; 2219 - 2220 - user_caps = u64_to_user_ptr(cmd->data); 2221 - if (get_user(nr_user_entries, &user_caps->cpuid.nent)) { 2222 - ret = -EFAULT; 2223 - goto out; 2224 - } 2225 - 2226 - if (nr_user_entries < td_conf->num_cpuid_config) { 2227 - ret = -E2BIG; 2228 - goto out; 2229 - } 2230 2283 2231 2284 ret = init_kvm_tdx_caps(td_conf, caps); 2232 2285 if (ret) 2233 2286 goto out; 2234 2287 2235 - if (copy_to_user(user_caps, caps, sizeof(*caps))) { 2288 + if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries, 2289 + caps->cpuid.nent))) { 2236 2290 ret = -EFAULT; 2237 2291 goto out; 2238 2292 } 2239 - 2240 - if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, 2241 - caps->cpuid.nent * 2242 - sizeof(caps->cpuid.entries[0]))) 2243 - ret = -EFAULT; 2244 2293 2245 2294 out: 2246 2295 /* kfree() accepts NULL. 
*/ ··· 2457 2524 goto free_packages; 2458 2525 } 2459 2526 2460 - if (WARN_ON_ONCE(err)) { 2461 - pr_tdx_error(TDH_MNG_CREATE, err); 2527 + if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) { 2462 2528 ret = -EIO; 2463 2529 goto free_packages; 2464 2530 } ··· 2498 2566 ret = -EAGAIN; 2499 2567 goto teardown; 2500 2568 } 2501 - if (WARN_ON_ONCE(err)) { 2502 - pr_tdx_error(TDH_MNG_ADDCX, err); 2569 + if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) { 2503 2570 ret = -EIO; 2504 2571 goto teardown; 2505 2572 } ··· 2515 2584 *seamcall_err = err; 2516 2585 ret = -EINVAL; 2517 2586 goto teardown; 2518 - } else if (WARN_ON_ONCE(err)) { 2519 - pr_tdx_error_1(TDH_MNG_INIT, err, rcx); 2587 + } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) { 2520 2588 ret = -EIO; 2521 2589 goto teardown; 2522 2590 } ··· 2559 2629 free_tdr: 2560 2630 if (tdr_page) 2561 2631 __free_page(tdr_page); 2562 - kvm_tdx->td.tdr_page = 0; 2632 + kvm_tdx->td.tdr_page = NULL; 2563 2633 2564 2634 free_hkid: 2565 2635 tdx_hkid_free(kvm_tdx); ··· 2663 2733 2664 2734 return -EIO; 2665 2735 } 2736 + 2737 + typedef void *tdx_vm_state_guard_t; 2738 + 2739 + static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm) 2740 + { 2741 + int r; 2742 + 2743 + mutex_lock(&kvm->lock); 2744 + 2745 + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) { 2746 + r = -EBUSY; 2747 + goto out_err; 2748 + } 2749 + 2750 + r = kvm_lock_all_vcpus(kvm); 2751 + if (r) 2752 + goto out_err; 2753 + 2754 + /* 2755 + * Note the unintuitive ordering! vcpu->mutex must be taken outside 2756 + * kvm->slots_lock! 
2757 + */ 2758 + mutex_lock(&kvm->slots_lock); 2759 + return kvm; 2760 + 2761 + out_err: 2762 + mutex_unlock(&kvm->lock); 2763 + return ERR_PTR(r); 2764 + } 2765 + 2766 + static void tdx_release_vm_state_locks(struct kvm *kvm) 2767 + { 2768 + mutex_unlock(&kvm->slots_lock); 2769 + kvm_unlock_all_vcpus(kvm); 2770 + mutex_unlock(&kvm->lock); 2771 + } 2772 + 2773 + DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t, 2774 + if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T), 2775 + tdx_acquire_vm_state_locks(kvm), struct kvm *kvm); 2666 2776 2667 2777 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) 2668 2778 { ··· 2825 2855 { 2826 2856 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 2827 2857 2828 - guard(mutex)(&kvm->slots_lock); 2829 - 2830 2858 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 2831 - return -EINVAL; 2832 - /* 2833 - * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue 2834 - * TDH.MEM.PAGE.ADD(). 2835 - */ 2836 - if (atomic64_read(&kvm_tdx->nr_premapped)) 2837 2859 return -EINVAL; 2838 2860 2839 2861 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); 2840 2862 if (tdx_operand_busy(cmd->hw_error)) 2841 2863 return -EBUSY; 2842 - if (KVM_BUG_ON(cmd->hw_error, kvm)) { 2843 - pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); 2864 + if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm)) 2844 2865 return -EIO; 2845 - } 2846 2866 2847 2867 kvm_tdx->state = TD_STATE_RUNNABLE; 2848 2868 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ ··· 2841 2881 return 0; 2842 2882 } 2843 2883 2884 + static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd) 2885 + { 2886 + if (copy_from_user(cmd, argp, sizeof(*cmd))) 2887 + return -EFAULT; 2888 + 2889 + /* 2890 + * Userspace should never set hw_error. KVM writes hw_error to report 2891 + * hardware-defined error back to userspace. 
2892 + */ 2893 + if (cmd->hw_error) 2894 + return -EINVAL; 2895 + 2896 + return 0; 2897 + } 2898 + 2844 2899 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) 2845 2900 { 2846 2901 struct kvm_tdx_cmd tdx_cmd; 2847 2902 int r; 2848 2903 2849 - if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) 2850 - return -EFAULT; 2904 + r = tdx_get_cmd(argp, &tdx_cmd); 2905 + if (r) 2906 + return r; 2851 2907 2852 - /* 2853 - * Userspace should never set hw_error. It is used to fill 2854 - * hardware-defined error by the kernel. 2855 - */ 2856 - if (tdx_cmd.hw_error) 2857 - return -EINVAL; 2908 + if (tdx_cmd.id == KVM_TDX_CAPABILITIES) 2909 + return tdx_get_capabilities(&tdx_cmd); 2858 2910 2859 - mutex_lock(&kvm->lock); 2911 + CLASS(tdx_vm_state_guard, guard)(kvm); 2912 + if (IS_ERR(guard)) 2913 + return PTR_ERR(guard); 2860 2914 2861 2915 switch (tdx_cmd.id) { 2862 - case KVM_TDX_CAPABILITIES: 2863 - r = tdx_get_capabilities(&tdx_cmd); 2864 - break; 2865 2916 case KVM_TDX_INIT_VM: 2866 2917 r = tdx_td_init(kvm, &tdx_cmd); 2867 2918 break; ··· 2880 2909 r = tdx_td_finalize(kvm, &tdx_cmd); 2881 2910 break; 2882 2911 default: 2883 - r = -EINVAL; 2884 - goto out; 2912 + return -EINVAL; 2885 2913 } 2886 2914 2887 2915 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) 2888 - r = -EFAULT; 2916 + return -EFAULT; 2889 2917 2890 - out: 2891 - mutex_unlock(&kvm->lock); 2892 2918 return r; 2893 2919 } 2894 2920 ··· 2927 2959 } 2928 2960 2929 2961 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); 2930 - if (KVM_BUG_ON(err, vcpu->kvm)) { 2962 + if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) { 2931 2963 ret = -EIO; 2932 - pr_tdx_error(TDH_VP_CREATE, err); 2933 2964 goto free_tdcx; 2934 2965 } 2935 2966 2936 2967 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { 2937 2968 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); 2938 - if (KVM_BUG_ON(err, vcpu->kvm)) { 2939 - pr_tdx_error(TDH_VP_ADDCX, err); 2969 + if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) { 2940 2970 /* 
2941 2971 * Pages already added are reclaimed by the vcpu_free 2942 2972 * method, but the rest are freed here. ··· 2947 2981 } 2948 2982 } 2949 2983 2950 - err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); 2951 - if (KVM_BUG_ON(err, vcpu->kvm)) { 2952 - pr_tdx_error(TDH_VP_INIT, err); 2953 - return -EIO; 2984 + /* 2985 + * tdh_vp_init() can take an exclusive lock of the TDR resource inside 2986 + * the TDX-Module. The TDR resource is also taken as shared in several 2987 + * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention 2988 + * (TDX-Module locks are try-lock implementations with no slow path). 2989 + * Take mmu_lock for write to reflect the nature of the lock taken by 2990 + * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if 2991 + * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs. 2992 + */ 2993 + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { 2994 + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); 2995 + if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm)) 2996 + return -EIO; 2954 2997 } 2955 2998 2956 2999 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; ··· 2978 3003 free_tdvpr: 2979 3004 if (tdx->vp.tdvpr_page) 2980 3005 __free_page(tdx->vp.tdvpr_page); 2981 - tdx->vp.tdvpr_page = 0; 3006 + tdx->vp.tdvpr_page = NULL; 2982 3007 tdx->vp.tdvpr_pa = 0; 2983 3008 2984 3009 return ret; ··· 3016 3041 3017 3042 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) 3018 3043 { 3019 - struct kvm_cpuid2 __user *output, *td_cpuid; 3044 + struct kvm_cpuid2 __user *output; 3045 + struct kvm_cpuid2 *td_cpuid; 3020 3046 int r = 0, i = 0, leaf; 3021 3047 u32 level; 3022 3048 ··· 3130 3154 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 3131 3155 void __user *src, int order, void *_arg) 3132 3156 { 3133 - u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; 3134 - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3135 3157 struct tdx_gmem_post_populate_arg *arg = 
_arg; 3136 - struct kvm_vcpu *vcpu = arg->vcpu; 3158 + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3159 + u64 err, entry, level_state; 3137 3160 gpa_t gpa = gfn_to_gpa(gfn); 3138 - u8 level = PG_LEVEL_4K; 3139 3161 struct page *src_page; 3140 3162 int ret, i; 3141 - u64 err, entry, level_state; 3163 + 3164 + if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) 3165 + return -EIO; 3142 3166 3143 3167 /* 3144 3168 * Get the source page if it has been faulted in. Return failure if the ··· 3150 3174 if (ret != 1) 3151 3175 return -ENOMEM; 3152 3176 3153 - ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); 3154 - if (ret < 0) 3155 - goto out; 3177 + kvm_tdx->page_add_src = src_page; 3178 + ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); 3179 + kvm_tdx->page_add_src = NULL; 3180 + 3181 + put_page(src_page); 3182 + 3183 + if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) 3184 + return ret; 3156 3185 3157 3186 /* 3158 - * The private mem cannot be zapped after kvm_tdp_map_page() 3159 - * because all paths are covered by slots_lock and the 3160 - * filemap invalidate lock. Check that they are indeed enough. 3187 + * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed 3188 + * between mapping the pfn and now, but slots_lock prevents memslot 3189 + * updates, filemap_invalidate_lock() prevents guest_memfd updates, 3190 + * mmu_notifier events can't reach S-EPT entries, and KVM's internal 3191 + * zapping flows are mutually exclusive with S-EPT mappings. 
3161 3192 */ 3162 - if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { 3163 - scoped_guard(read_lock, &kvm->mmu_lock) { 3164 - if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { 3165 - ret = -EIO; 3166 - goto out; 3167 - } 3168 - } 3193 + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3194 + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state); 3195 + if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm)) 3196 + return -EIO; 3169 3197 } 3170 3198 3171 - ret = 0; 3172 - err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), 3173 - src_page, &entry, &level_state); 3174 - if (err) { 3175 - ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; 3176 - goto out; 3177 - } 3178 - 3179 - if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) 3180 - atomic64_dec(&kvm_tdx->nr_premapped); 3181 - 3182 - if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { 3183 - for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { 3184 - err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, 3185 - &level_state); 3186 - if (err) { 3187 - ret = -EIO; 3188 - break; 3189 - } 3190 - } 3191 - } 3192 - 3193 - out: 3194 - put_page(src_page); 3195 - return ret; 3199 + return 0; 3196 3200 } 3197 3201 3198 3202 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) ··· 3187 3231 3188 3232 if (tdx->state != VCPU_TD_STATE_INITIALIZED) 3189 3233 return -EINVAL; 3190 - 3191 - guard(mutex)(&kvm->slots_lock); 3192 3234 3193 3235 /* Once TD is finalized, the initial guest memory is fixed. 
*/ 3194 3236 if (kvm_tdx->state == TD_STATE_RUNNABLE) ··· 3205 3251 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) 3206 3252 return -EINVAL; 3207 3253 3208 - kvm_mmu_reload(vcpu); 3209 3254 ret = 0; 3210 3255 while (region.nr_pages) { 3211 3256 if (signal_pending(current)) { ··· 3241 3288 return ret; 3242 3289 } 3243 3290 3291 + int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3292 + { 3293 + struct kvm *kvm = vcpu->kvm; 3294 + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 3295 + struct kvm_tdx_cmd cmd; 3296 + int r; 3297 + 3298 + r = tdx_get_cmd(argp, &cmd); 3299 + if (r) 3300 + return r; 3301 + 3302 + CLASS(tdx_vm_state_guard, guard)(kvm); 3303 + if (IS_ERR(guard)) 3304 + return PTR_ERR(guard); 3305 + 3306 + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3307 + return -EINVAL; 3308 + 3309 + vcpu_load(vcpu); 3310 + 3311 + switch (cmd.id) { 3312 + case KVM_TDX_INIT_MEM_REGION: 3313 + r = tdx_vcpu_init_mem_region(vcpu, &cmd); 3314 + break; 3315 + case KVM_TDX_INIT_VCPU: 3316 + r = tdx_vcpu_init(vcpu, &cmd); 3317 + break; 3318 + default: 3319 + r = -ENOIOCTLCMD; 3320 + break; 3321 + } 3322 + 3323 + vcpu_put(vcpu); 3324 + 3325 + return r; 3326 + } 3327 + 3244 3328 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) 3245 3329 { 3246 3330 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); ··· 3287 3297 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 3288 3298 return -EINVAL; 3289 3299 3290 - if (copy_from_user(&cmd, argp, sizeof(cmd))) 3291 - return -EFAULT; 3292 - 3293 - if (cmd.hw_error) 3294 - return -EINVAL; 3300 + ret = tdx_get_cmd(argp, &cmd); 3301 + if (ret) 3302 + return ret; 3295 3303 3296 3304 switch (cmd.id) { 3297 - case KVM_TDX_INIT_VCPU: 3298 - ret = tdx_vcpu_init(vcpu, &cmd); 3299 - break; 3300 - case KVM_TDX_INIT_MEM_REGION: 3301 - ret = tdx_vcpu_init_mem_region(vcpu, &cmd); 3302 - break; 3303 3305 case KVM_TDX_GET_CPUID: 3304 3306 ret = 
tdx_vcpu_get_cpuid(vcpu, &cmd); 3305 3307 break;
+6 -2
arch/x86/kvm/vmx/tdx.h
··· 36 36 37 37 struct tdx_td td; 38 38 39 - /* For KVM_TDX_INIT_MEM_REGION. */ 40 - atomic64_t nr_premapped; 39 + /* 40 + * Scratch pointer used to pass the source page to tdx_mem_page_add(). 41 + * Protected by slots_lock, and non-NULL only when mapping a private 42 + * pfn via tdx_gmem_post_populate(). 43 + */ 44 + struct page *page_add_src; 41 45 42 46 /* 43 47 * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
+1
arch/x86/kvm/vmx/x86_ops.h
··· 149 149 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 150 150 151 151 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); 152 + int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp); 152 153 153 154 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); 154 155 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
+13
arch/x86/kvm/x86.c
··· 7200 7200 return 0; 7201 7201 } 7202 7202 7203 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, 7204 + unsigned long arg) 7205 + { 7206 + struct kvm_vcpu *vcpu = filp->private_data; 7207 + void __user *argp = (void __user *)arg; 7208 + 7209 + if (ioctl == KVM_MEMORY_ENCRYPT_OP && 7210 + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) 7211 + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); 7212 + 7213 + return -ENOIOCTLCMD; 7214 + } 7215 + 7203 7216 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 7204 7217 { 7205 7218 struct kvm *kvm = filp->private_data;
+2 -12
include/linux/kvm_host.h
··· 1557 1557 unsigned int ioctl, unsigned long arg); 1558 1558 long kvm_arch_vcpu_ioctl(struct file *filp, 1559 1559 unsigned int ioctl, unsigned long arg); 1560 + long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, 1561 + unsigned int ioctl, unsigned long arg); 1560 1562 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); 1561 1563 1562 1564 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); ··· 2438 2436 return false; 2439 2437 } 2440 2438 #endif /* CONFIG_HAVE_KVM_NO_POLL */ 2441 - 2442 - #ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL 2443 - long kvm_arch_vcpu_async_ioctl(struct file *filp, 2444 - unsigned int ioctl, unsigned long arg); 2445 - #else 2446 - static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, 2447 - unsigned int ioctl, 2448 - unsigned long arg) 2449 - { 2450 - return -ENOIOCTLCMD; 2451 - } 2452 - #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ 2453 2439 2454 2440 void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); 2455 2441
-3
virt/kvm/Kconfig
··· 78 78 tristate 79 79 select IRQ_BYPASS_MANAGER 80 80 81 - config HAVE_KVM_VCPU_ASYNC_IOCTL 82 - bool 83 - 84 81 config HAVE_KVM_VCPU_RUN_PID_CHANGE 85 82 bool 86 83
+3 -3
virt/kvm/kvm_main.c
··· 4434 4434 return r; 4435 4435 4436 4436 /* 4437 - * Some architectures have vcpu ioctls that are asynchronous to vcpu 4438 - * execution; mutex_lock() would break them. 4437 + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, 4438 + * e.g. to support ioctls that can run asynchronous to vCPU execution. 4439 4439 */ 4440 - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 4440 + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); 4441 4441 if (r != -ENOIOCTLCMD) 4442 4442 return r; 4443 4443