Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+4 -2

Documentation/virt/kvm/nested-vmx.rst

··· 37 37 Running nested VMX 38 38 ------------------ 39 39 40 - The nested VMX feature is disabled by default. It can be enabled by giving 41 - the "nested=1" option to the kvm-intel module. 40 + The nested VMX feature is enabled by default since Linux kernel v4.20. For 41 + older Linux kernel, it can be enabled by giving the "nested=1" option to the 42 + kvm-intel module. 43 + 42 44 43 45 No modifications are required to user space (qemu). However, qemu's default 44 46 emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be

+1 -1

Documentation/virt/kvm/running-nested-guests.rst

··· 74 74 Enabling "nested" (x86) 75 75 ----------------------- 76 76 77 - From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled 77 + From Linux kernel v4.20 onwards, the ``nested`` KVM parameter is enabled 78 78 by default for Intel and AMD. (Though your Linux distribution might 79 79 override this default.) 80 80

+11 -9

arch/arm64/kvm/hyp/nvhe/hyp-init.S

··· 47 47 b . 48 48 49 49 /* 50 + * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. 51 + * 50 52 * x0: SMCCC function ID 51 53 * x1: struct kvm_nvhe_init_params PA 52 54 */ ··· 72 70 eret 73 71 74 72 1: mov x0, x1 75 - mov x4, lr 76 - bl ___kvm_hyp_init 77 - mov lr, x4 73 + mov x3, lr 74 + bl ___kvm_hyp_init // Clobbers x0..x2 75 + mov lr, x3 78 76 79 77 /* Hello, World! */ 80 78 mov x0, #SMCCC_RET_SUCCESS ··· 84 82 /* 85 83 * Initialize the hypervisor in EL2. 86 84 * 87 - * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers 88 - * and leave x4 for the caller. 85 + * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers 86 + * and leave x3 for the caller. 89 87 * 90 88 * x0: struct kvm_nvhe_init_params PA 91 89 */ ··· 114 112 /* 115 113 * Set the PS bits in TCR_EL2. 116 114 */ 117 - ldr x1, [x0, #NVHE_INIT_TCR_EL2] 118 - tcr_compute_pa_size x1, #TCR_EL2_PS_SHIFT, x2, x3 119 - msr tcr_el2, x1 115 + ldr x0, [x0, #NVHE_INIT_TCR_EL2] 116 + tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 117 + msr tcr_el2, x0 120 118 121 119 isb 122 120 ··· 195 193 196 194 /* Enable MMU, set vectors and stack. */ 197 195 mov x0, x28 198 - bl ___kvm_hyp_init // Clobbers x0..x3 196 + bl ___kvm_hyp_init // Clobbers x0..x2 199 197 200 198 /* Leave idmap. */ 201 199 mov x0, x29

+1 -1

arch/x86/kvm/cpuid.c

··· 321 321 if (cpuid->nent < vcpu->arch.cpuid_nent) 322 322 goto out; 323 323 r = -EFAULT; 324 - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, 324 + if (copy_to_user(entries, vcpu->arch.cpuid_entries, 325 325 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) 326 326 goto out; 327 327 return 0;

+2

arch/x86/kvm/emulate.c

··· 2879 2879 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 2880 2880 *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data : 2881 2881 (u32)msr_data; 2882 + if (efer & EFER_LMA) 2883 + ctxt->mode = X86EMUL_MODE_PROT64; 2882 2884 2883 2885 return X86EMUL_CONTINUE; 2884 2886 }

+3 -3

arch/x86/kvm/mmu/tdp_mmu.c

··· 1049 1049 } 1050 1050 1051 1051 /* 1052 - * Clear non-leaf entries (and free associated page tables) which could 1053 - * be replaced by large mappings, for GFNs within the slot. 1052 + * Clear leaf entries which could be replaced by large mappings, for 1053 + * GFNs within the slot. 1054 1054 */ 1055 1055 static void zap_collapsible_spte_range(struct kvm *kvm, 1056 1056 struct kvm_mmu_page *root, ··· 1062 1062 1063 1063 tdp_root_for_each_pte(iter, root, start, end) { 1064 1064 if (!is_shadow_present_pte(iter.old_spte) || 1065 - is_last_spte(iter.old_spte, iter.level)) 1065 + !is_last_spte(iter.old_spte, iter.level)) 1066 1066 continue; 1067 1067 1068 1068 pfn = spte_to_pfn(iter.old_spte);

+3 -10

arch/x86/kvm/svm/nested.c

··· 231 231 232 232 static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) 233 233 { 234 + struct kvm_vcpu *vcpu = &svm->vcpu; 234 235 bool vmcb12_lma; 235 236 236 237 if ((vmcb12->save.efer & EFER_SVME) == 0) ··· 245 244 246 245 vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); 247 246 248 - if (!vmcb12_lma) { 249 - if (vmcb12->save.cr4 & X86_CR4_PAE) { 250 - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) 251 - return false; 252 - } else { 253 - if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) 254 - return false; 255 - } 256 - } else { 247 + if (vmcb12_lma) { 257 248 if (!(vmcb12->save.cr4 & X86_CR4_PAE) || 258 249 !(vmcb12->save.cr0 & X86_CR0_PE) || 259 - (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) 250 + (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits)) 260 251 return false; 261 252 } 262 253 if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))

+10 -7

arch/x86/kvm/svm/sev.c

··· 342 342 unsigned long first, last; 343 343 int ret; 344 344 345 + lockdep_assert_held(&kvm->lock); 346 + 345 347 if (ulen == 0 || uaddr + ulen < uaddr) 346 348 return ERR_PTR(-EINVAL); 347 349 ··· 1121 1119 if (!region) 1122 1120 return -ENOMEM; 1123 1121 1122 + mutex_lock(&kvm->lock); 1124 1123 region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1); 1125 1124 if (IS_ERR(region->pages)) { 1126 1125 ret = PTR_ERR(region->pages); 1126 + mutex_unlock(&kvm->lock); 1127 1127 goto e_free; 1128 1128 } 1129 + 1130 + region->uaddr = range->addr; 1131 + region->size = range->size; 1132 + 1133 + list_add_tail(&region->list, &sev->regions_list); 1134 + mutex_unlock(&kvm->lock); 1129 1135 1130 1136 /* 1131 1137 * The guest may change the memory encryption attribute from C=0 -> C=1 ··· 1142 1132 * correct C-bit. 1143 1133 */ 1144 1134 sev_clflush_pages(region->pages, region->npages); 1145 - 1146 - region->uaddr = range->addr; 1147 - region->size = range->size; 1148 - 1149 - mutex_lock(&kvm->lock); 1150 - list_add_tail(&region->list, &sev->regions_list); 1151 - mutex_unlock(&kvm->lock); 1152 1135 1153 1136 return ret; 1154 1137

+5

arch/x86/kvm/svm/svm.c

··· 454 454 return 0; 455 455 } 456 456 457 + if (sev_active()) { 458 + pr_info("KVM is unsupported when running as an SEV guest\n"); 459 + return 0; 460 + } 461 + 457 462 return 1; 458 463 } 459 464

-3

arch/x86/kvm/svm/svm.h

··· 403 403 } 404 404 405 405 /* svm.c */ 406 - #define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U 407 - #define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U 408 - #define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U 409 406 #define MSR_INVALID 0xffffffffU 410 407 411 408 extern int sev;

+13 -4

arch/x86/kvm/vmx/vmx.c

··· 6860 6860 switch (index) { 6861 6861 case MSR_IA32_TSX_CTRL: 6862 6862 /* 6863 - * No need to pass TSX_CTRL_CPUID_CLEAR through, so 6864 - * let's avoid changing CPUID bits under the host 6865 - * kernel's feet. 6863 + * TSX_CTRL_CPUID_CLEAR is handled in the CPUID 6864 + * interception. Keep the host value unchanged to avoid 6865 + * changing CPUID bits under the host kernel's feet. 6866 + * 6867 + * hle=0, rtm=0, tsx_ctrl=1 can be found with some 6868 + * combinations of new kernel and old userspace. If 6869 + * those guests run on a tsx=off host, do allow guests 6870 + * to use TSX_CTRL, but do not change the value on the 6871 + * host so that TSX remains always disabled. 6866 6872 */ 6867 - vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 6873 + if (boot_cpu_has(X86_FEATURE_RTM)) 6874 + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 6875 + else 6876 + vmx->guest_uret_msrs[j].mask = 0; 6868 6877 break; 6869 6878 default: 6870 6879 vmx->guest_uret_msrs[j].mask = -1ull;

+21 -10

arch/x86/kvm/x86.c

··· 1394 1394 if (!boot_cpu_has_bug(X86_BUG_MDS)) 1395 1395 data |= ARCH_CAP_MDS_NO; 1396 1396 1397 - /* 1398 - * On TAA affected systems: 1399 - * - nothing to do if TSX is disabled on the host. 1400 - * - we emulate TSX_CTRL if present on the host. 1401 - * This lets the guest use VERW to clear CPU buffers. 1402 - */ 1403 - if (!boot_cpu_has(X86_FEATURE_RTM)) 1404 - data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); 1405 - else if (!boot_cpu_has_bug(X86_BUG_TAA)) 1397 + if (!boot_cpu_has(X86_FEATURE_RTM)) { 1398 + /* 1399 + * If RTM=0 because the kernel has disabled TSX, the host might 1400 + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 1401 + * and therefore knows that there cannot be TAA) but keep 1402 + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, 1403 + * and we want to allow migrating those guests to tsx=off hosts. 1404 + */ 1405 + data &= ~ARCH_CAP_TAA_NO; 1406 + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { 1406 1407 data |= ARCH_CAP_TAA_NO; 1408 + } else { 1409 + /* 1410 + * Nothing to do here; we emulate TSX_CTRL if present on the 1411 + * host so the guest can choose between disabling TSX or 1412 + * using VERW to clear CPU buffers. 1413 + */ 1414 + } 1407 1415 1408 1416 return data; 1409 1417 } ··· 9624 9616 */ 9625 9617 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) 9626 9618 return false; 9619 + if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) 9620 + return false; 9627 9621 } else { 9628 9622 /* 9629 9623 * Not in 64-bit mode: EFER.LMA is clear and the code ··· 10003 9993 fx_init(vcpu); 10004 9994 10005 9995 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 9996 + vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); 10006 9997 10007 9998 vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; 10008 9999 ··· 10505 10494 return 0; 10506 10495 10507 10496 old_npages = slot->npages; 10508 - hva = 0; 10497 + hva = slot->userspace_addr; 10509 10498 } 10510 10499 10511 10500 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {

+2

arch/x86/kvm/x86.h

··· 425 425 __reserved_bits |= X86_CR4_UMIP; \ 426 426 if (!__cpu_has(__c, X86_FEATURE_VMX)) \ 427 427 __reserved_bits |= X86_CR4_VMXE; \ 428 + if (!__cpu_has(__c, X86_FEATURE_PCID)) \ 429 + __reserved_bits |= X86_CR4_PCIDE; \ 428 430 __reserved_bits; \ 429 431 }) 430 432

+1

arch/x86/mm/mem_encrypt.c

··· 382 382 { 383 383 return sev_status & MSR_AMD64_SEV_ENABLED; 384 384 } 385 + EXPORT_SYMBOL_GPL(sev_active); 385 386 386 387 /* Needs to be called from non-instrumentable code */ 387 388 bool noinstr sev_es_active(void)

Configure Feed

Configure Feed