Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"ARM64:

- Fix the guest view of the ID registers, making the relevant fields
writable from userspace (affecting ID_AA64DFR0_EL1 and
ID_AA64PFR1_EL1)

- Correctly expose S1PIE to guests, fixing a regression introduced in
6.12-rc1 with the S1POE support

- Fix the recycling of stage-2 shadow MMUs by tracking the context
(are we allowed to block or not) as well as the recycling state

- Address a couple of issues with the vgic when userspace
misconfigures the emulation, resulting in various splats. Headaches
courtesy of our Syzkaller friends

- Stop wasting space in the HYP idmap, as we are dangerously close to
the 4kB limit, and this has already exploded in -next

- Fix another race in vgic_init()

- Fix a UBSAN error when faking the cache topology with MTE enabled

RISCV:

- RISCV: KVM: use raw_spinlock for critical section in imsic

x86:

- A bandaid for lack of XCR0 setup in selftests, which causes trouble
if the compiler is configured to have x86-64-v3 (with AVX) as the
default ISA. Proper XCR0 setup will come in the next merge window.

- Fix an issue where KVM would not ignore low bits of the nested CR3
and potentially leak up to 31 bytes out of the guest memory's
bounds

- Fix case in which an out-of-date cached value for the segments
could be returned by KVM_GET_SREGS.

- More cleanups for KVM_X86_QUIRK_SLOT_ZAP_ALL

- Override MTRR state for KVM confidential guests, making it WB by
default as is already the case for Hyper-V guests.

Generic:

- Remove a couple of unused functions"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (27 commits)
RISCV: KVM: use raw_spinlock for critical section in imsic
KVM: selftests: Fix out-of-bounds reads in CPUID test's array lookups
KVM: selftests: x86: Avoid using SSE/AVX instructions
KVM: nSVM: Ignore nCR3[4:0] when loading PDPTEs from memory
KVM: VMX: reset the segment cache after segment init in vmx_vcpu_reset()
KVM: x86: Clean up documentation for KVM_X86_QUIRK_SLOT_ZAP_ALL
KVM: x86/mmu: Add lockdep assert to enforce safe usage of kvm_unmap_gfn_range()
KVM: x86/mmu: Zap only SPs that shadow gPTEs when deleting memslot
x86/kvm: Override default caching mode for SEV-SNP and TDX
KVM: Remove unused kvm_vcpu_gfn_to_pfn_atomic
KVM: Remove unused kvm_vcpu_gfn_to_pfn
KVM: arm64: Ensure vgic_ready() is ordered against MMIO registration
KVM: arm64: vgic: Don't check for vgic_ready() when setting NR_IRQS
KVM: arm64: Fix shift-out-of-bounds bug
KVM: arm64: Shave a few bytes from the EL2 idmap code
KVM: arm64: Don't eagerly teardown the vgic on init error
KVM: arm64: Expose S1PIE to guests
KVM: arm64: nv: Clarify safety of allowing TLBI unmaps to reschedule
KVM: arm64: nv: Punt stage-2 recycling to a vCPU request
KVM: arm64: nv: Do not block when unmapping stage-2 if disallowed
...

+277 -103
+9 -7
Documentation/virt/kvm/api.rst
··· 8098 8098 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is 8099 8099 disabled. 8100 8100 8101 - KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in 8102 - fast way for memslot deletion when VM type 8103 - is KVM_X86_DEFAULT_VM. 8104 - When this quirk is disabled or when VM type 8105 - is other than KVM_X86_DEFAULT_VM, KVM zaps 8106 - only leaf SPTEs that are within the range of 8107 - the memslot being deleted. 8101 + KVM_X86_QUIRK_SLOT_ZAP_ALL By default, for KVM_X86_DEFAULT_VM VMs, KVM 8102 + invalidates all SPTEs in all memslots and 8103 + address spaces when a memslot is deleted or 8104 + moved. When this quirk is disabled (or the 8105 + VM type isn't KVM_X86_DEFAULT_VM), KVM only 8106 + ensures the backing memory of the deleted 8107 + or moved memslot isn't reachable, i.e KVM 8108 + _may_ invalidate only SPTEs related to the 8109 + memslot. 8108 8110 =================================== ============================================ 8109 8111 8110 8112 7.32 KVM_CAP_MAX_VCPU_ID
+1 -1
Documentation/virt/kvm/locking.rst
··· 136 136 to gfn. For indirect sp, we disabled fast page fault for simplicity. 137 137 138 138 A solution for indirect sp could be to pin the gfn, for example via 139 - kvm_vcpu_gfn_to_pfn_atomic, before the cmpxchg. After the pinning: 139 + gfn_to_pfn_memslot_atomic, before the cmpxchg. After the pinning: 140 140 141 141 - We have held the refcount of pfn; that means the pfn can not be freed and 142 142 be reused for another gfn.
+1
arch/arm64/include/asm/kvm_asm.h
··· 178 178 unsigned long hcr_el2; 179 179 unsigned long vttbr; 180 180 unsigned long vtcr; 181 + unsigned long tmp; 181 182 }; 182 183 183 184 /*
+7
arch/arm64/include/asm/kvm_host.h
··· 51 51 #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) 52 52 #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) 53 53 #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) 54 + #define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) 54 55 55 56 #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ 56 57 KVM_DIRTY_LOG_INITIALLY_SET) ··· 211 210 * HCR_EL2.VM == 1 212 211 */ 213 212 bool nested_stage2_enabled; 213 + 214 + /* 215 + * true when this MMU needs to be unmapped before being used for a new 216 + * purpose. 217 + */ 218 + bool pending_unmap; 214 219 215 220 /* 216 221 * 0: Nobody is currently using this, check vttbr for validity
+2 -1
arch/arm64/include/asm/kvm_mmu.h
··· 166 166 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); 167 167 void __init free_hyp_pgds(void); 168 168 169 - void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size); 169 + void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 170 + u64 size, bool may_block); 170 171 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); 171 172 void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); 172 173
+3 -1
arch/arm64/include/asm/kvm_nested.h
··· 78 78 extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); 79 79 extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); 80 80 81 + extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); 82 + 81 83 struct kvm_s2_trans { 82 84 phys_addr_t output; 83 85 unsigned long block_size; ··· 126 124 struct kvm_s2_trans *trans); 127 125 extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); 128 126 extern void kvm_nested_s2_wp(struct kvm *kvm); 129 - extern void kvm_nested_s2_unmap(struct kvm *kvm); 127 + extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); 130 128 extern void kvm_nested_s2_flush(struct kvm *kvm); 131 129 132 130 unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);
+1
arch/arm64/kernel/asm-offsets.c
··· 146 146 DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); 147 147 DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); 148 148 DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); 149 + DEFINE(NVHE_INIT_TMP, offsetof(struct kvm_nvhe_init_params, tmp)); 149 150 #endif 150 151 #ifdef CONFIG_CPU_PM 151 152 DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
+5
arch/arm64/kvm/arm.c
··· 997 997 static int check_vcpu_requests(struct kvm_vcpu *vcpu) 998 998 { 999 999 if (kvm_request_pending(vcpu)) { 1000 + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) 1001 + return -EIO; 1002 + 1000 1003 if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) 1001 1004 kvm_vcpu_sleep(vcpu); 1002 1005 ··· 1034 1031 1035 1032 if (kvm_dirty_ring_check_request(vcpu)) 1036 1033 return 0; 1034 + 1035 + check_nested_vcpu_requests(vcpu); 1037 1036 } 1038 1037 1039 1038 return 1;
+29 -23
arch/arm64/kvm/hyp/nvhe/hyp-init.S
··· 24 24 .align 11 25 25 26 26 SYM_CODE_START(__kvm_hyp_init) 27 - ventry __invalid // Synchronous EL2t 28 - ventry __invalid // IRQ EL2t 29 - ventry __invalid // FIQ EL2t 30 - ventry __invalid // Error EL2t 27 + ventry . // Synchronous EL2t 28 + ventry . // IRQ EL2t 29 + ventry . // FIQ EL2t 30 + ventry . // Error EL2t 31 31 32 - ventry __invalid // Synchronous EL2h 33 - ventry __invalid // IRQ EL2h 34 - ventry __invalid // FIQ EL2h 35 - ventry __invalid // Error EL2h 32 + ventry . // Synchronous EL2h 33 + ventry . // IRQ EL2h 34 + ventry . // FIQ EL2h 35 + ventry . // Error EL2h 36 36 37 37 ventry __do_hyp_init // Synchronous 64-bit EL1 38 - ventry __invalid // IRQ 64-bit EL1 39 - ventry __invalid // FIQ 64-bit EL1 40 - ventry __invalid // Error 64-bit EL1 38 + ventry . // IRQ 64-bit EL1 39 + ventry . // FIQ 64-bit EL1 40 + ventry . // Error 64-bit EL1 41 41 42 - ventry __invalid // Synchronous 32-bit EL1 43 - ventry __invalid // IRQ 32-bit EL1 44 - ventry __invalid // FIQ 32-bit EL1 45 - ventry __invalid // Error 32-bit EL1 46 - 47 - __invalid: 48 - b . 42 + ventry . // Synchronous 32-bit EL1 43 + ventry . // IRQ 32-bit EL1 44 + ventry . // FIQ 32-bit EL1 45 + ventry . // Error 32-bit EL1 49 46 50 47 /* 51 48 * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. ··· 73 76 eret 74 77 SYM_CODE_END(__kvm_hyp_init) 75 78 79 + SYM_CODE_START_LOCAL(__kvm_init_el2_state) 80 + /* Initialize EL2 CPU state to sane values. */ 81 + init_el2_state // Clobbers x0..x2 82 + finalise_el2_state 83 + ret 84 + SYM_CODE_END(__kvm_init_el2_state) 85 + 76 86 /* 77 87 * Initialize the hypervisor in EL2. 78 88 * ··· 106 102 // TPIDR_EL2 is used to preserve x0 across the macro maze... 
107 103 isb 108 104 msr tpidr_el2, x0 109 - init_el2_state 110 - finalise_el2_state 105 + str lr, [x0, #NVHE_INIT_TMP] 106 + 107 + bl __kvm_init_el2_state 108 + 111 109 mrs x0, tpidr_el2 110 + ldr lr, [x0, #NVHE_INIT_TMP] 112 111 113 112 1: 114 113 ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] ··· 206 199 207 200 2: msr SPsel, #1 // We want to use SP_EL{1,2} 208 201 209 - /* Initialize EL2 CPU state to sane values. */ 210 - init_el2_state // Clobbers x0..x2 211 - finalise_el2_state 202 + bl __kvm_init_el2_state 203 + 212 204 __init_el2_nvhe_prepare_eret 213 205 214 206 /* Enable MMU, set vectors and stack. */
+6 -6
arch/arm64/kvm/hypercalls.c
··· 317 317 * to the guest, and hide SSBS so that the 318 318 * guest stays protected. 319 319 */ 320 - if (cpus_have_final_cap(ARM64_SSBS)) 320 + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) 321 321 break; 322 322 fallthrough; 323 323 case SPECTRE_UNAFFECTED: ··· 428 428 * Convert the workaround level into an easy-to-compare number, where higher 429 429 * values mean better protection. 430 430 */ 431 - static int get_kernel_wa_level(u64 regid) 431 + static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) 432 432 { 433 433 switch (regid) { 434 434 case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: ··· 449 449 * don't have any FW mitigation if SSBS is there at 450 450 * all times. 451 451 */ 452 - if (cpus_have_final_cap(ARM64_SSBS)) 452 + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) 453 453 return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; 454 454 fallthrough; 455 455 case SPECTRE_UNAFFECTED: ··· 486 486 case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: 487 487 case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: 488 488 case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: 489 - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; 489 + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; 490 490 break; 491 491 case KVM_REG_ARM_STD_BMAP: 492 492 val = READ_ONCE(smccc_feat->std_bmap); ··· 588 588 if (val & ~KVM_REG_FEATURE_LEVEL_MASK) 589 589 return -EINVAL; 590 590 591 - if (get_kernel_wa_level(reg->id) < val) 591 + if (get_kernel_wa_level(vcpu, reg->id) < val) 592 592 return -EINVAL; 593 593 594 594 return 0; ··· 624 624 * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the 625 625 * other way around. 626 626 */ 627 - if (get_kernel_wa_level(reg->id) < wa_level) 627 + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) 628 628 return -EINVAL; 629 629 630 630 return 0;
+8 -7
arch/arm64/kvm/mmu.c
··· 328 328 may_block)); 329 329 } 330 330 331 - void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) 331 + void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, 332 + u64 size, bool may_block) 332 333 { 333 - __unmap_stage2_range(mmu, start, size, true); 334 + __unmap_stage2_range(mmu, start, size, may_block); 334 335 } 335 336 336 337 void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) ··· 1016 1015 1017 1016 if (!(vma->vm_flags & VM_PFNMAP)) { 1018 1017 gpa_t gpa = addr + (vm_start - memslot->userspace_addr); 1019 - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start); 1018 + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); 1020 1019 } 1021 1020 hva = vm_end; 1022 1021 } while (hva < reg_end); ··· 1043 1042 kvm_for_each_memslot(memslot, bkt, slots) 1044 1043 stage2_unmap_memslot(kvm, memslot); 1045 1044 1046 - kvm_nested_s2_unmap(kvm); 1045 + kvm_nested_s2_unmap(kvm, true); 1047 1046 1048 1047 write_unlock(&kvm->mmu_lock); 1049 1048 mmap_read_unlock(current->mm); ··· 1913 1912 (range->end - range->start) << PAGE_SHIFT, 1914 1913 range->may_block); 1915 1914 1916 - kvm_nested_s2_unmap(kvm); 1915 + kvm_nested_s2_unmap(kvm, range->may_block); 1917 1916 return false; 1918 1917 } 1919 1918 ··· 2180 2179 phys_addr_t size = slot->npages << PAGE_SHIFT; 2181 2180 2182 2181 write_lock(&kvm->mmu_lock); 2183 - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size); 2184 - kvm_nested_s2_unmap(kvm); 2182 + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); 2183 + kvm_nested_s2_unmap(kvm, true); 2185 2184 write_unlock(&kvm->mmu_lock); 2186 2185 } 2187 2186
+46 -7
arch/arm64/kvm/nested.c
··· 632 632 /* Set the scene for the next search */ 633 633 kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; 634 634 635 - /* Clear the old state */ 635 + /* Make sure we don't forget to do the laundry */ 636 636 if (kvm_s2_mmu_valid(s2_mmu)) 637 - kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu)); 637 + s2_mmu->pending_unmap = true; 638 638 639 639 /* 640 640 * The virtual VMID (modulo CnP) will be used as a key when matching ··· 650 650 651 651 out: 652 652 atomic_inc(&s2_mmu->refcnt); 653 + 654 + /* 655 + * Set the vCPU request to perform an unmap, even if the pending unmap 656 + * originates from another vCPU. This guarantees that the MMU has been 657 + * completely unmapped before any vCPU actually uses it, and allows 658 + * multiple vCPUs to lend a hand with completing the unmap. 659 + */ 660 + if (s2_mmu->pending_unmap) 661 + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); 662 + 653 663 return s2_mmu; 654 664 } 655 665 ··· 673 663 674 664 void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) 675 665 { 666 + /* 667 + * The vCPU kept its reference on the MMU after the last put, keep 668 + * rolling with it. 669 + */ 670 + if (vcpu->arch.hw_mmu) 671 + return; 672 + 676 673 if (is_hyp_ctxt(vcpu)) { 677 674 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; 678 675 } else { ··· 691 674 692 675 void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) 693 676 { 694 - if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) { 677 + /* 678 + * Keep a reference on the associated stage-2 MMU if the vCPU is 679 + * scheduling out and not in WFI emulation, suggesting it is likely to 680 + * reuse the MMU sometime soon. 
681 + */ 682 + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) 683 + return; 684 + 685 + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) 695 686 atomic_dec(&vcpu->arch.hw_mmu->refcnt); 696 - vcpu->arch.hw_mmu = NULL; 697 - } 687 + 688 + vcpu->arch.hw_mmu = NULL; 698 689 } 699 690 700 691 /* ··· 755 730 } 756 731 } 757 732 758 - void kvm_nested_s2_unmap(struct kvm *kvm) 733 + void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) 759 734 { 760 735 int i; 761 736 ··· 765 740 struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; 766 741 767 742 if (kvm_s2_mmu_valid(mmu)) 768 - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu)); 743 + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); 769 744 } 770 745 } 771 746 ··· 1208 1183 set_sysreg_masks(kvm, SCTLR_EL1, res0, res1); 1209 1184 1210 1185 return 0; 1186 + } 1187 + 1188 + void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) 1189 + { 1190 + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { 1191 + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; 1192 + 1193 + write_lock(&vcpu->kvm->mmu_lock); 1194 + if (mmu->pending_unmap) { 1195 + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); 1196 + mmu->pending_unmap = false; 1197 + } 1198 + write_unlock(&vcpu->kvm->mmu_lock); 1199 + } 1211 1200 }
+70 -7
arch/arm64/kvm/sys_regs.c
··· 1527 1527 val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); 1528 1528 1529 1529 val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); 1530 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); 1531 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); 1532 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); 1533 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); 1534 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); 1535 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); 1536 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); 1537 + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); 1530 1538 break; 1531 1539 case SYS_ID_AA64PFR2_EL1: 1532 1540 /* We only expose FPMR */ ··· 1558 1550 val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; 1559 1551 break; 1560 1552 case SYS_ID_AA64MMFR3_EL1: 1561 - val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; 1553 + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | 1554 + ID_AA64MMFR3_EL1_S1PIE; 1562 1555 break; 1563 1556 case SYS_ID_MMFR4_EL1: 1564 1557 val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); ··· 1994 1985 * one cache line. 
1995 1986 */ 1996 1987 if (kvm_has_mte(vcpu->kvm)) 1997 - clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); 1988 + clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); 1998 1989 1999 1990 __vcpu_sys_reg(vcpu, r->reg) = clidr; 2000 1991 ··· 2385 2376 ID_AA64PFR0_EL1_RAS | 2386 2377 ID_AA64PFR0_EL1_AdvSIMD | 2387 2378 ID_AA64PFR0_EL1_FP), }, 2388 - ID_SANITISED(ID_AA64PFR1_EL1), 2379 + ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | 2380 + ID_AA64PFR1_EL1_DF2 | 2381 + ID_AA64PFR1_EL1_MTEX | 2382 + ID_AA64PFR1_EL1_THE | 2383 + ID_AA64PFR1_EL1_GCS | 2384 + ID_AA64PFR1_EL1_MTE_frac | 2385 + ID_AA64PFR1_EL1_NMI | 2386 + ID_AA64PFR1_EL1_RNDR_trap | 2387 + ID_AA64PFR1_EL1_SME | 2388 + ID_AA64PFR1_EL1_RES0 | 2389 + ID_AA64PFR1_EL1_MPAM_frac | 2390 + ID_AA64PFR1_EL1_RAS_frac | 2391 + ID_AA64PFR1_EL1_MTE)), 2389 2392 ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), 2390 2393 ID_UNALLOCATED(4,3), 2391 2394 ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), ··· 2411 2390 .get_user = get_id_reg, 2412 2391 .set_user = set_id_aa64dfr0_el1, 2413 2392 .reset = read_sanitised_id_aa64dfr0_el1, 2414 - .val = ID_AA64DFR0_EL1_PMUVer_MASK | 2393 + /* 2394 + * Prior to FEAT_Debugv8.9, the architecture defines context-aware 2395 + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). 2396 + * KVM does not trap + emulate the breakpoint registers, and as such 2397 + * cannot support a layout that misaligns with the underlying hardware. 2398 + * While it may be possible to describe a subset that aligns with 2399 + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for 2400 + * simplicity. 2401 + * 2402 + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking 2403 + * of breakpoints for more details. 
2404 + */ 2405 + .val = ID_AA64DFR0_EL1_DoubleLock_MASK | 2406 + ID_AA64DFR0_EL1_WRPs_MASK | 2407 + ID_AA64DFR0_EL1_PMUVer_MASK | 2415 2408 ID_AA64DFR0_EL1_DebugVer_MASK, }, 2416 2409 ID_SANITISED(ID_AA64DFR1_EL1), 2417 2410 ID_UNALLOCATED(5,2), ··· 2468 2433 ID_AA64MMFR2_EL1_NV | 2469 2434 ID_AA64MMFR2_EL1_CCIDX)), 2470 2435 ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | 2436 + ID_AA64MMFR3_EL1_S1PIE | 2471 2437 ID_AA64MMFR3_EL1_S1POE)), 2472 2438 ID_SANITISED(ID_AA64MMFR4_EL1), 2473 2439 ID_UNALLOCATED(7,5), ··· 2939 2903 * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the 2940 2904 * corresponding VMIDs. 2941 2905 */ 2942 - kvm_nested_s2_unmap(vcpu->kvm); 2906 + kvm_nested_s2_unmap(vcpu->kvm, true); 2943 2907 2944 2908 write_unlock(&vcpu->kvm->mmu_lock); 2945 2909 ··· 2991 2955 static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, 2992 2956 const union tlbi_info *info) 2993 2957 { 2994 - kvm_stage2_unmap_range(mmu, info->range.start, info->range.size); 2958 + /* 2959 + * The unmap operation is allowed to drop the MMU lock and block, which 2960 + * means that @mmu could be used for a different context than the one 2961 + * currently being invalidated. 2962 + * 2963 + * This behavior is still safe, as: 2964 + * 2965 + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating 2966 + * the entire MMU before reusing it, which still honors the intent 2967 + * of a TLBI. 2968 + * 2969 + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC 2970 + * and ERET to the guest), other vCPUs are allowed to use stale 2971 + * translations. 2972 + * 2973 + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and 2974 + * at worst may cause more aborts for shadow stage-2 fills. 2975 + * 2976 + * Dropping the MMU lock also implies that shadow stage-2 fills could 2977 + * happen behind the back of the TLBI. 
This is still safe, though, as 2978 + * the L1 needs to put its stage-2 in a consistent state before doing 2979 + * the TLBI. 2980 + */ 2981 + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); 2995 2982 } 2996 2983 2997 2984 static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, ··· 3109 3050 max_size = compute_tlb_inval_range(mmu, info->ipa.addr); 3110 3051 base_addr &= ~(max_size - 1); 3111 3052 3112 - kvm_stage2_unmap_range(mmu, base_addr, max_size); 3053 + /* 3054 + * See comment in s2_mmu_unmap_range() for why this is allowed to 3055 + * reschedule. 3056 + */ 3057 + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); 3113 3058 } 3114 3059 3115 3060 static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+35 -6
arch/arm64/kvm/vgic/vgic-init.c
··· 417 417 kfree(vgic_cpu->private_irqs); 418 418 vgic_cpu->private_irqs = NULL; 419 419 420 - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) 420 + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { 421 + /* 422 + * If this vCPU is being destroyed because of a failed creation 423 + * then unregister the redistributor to avoid leaving behind a 424 + * dangling pointer to the vCPU struct. 425 + * 426 + * vCPUs that have been successfully created (i.e. added to 427 + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as 428 + * this function gets called while holding kvm->arch.config_lock 429 + * in the VM teardown path and would otherwise introduce a lock 430 + * inversion w.r.t. kvm->srcu. 431 + * 432 + * vCPUs that failed creation are torn down outside of the 433 + * kvm->arch.config_lock and do not get unregistered in 434 + * kvm_vgic_destroy(), meaning it is both safe and necessary to 435 + * do so here. 436 + */ 437 + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) 438 + vgic_unregister_redist_iodev(vcpu); 439 + 421 440 vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; 441 + } 422 442 } 423 443 424 444 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) ··· 544 524 if (ret) 545 525 goto out; 546 526 547 - dist->ready = true; 548 527 dist_base = dist->vgic_dist_base; 549 528 mutex_unlock(&kvm->arch.config_lock); 550 529 551 530 ret = vgic_register_dist_iodev(kvm, dist_base, type); 552 - if (ret) 531 + if (ret) { 553 532 kvm_err("Unable to register VGIC dist MMIO regions\n"); 533 + goto out_slots; 534 + } 554 535 536 + /* 537 + * kvm_io_bus_register_dev() guarantees all readers see the new MMIO 538 + * registration before returning through synchronize_srcu(), which also 539 + * implies a full memory barrier. As such, marking the distributor as 540 + * 'ready' here is guaranteed to be ordered after all vCPUs having seen 541 + * a completely configured distributor. 
542 + */ 543 + dist->ready = true; 555 544 goto out_slots; 556 545 out: 557 546 mutex_unlock(&kvm->arch.config_lock); 558 547 out_slots: 559 - mutex_unlock(&kvm->slots_lock); 560 - 561 548 if (ret) 562 - kvm_vgic_destroy(kvm); 549 + kvm_vm_dead(kvm); 550 + 551 + mutex_unlock(&kvm->slots_lock); 563 552 564 553 return ret; 565 554 }
+6 -1
arch/arm64/kvm/vgic/vgic-kvm-device.c
··· 236 236 237 237 mutex_lock(&dev->kvm->arch.config_lock); 238 238 239 - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) 239 + /* 240 + * Either userspace has already configured NR_IRQS or 241 + * the vgic has already been initialized and vgic_init() 242 + * supplied a default amount of SPIs. 243 + */ 244 + if (dev->kvm->arch.vgic.nr_spis) 240 245 ret = -EBUSY; 241 246 else 242 247 dev->kvm->arch.vgic.nr_spis =
+4 -4
arch/riscv/kvm/aia_imsic.c
··· 55 55 /* IMSIC SW-file */ 56 56 struct imsic_mrif *swfile; 57 57 phys_addr_t swfile_pa; 58 - spinlock_t swfile_extirq_lock; 58 + raw_spinlock_t swfile_extirq_lock; 59 59 }; 60 60 61 61 #define imsic_vs_csr_read(__c) \ ··· 622 622 * interruptions between reading topei and updating pending status. 623 623 */ 624 624 625 - spin_lock_irqsave(&imsic->swfile_extirq_lock, flags); 625 + raw_spin_lock_irqsave(&imsic->swfile_extirq_lock, flags); 626 626 627 627 if (imsic_mrif_atomic_read(mrif, &mrif->eidelivery) && 628 628 imsic_mrif_topei(mrif, imsic->nr_eix, imsic->nr_msis)) ··· 630 630 else 631 631 kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); 632 632 633 - spin_unlock_irqrestore(&imsic->swfile_extirq_lock, flags); 633 + raw_spin_unlock_irqrestore(&imsic->swfile_extirq_lock, flags); 634 634 } 635 635 636 636 static void imsic_swfile_read(struct kvm_vcpu *vcpu, bool clear, ··· 1051 1051 } 1052 1052 imsic->swfile = page_to_virt(swfile_page); 1053 1053 imsic->swfile_pa = page_to_phys(swfile_page); 1054 - spin_lock_init(&imsic->swfile_extirq_lock); 1054 + raw_spin_lock_init(&imsic->swfile_extirq_lock); 1055 1055 1056 1056 /* Setup IO device */ 1057 1057 kvm_iodevice_init(&imsic->iodev, &imsic_iodoev_ops);
+4
arch/x86/kernel/kvm.c
··· 37 37 #include <asm/apic.h> 38 38 #include <asm/apicdef.h> 39 39 #include <asm/hypervisor.h> 40 + #include <asm/mtrr.h> 40 41 #include <asm/tlb.h> 41 42 #include <asm/cpuidle_haltpoll.h> 42 43 #include <asm/ptrace.h> ··· 981 980 } 982 981 kvmclock_init(); 983 982 x86_platform.apic_post_init = kvm_apic_init; 983 + 984 + /* Set WB as the default cache mode for SEV-SNP and TDX */ 985 + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); 984 986 } 985 987 986 988 #if defined(CONFIG_AMD_MEM_ENCRYPT)
+17 -10
arch/x86/kvm/mmu/mmu.c
··· 1556 1556 { 1557 1557 bool flush = false; 1558 1558 1559 + /* 1560 + * To prevent races with vCPUs faulting in a gfn using stale data, 1561 + * zapping a gfn range must be protected by mmu_invalidate_in_progress 1562 + * (and mmu_invalidate_seq). The only exception is memslot deletion; 1563 + * in that case, SRCU synchronization ensures that SPTEs are zapped 1564 + * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the 1565 + * invalid slot. 1566 + */ 1567 + lockdep_assert_once(kvm->mmu_invalidate_in_progress || 1568 + lockdep_is_held(&kvm->slots_lock)); 1569 + 1559 1570 if (kvm_memslots_have_rmaps(kvm)) 1560 1571 flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, 1561 1572 range->start, range->end, ··· 1895 1884 if (is_obsolete_sp((_kvm), (_sp))) { \ 1896 1885 } else 1897 1886 1898 - #define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ 1887 + #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ 1899 1888 for_each_valid_sp(_kvm, _sp, \ 1900 1889 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ 1901 - if ((_sp)->gfn != (_gfn)) {} else 1902 - 1903 - #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ 1904 - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ 1905 - if (!sp_has_gptes(_sp)) {} else 1890 + if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else 1906 1891 1907 1892 static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1908 1893 { ··· 7070 7063 7071 7064 /* 7072 7065 * Since accounting information is stored in struct kvm_arch_memory_slot, 7073 - * shadow pages deletion (e.g. unaccount_shadowed()) requires that all 7074 - * gfns with a shadow page have a corresponding memslot. Do so before 7075 - * the memslot goes away. 7066 + * all MMU pages that are shadowing guest PTEs must be zapped before the 7067 + * memslot is deleted, as freeing such pages after the memslot is freed 7068 + * will result in use-after-free, e.g. in unaccount_shadowed(). 
7076 7069 */ 7077 7070 for (i = 0; i < slot->npages; i++) { 7078 7071 struct kvm_mmu_page *sp; 7079 7072 gfn_t gfn = slot->base_gfn + i; 7080 7073 7081 - for_each_gfn_valid_sp(kvm, sp, gfn) 7074 + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) 7082 7075 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 7083 7076 7084 7077 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+5 -1
arch/x86/kvm/svm/nested.c
··· 63 63 u64 pdpte; 64 64 int ret; 65 65 66 + /* 67 + * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores 68 + * nCR3[4:0] when loading PDPTEs from memory. 69 + */ 66 70 ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, 67 - offset_in_page(cr3) + index * 8, 8); 71 + (cr3 & GENMASK(11, 5)) + index * 8, 8); 68 72 if (ret) 69 73 return 0; 70 74 return pdpte;
+3 -3
arch/x86/kvm/vmx/vmx.c
··· 4888 4888 vmx->hv_deadline_tsc = -1; 4889 4889 kvm_set_cr8(vcpu, 0); 4890 4890 4891 - vmx_segment_cache_clear(vmx); 4892 - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4893 - 4894 4891 seg_setup(VCPU_SREG_CS); 4895 4892 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4896 4893 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); ··· 4913 4916 4914 4917 vmcs_writel(GUEST_IDTR_BASE, 0); 4915 4918 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4919 + 4920 + vmx_segment_cache_clear(vmx); 4921 + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4916 4922 4917 4923 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4918 4924 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-2
include/linux/kvm_host.h
··· 1313 1313 1314 1314 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); 1315 1315 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); 1316 - kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); 1317 - kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 1318 1316 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); 1319 1317 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); 1320 1318 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+1
tools/testing/selftests/kvm/Makefile
··· 244 244 -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ 245 245 -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ 246 246 -I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \ 247 + -march=x86-64-v2 \ 247 248 $(KHDR_INCLUDES) 248 249 ifeq ($(ARCH),s390) 249 250 CFLAGS += -march=z10
+13 -3
tools/testing/selftests/kvm/aarch64/set_id_regs.c
··· 68 68 } 69 69 70 70 static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { 71 + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DoubleLock, 0), 72 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, WRPs, 0), 71 73 S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), 72 74 REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), 73 75 REG_FTR_END, ··· 133 131 REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), 134 132 REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), 135 133 REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL0, 0), 134 + REG_FTR_END, 135 + }; 136 + 137 + static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { 138 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, CSV2_frac, 0), 139 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, SSBS, ID_AA64PFR1_EL1_SSBS_NI), 140 + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, BT, 0), 136 141 REG_FTR_END, 137 142 }; 138 143 ··· 209 200 TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), 210 201 TEST_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2_el1), 211 202 TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), 203 + TEST_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1_el1), 212 204 TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), 213 205 TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), 214 206 TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), ··· 579 569 test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + 580 570 ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + 581 571 ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + 582 - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + 583 - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - 584 - ARRAY_SIZE(test_regs) + 2; 572 + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + 573 + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + 574 + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - 
ARRAY_SIZE(test_regs) + 2; 585 575 586 576 ksft_set_plan(test_cnt); 587 577
+1 -1
tools/testing/selftests/kvm/x86_64/cpuid_test.c
··· 60 60 { 61 61 int i; 62 62 63 - for (i = 0; i < sizeof(mangled_cpuids); i++) { 63 + for (i = 0; i < ARRAY_SIZE(mangled_cpuids); i++) { 64 64 if (mangled_cpuids[i].function == entrie->function && 65 65 mangled_cpuids[i].index == entrie->index) 66 66 return true;
-12
virt/kvm/kvm_main.c
··· 3035 3035 } 3036 3036 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 3037 3037 3038 - kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 3039 - { 3040 - return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 3041 - } 3042 - EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 3043 - 3044 3038 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 3045 3039 { 3046 3040 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 3047 3041 } 3048 3042 EXPORT_SYMBOL_GPL(gfn_to_pfn); 3049 - 3050 - kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 3051 - { 3052 - return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 3053 - } 3054 - EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 3055 3043 3056 3044 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 3057 3045 struct page **pages, int nr_pages)