Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"These are mostly Oliver's Arm changes: lock ordering fixes for the
vGIC, and reverts for a buggy attempt to avoid RCU stalls on large
VMs.

Arm:

- Invalidate nested MMUs upon freeing the PGD to avoid WARNs when
visiting from an MMU notifier

- Fixes to the TLB match process and TLB invalidation range for
managing the VNCR pseudo-TLB

- Prevent SPE from erroneously profiling guests due to UNKNOWN reset
values in PMSCR_EL1

- Fix save/restore of host MDCR_EL2 to account for eagerly
programming at vcpu_load() on VHE systems

- Correct lock ordering when dealing with VGIC LPIs, avoiding
scenarios where an xarray's spinlock was nested with a *raw*
spinlock

- Permit stage-2 read permission aborts which are possible in the
case of NV depending on the guest hypervisor's stage-2 translation

- Call raw_spin_unlock() instead of the internal spinlock API

- Fix parameter ordering when assigning VBAR_EL1

- Revert a couple of fixes for RCU stalls when destroying a stage-2
page table.

There appear to be some nasty refcounting / UAF issues lurking in
those patches, and the band-aid we tried to apply didn't hold.

s390:

- mm fixes, including userfaultfd bug fix

x86:

- Sync the vTPR from the local APIC to the VMCB even when AVIC is
active.

This fixes a bug where host updates to the vTPR, e.g. via
KVM_SET_LAPIC or emulation of a guest access, are lost and result
in interrupt delivery issues in the guest"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: SVM: Sync TPR from LAPIC into VMCB::V_TPR even if AVIC is active
Revert "KVM: arm64: Split kvm_pgtable_stage2_destroy()"
Revert "KVM: arm64: Reschedule as needed when destroying the stage-2 page-tables"
KVM: arm64: vgic: fix incorrect spinlock API usage
KVM: arm64: Remove stage 2 read fault check
KVM: arm64: Fix parameter ordering for VBAR_EL1 assignment
KVM: arm64: nv: Fix incorrect VNCR invalidation range calculation
KVM: arm64: vgic-v3: Indicate vgic_put_irq() may take LPI xarray lock
KVM: arm64: vgic-v3: Don't require IRQs be disabled for LPI xarray lock
KVM: arm64: vgic-v3: Erase LPIs from xarray outside of raw spinlocks
KVM: arm64: Spin off release helper from vgic_put_irq()
KVM: arm64: vgic-v3: Use bare refcount for VGIC LPIs
KVM: arm64: vgic: Drop stale comment on IRQ active state
KVM: arm64: VHE: Save and restore host MDCR_EL2 value correctly
KVM: arm64: Initialize PMSCR_EL1 when in VHE
KVM: arm64: nv: fix VNCR TLB ASID match logic for non-Global entries
KVM: s390: Fix FOLL_*/FAULT_FLAG_* confusion
KVM: s390: Fix incorrect usage of mmu_notifier_register()
KVM: s390: Fix access to unavailable adapter indicator pages during postcopy
KVM: arm64: Mark freed S2 MMUs as invalid

+155 -177
+1
arch/arm64/include/asm/kvm_host.h
··· 1369 1369 } 1370 1370 1371 1371 void kvm_init_host_debug_data(void); 1372 + void kvm_debug_init_vhe(void); 1372 1373 void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); 1373 1374 void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); 1374 1375 void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu);
-30
arch/arm64/include/asm/kvm_pgtable.h
··· 355 355 return pteref; 356 356 } 357 357 358 - static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) 359 - { 360 - return pteref; 361 - } 362 - 363 358 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) 364 359 { 365 360 /* ··· 382 387 kvm_pteref_t pteref) 383 388 { 384 389 return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); 385 - } 386 - 387 - static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) 388 - { 389 - return rcu_dereference_raw(pteref); 390 390 } 391 391 392 392 static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) ··· 550 560 * to freeing and therefore no TLB invalidation is performed. 551 561 */ 552 562 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); 553 - 554 - /** 555 - * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. 556 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 557 - * @addr: Intermediate physical address at which to place the mapping. 558 - * @size: Size of the mapping. 559 - * 560 - * The page-table is assumed to be unreachable by any hardware walkers prior 561 - * to freeing and therefore no TLB invalidation is performed. 562 - */ 563 - void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, 564 - u64 addr, u64 size); 565 - 566 - /** 567 - * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. 568 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 569 - * 570 - * It is assumed that the rest of the page-table is freed before this operation. 571 - */ 572 - void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); 573 563 574 564 /** 575 565 * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
+1 -3
arch/arm64/include/asm/kvm_pkvm.h
··· 179 179 180 180 int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, 181 181 struct kvm_pgtable_mm_ops *mm_ops); 182 - void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, 183 - u64 addr, u64 size); 184 - void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); 182 + void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); 185 183 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, 186 184 enum kvm_pgtable_prot prot, void *mc, 187 185 enum kvm_pgtable_walk_flags flags);
+3 -1
arch/arm64/kvm/arm.c
··· 2113 2113 { 2114 2114 cpu_set_hyp_vector(); 2115 2115 2116 - if (is_kernel_in_hyp_mode()) 2116 + if (is_kernel_in_hyp_mode()) { 2117 2117 kvm_timer_init_vhe(); 2118 + kvm_debug_init_vhe(); 2119 + } 2118 2120 2119 2121 if (vgic_present) 2120 2122 kvm_vgic_init_cpu_hardware();
+13
arch/arm64/kvm/debug.c
··· 96 96 } 97 97 } 98 98 99 + void kvm_debug_init_vhe(void) 100 + { 101 + /* Clear PMSCR_EL1.E{0,1}SPE which reset to UNKNOWN values. */ 102 + if (SYS_FIELD_GET(ID_AA64DFR0_EL1, PMSVer, read_sysreg(id_aa64dfr0_el1))) 103 + write_sysreg_el1(0, SYS_PMSCR); 104 + } 105 + 99 106 /* 100 107 * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host 101 108 * has taken over MDSCR_EL1. ··· 144 137 145 138 /* Must be called before kvm_vcpu_load_vhe() */ 146 139 KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); 140 + 141 + if (has_vhe()) 142 + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); 147 143 148 144 /* 149 145 * Determine which of the possible debug states we're in: ··· 194 184 195 185 void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) 196 186 { 187 + if (has_vhe()) 188 + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); 189 + 197 190 if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) 198 191 return; 199 192
-5
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 431 431 vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); 432 432 } 433 433 434 - *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); 435 - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 436 - 437 434 if (cpus_have_final_cap(ARM64_HAS_HCX)) { 438 435 u64 hcrx = vcpu->arch.hcrx_el2; 439 436 if (is_nested_ctxt(vcpu)) { ··· 450 453 static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) 451 454 { 452 455 struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); 453 - 454 - write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); 455 456 456 457 write_sysreg(0, hstr_el2); 457 458 if (system_supports_pmuv3()) {
+6
arch/arm64/kvm/hyp/nvhe/switch.c
··· 50 50 static void __activate_traps(struct kvm_vcpu *vcpu) 51 51 { 52 52 ___activate_traps(vcpu, vcpu->arch.hcr_el2); 53 + 54 + *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2); 55 + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 56 + 53 57 __activate_traps_common(vcpu); 54 58 __activate_cptr_traps(vcpu); 55 59 ··· 96 92 write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR); 97 93 isb(); 98 94 } 95 + 96 + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); 99 97 100 98 __deactivate_traps_common(vcpu); 101 99
+1 -1
arch/arm64/kvm/hyp/nvhe/sys_regs.c
··· 253 253 254 254 *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); 255 255 *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); 256 - __vcpu_assign_sys_reg(vcpu, read_sysreg_el1(SYS_VBAR), VBAR_EL1); 256 + __vcpu_assign_sys_reg(vcpu, VBAR_EL1, read_sysreg_el1(SYS_VBAR)); 257 257 258 258 kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); 259 259
+4 -21
arch/arm64/kvm/hyp/pgtable.c
··· 1551 1551 return 0; 1552 1552 } 1553 1553 1554 - void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, 1555 - u64 addr, u64 size) 1554 + void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) 1556 1555 { 1556 + size_t pgd_sz; 1557 1557 struct kvm_pgtable_walker walker = { 1558 1558 .cb = stage2_free_walker, 1559 1559 .flags = KVM_PGTABLE_WALK_LEAF | 1560 1560 KVM_PGTABLE_WALK_TABLE_POST, 1561 1561 }; 1562 1562 1563 - WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); 1564 - } 1565 - 1566 - void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) 1567 - { 1568 - size_t pgd_sz; 1569 - 1563 + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); 1570 1564 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; 1571 - 1572 - /* 1573 - * Since the pgtable is unlinked at this point, and not shared with 1574 - * other walkers, safely deference pgd with kvm_dereference_pteref_raw() 1575 - */ 1576 - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); 1565 + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); 1577 1566 pgt->pgd = NULL; 1578 - } 1579 - 1580 - void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) 1581 - { 1582 - kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); 1583 - kvm_pgtable_stage2_destroy_pgd(pgt); 1584 1567 } 1585 1568 1586 1569 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
+6 -39
arch/arm64/kvm/mmu.c
··· 904 904 return 0; 905 905 } 906 906 907 - /* 908 - * Assume that @pgt is valid and unlinked from the KVM MMU to free the 909 - * page-table without taking the kvm_mmu_lock and without performing any 910 - * TLB invalidations. 911 - * 912 - * Also, the range of addresses can be large enough to cause need_resched 913 - * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke 914 - * cond_resched() periodically to prevent hogging the CPU for a long time 915 - * and schedule something else, if required. 916 - */ 917 - static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, 918 - phys_addr_t end) 919 - { 920 - u64 next; 921 - 922 - do { 923 - next = stage2_range_addr_end(addr, end); 924 - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, 925 - next - addr); 926 - if (next != end) 927 - cond_resched(); 928 - } while (addr = next, addr != end); 929 - } 930 - 931 - static void kvm_stage2_destroy(struct kvm_pgtable *pgt) 932 - { 933 - unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); 934 - 935 - stage2_destroy_range(pgt, 0, BIT(ia_bits)); 936 - KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); 937 - } 938 - 939 907 /** 940 908 * kvm_init_stage2_mmu - Initialise a S2 MMU structure 941 909 * @kvm: The pointer to the KVM structure ··· 980 1012 return 0; 981 1013 982 1014 out_destroy_pgtable: 983 - kvm_stage2_destroy(pgt); 1015 + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 984 1016 out_free_pgtable: 985 1017 kfree(pgt); 986 1018 return err; ··· 1074 1106 mmu->pgt = NULL; 1075 1107 free_percpu(mmu->last_vcpu_ran); 1076 1108 } 1109 + 1110 + if (kvm_is_nested_s2_mmu(kvm, mmu)) 1111 + kvm_init_nested_s2_mmu(mmu); 1112 + 1077 1113 write_unlock(&kvm->mmu_lock); 1078 1114 1079 1115 if (pgt) { 1080 - kvm_stage2_destroy(pgt); 1116 + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); 1081 1117 kfree(pgt); 1082 1118 } 1083 1119 } ··· 1512 1540 write_fault = kvm_is_write_fault(vcpu); 1513 1541 exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1514 
1542 VM_BUG_ON(write_fault && exec_fault); 1515 - 1516 - if (fault_is_perm && !write_fault && !exec_fault) { 1517 - kvm_err("Unexpected L2 read permission error\n"); 1518 - return -EFAULT; 1519 - } 1520 1543 1521 1544 if (!is_protected_kvm_enabled()) 1522 1545 memcache = &vcpu->arch.mmu_page_cache;
+3 -3
arch/arm64/kvm/nested.c
··· 847 847 848 848 ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, 849 849 vt->wr.level)); 850 - ipa_start = vt->wr.pa & (ipa_size - 1); 850 + ipa_start = vt->wr.pa & ~(ipa_size - 1); 851 851 ipa_end = ipa_start + ipa_size; 852 852 853 853 if (ipa_end <= start || ipa_start >= end) ··· 887 887 888 888 va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift, 889 889 vt->wr.level)); 890 - va_start = vt->gva & (va_size - 1); 890 + va_start = vt->gva & ~(va_size - 1); 891 891 va_end = va_start + va_size; 892 892 893 893 switch (scope->type) { ··· 1276 1276 !(tcr & TCR_ASID16)) 1277 1277 asid &= GENMASK(7, 0); 1278 1278 1279 - return asid != vt->wr.asid; 1279 + return asid == vt->wr.asid; 1280 1280 } 1281 1281 1282 1282 return true;
+2 -9
arch/arm64/kvm/pkvm.c
··· 316 316 return 0; 317 317 } 318 318 319 - void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, 320 - u64 addr, u64 size) 319 + void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) 321 320 { 322 - __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); 323 - } 324 - 325 - void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) 326 - { 327 - /* Expected to be called after all pKVM mappings have been released. */ 328 - WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); 321 + __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); 329 322 } 330 323 331 324 int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+1 -1
arch/arm64/kvm/vgic/vgic-debug.c
··· 69 69 int nr_lpis = 0; 70 70 71 71 xa_for_each(&dist->lpi_xa, intid, irq) { 72 - if (!vgic_try_get_irq_kref(irq)) 72 + if (!vgic_try_get_irq_ref(irq)) 73 73 continue; 74 74 75 75 xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+3 -3
arch/arm64/kvm/vgic/vgic-init.c
··· 53 53 { 54 54 struct vgic_dist *dist = &kvm->arch.vgic; 55 55 56 - xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); 56 + xa_init(&dist->lpi_xa); 57 57 } 58 58 59 59 /* CREATION */ ··· 208 208 raw_spin_lock_init(&irq->irq_lock); 209 209 irq->vcpu = NULL; 210 210 irq->target_vcpu = vcpu0; 211 - kref_init(&irq->refcount); 211 + refcount_set(&irq->refcount, 0); 212 212 switch (dist->vgic_model) { 213 213 case KVM_DEV_TYPE_ARM_VGIC_V2: 214 214 irq->targets = 0; ··· 277 277 irq->intid = i; 278 278 irq->vcpu = NULL; 279 279 irq->target_vcpu = vcpu; 280 - kref_init(&irq->refcount); 280 + refcount_set(&irq->refcount, 0); 281 281 if (vgic_irq_is_sgi(i)) { 282 282 /* SGIs */ 283 283 irq->enabled = 1;
+7 -8
arch/arm64/kvm/vgic/vgic-its.c
··· 78 78 { 79 79 struct vgic_dist *dist = &kvm->arch.vgic; 80 80 struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; 81 - unsigned long flags; 82 81 int ret; 83 82 84 83 /* In this case there is no put, since we keep the reference. */ ··· 88 89 if (!irq) 89 90 return ERR_PTR(-ENOMEM); 90 91 91 - ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); 92 + ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); 92 93 if (ret) { 93 94 kfree(irq); 94 95 return ERR_PTR(ret); ··· 98 99 raw_spin_lock_init(&irq->irq_lock); 99 100 100 101 irq->config = VGIC_CONFIG_EDGE; 101 - kref_init(&irq->refcount); 102 + refcount_set(&irq->refcount, 1); 102 103 irq->intid = intid; 103 104 irq->target_vcpu = vcpu; 104 105 irq->group = 1; 105 106 106 - xa_lock_irqsave(&dist->lpi_xa, flags); 107 + xa_lock(&dist->lpi_xa); 107 108 108 109 /* 109 110 * There could be a race with another vgic_add_lpi(), so we need to 110 111 * check that we don't add a second list entry with the same LPI. 111 112 */ 112 113 oldirq = xa_load(&dist->lpi_xa, intid); 113 - if (vgic_try_get_irq_kref(oldirq)) { 114 + if (vgic_try_get_irq_ref(oldirq)) { 114 115 /* Someone was faster with adding this LPI, lets use that. */ 115 116 kfree(irq); 116 117 irq = oldirq; ··· 125 126 } 126 127 127 128 out_unlock: 128 - xa_unlock_irqrestore(&dist->lpi_xa, flags); 129 + xa_unlock(&dist->lpi_xa); 129 130 130 131 if (ret) 131 132 return ERR_PTR(ret); ··· 546 547 rcu_read_lock(); 547 548 548 549 irq = xa_load(&its->translation_cache, cache_key); 549 - if (!vgic_try_get_irq_kref(irq)) 550 + if (!vgic_try_get_irq_ref(irq)) 550 551 irq = NULL; 551 552 552 553 rcu_read_unlock(); ··· 570 571 * its_lock, as the ITE (and the reference it holds) cannot be freed. 571 572 */ 572 573 lockdep_assert_held(&its->its_lock); 573 - vgic_get_irq_kref(irq); 574 + vgic_get_irq_ref(irq); 574 575 575 576 old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT); 576 577
+1 -1
arch/arm64/kvm/vgic/vgic-v4.c
··· 518 518 if (!irq->hw || irq->host_irq != host_irq) 519 519 continue; 520 520 521 - if (!vgic_try_get_irq_kref(irq)) 521 + if (!vgic_try_get_irq_ref(irq)) 522 522 return NULL; 523 523 524 524 return irq;
+58 -22
arch/arm64/kvm/vgic/vgic.c
··· 28 28 * kvm->arch.config_lock (mutex) 29 29 * its->cmd_lock (mutex) 30 30 * its->its_lock (mutex) 31 - * vgic_cpu->ap_list_lock must be taken with IRQs disabled 32 - * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled 31 + * vgic_dist->lpi_xa.xa_lock 32 + * vgic_cpu->ap_list_lock must be taken with IRQs disabled 33 33 * vgic_irq->irq_lock must be taken with IRQs disabled 34 34 * 35 35 * As the ap_list_lock might be taken from the timer interrupt handler, ··· 71 71 rcu_read_lock(); 72 72 73 73 irq = xa_load(&dist->lpi_xa, intid); 74 - if (!vgic_try_get_irq_kref(irq)) 74 + if (!vgic_try_get_irq_ref(irq)) 75 75 irq = NULL; 76 76 77 77 rcu_read_unlock(); ··· 114 114 return vgic_get_irq(vcpu->kvm, intid); 115 115 } 116 116 117 - /* 118 - * We can't do anything in here, because we lack the kvm pointer to 119 - * lock and remove the item from the lpi_list. So we keep this function 120 - * empty and use the return value of kref_put() to trigger the freeing. 121 - */ 122 - static void vgic_irq_release(struct kref *ref) 117 + static void vgic_release_lpi_locked(struct vgic_dist *dist, struct vgic_irq *irq) 123 118 { 119 + lockdep_assert_held(&dist->lpi_xa.xa_lock); 120 + __xa_erase(&dist->lpi_xa, irq->intid); 121 + kfree_rcu(irq, rcu); 122 + } 123 + 124 + static __must_check bool __vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) 125 + { 126 + if (irq->intid < VGIC_MIN_LPI) 127 + return false; 128 + 129 + return refcount_dec_and_test(&irq->refcount); 130 + } 131 + 132 + static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq *irq) 133 + { 134 + if (!__vgic_put_irq(kvm, irq)) 135 + return false; 136 + 137 + irq->pending_release = true; 138 + return true; 124 139 } 125 140 126 141 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) 127 142 { 128 143 struct vgic_dist *dist = &kvm->arch.vgic; 129 - unsigned long flags; 130 144 131 - if (irq->intid < VGIC_MIN_LPI) 145 + if (irq->intid >= VGIC_MIN_LPI) 146 + 
might_lock(&dist->lpi_xa.xa_lock); 147 + 148 + if (!__vgic_put_irq(kvm, irq)) 132 149 return; 133 150 134 - if (!kref_put(&irq->refcount, vgic_irq_release)) 135 - return; 151 + xa_lock(&dist->lpi_xa); 152 + vgic_release_lpi_locked(dist, irq); 153 + xa_unlock(&dist->lpi_xa); 154 + } 136 155 137 - xa_lock_irqsave(&dist->lpi_xa, flags); 138 - __xa_erase(&dist->lpi_xa, irq->intid); 139 - xa_unlock_irqrestore(&dist->lpi_xa, flags); 156 + static void vgic_release_deleted_lpis(struct kvm *kvm) 157 + { 158 + struct vgic_dist *dist = &kvm->arch.vgic; 159 + unsigned long intid; 160 + struct vgic_irq *irq; 140 161 141 - kfree_rcu(irq, rcu); 162 + xa_lock(&dist->lpi_xa); 163 + 164 + xa_for_each(&dist->lpi_xa, intid, irq) { 165 + if (irq->pending_release) 166 + vgic_release_lpi_locked(dist, irq); 167 + } 168 + 169 + xa_unlock(&dist->lpi_xa); 142 170 } 143 171 144 172 void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) 145 173 { 146 174 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 147 175 struct vgic_irq *irq, *tmp; 176 + bool deleted = false; 148 177 unsigned long flags; 149 178 150 179 raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); ··· 184 155 list_del(&irq->ap_list); 185 156 irq->vcpu = NULL; 186 157 raw_spin_unlock(&irq->irq_lock); 187 - vgic_put_irq(vcpu->kvm, irq); 158 + deleted |= vgic_put_irq_norelease(vcpu->kvm, irq); 188 159 } 189 160 } 190 161 191 162 raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); 163 + 164 + if (deleted) 165 + vgic_release_deleted_lpis(vcpu->kvm); 192 166 } 193 167 194 168 void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) ··· 431 399 * now in the ap_list. This is safe as the caller must already hold a 432 400 * reference on the irq. 
433 401 */ 434 - vgic_get_irq_kref(irq); 402 + vgic_get_irq_ref(irq); 435 403 list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); 436 404 irq->vcpu = vcpu; 437 405 ··· 662 630 { 663 631 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 664 632 struct vgic_irq *irq, *tmp; 633 + bool deleted_lpis = false; 665 634 666 635 DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); 667 636 ··· 690 657 691 658 /* 692 659 * This vgic_put_irq call matches the 693 - * vgic_get_irq_kref in vgic_queue_irq_unlock, 660 + * vgic_get_irq_ref in vgic_queue_irq_unlock, 694 661 * where we added the LPI to the ap_list. As 695 662 * we remove the irq from the list, we drop 696 663 * also drop the refcount. 697 664 */ 698 - vgic_put_irq(vcpu->kvm, irq); 665 + deleted_lpis |= vgic_put_irq_norelease(vcpu->kvm, irq); 699 666 continue; 700 667 } 701 668 ··· 758 725 } 759 726 760 727 raw_spin_unlock(&vgic_cpu->ap_list_lock); 728 + 729 + if (unlikely(deleted_lpis)) 730 + vgic_release_deleted_lpis(vcpu->kvm); 761 731 } 762 732 763 733 static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) ··· 854 818 * the AP list has been sorted already. 855 819 */ 856 820 if (multi_sgi && irq->priority > prio) { 857 - _raw_spin_unlock(&irq->irq_lock); 821 + raw_spin_unlock(&irq->irq_lock); 858 822 break; 859 823 } 860 824
+4 -4
arch/arm64/kvm/vgic/vgic.h
··· 267 267 void vgic_v2_save_state(struct kvm_vcpu *vcpu); 268 268 void vgic_v2_restore_state(struct kvm_vcpu *vcpu); 269 269 270 - static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) 270 + static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq) 271 271 { 272 272 if (!irq) 273 273 return false; ··· 275 275 if (irq->intid < VGIC_MIN_LPI) 276 276 return true; 277 277 278 - return kref_get_unless_zero(&irq->refcount); 278 + return refcount_inc_not_zero(&irq->refcount); 279 279 } 280 280 281 - static inline void vgic_get_irq_kref(struct vgic_irq *irq) 281 + static inline void vgic_get_irq_ref(struct vgic_irq *irq) 282 282 { 283 - WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); 283 + WARN_ON_ONCE(!vgic_try_get_irq_ref(irq)); 284 284 } 285 285 286 286 void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+11 -4
arch/s390/kvm/interrupt.c
··· 2778 2778 2779 2779 static struct page *get_map_page(struct kvm *kvm, u64 uaddr) 2780 2780 { 2781 + struct mm_struct *mm = kvm->mm; 2781 2782 struct page *page = NULL; 2783 + int locked = 1; 2782 2784 2783 - mmap_read_lock(kvm->mm); 2784 - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, 2785 - &page, NULL); 2786 - mmap_read_unlock(kvm->mm); 2785 + if (mmget_not_zero(mm)) { 2786 + mmap_read_lock(mm); 2787 + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, 2788 + &page, &locked); 2789 + if (locked) 2790 + mmap_read_unlock(mm); 2791 + mmput(mm); 2792 + } 2793 + 2787 2794 return page; 2788 2795 } 2789 2796
+12 -12
arch/s390/kvm/kvm-s390.c
··· 4864 4864 * @vcpu: the vCPU whose gmap is to be fixed up 4865 4865 * @gfn: the guest frame number used for memslots (including fake memslots) 4866 4866 * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps 4867 - * @flags: FOLL_* flags 4867 + * @foll: FOLL_* flags 4868 4868 * 4869 4869 * Return: 0 on success, < 0 in case of error. 4870 4870 * Context: The mm lock must not be held before calling. May sleep. 4871 4871 */ 4872 - int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) 4872 + int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) 4873 4873 { 4874 4874 struct kvm_memory_slot *slot; 4875 4875 unsigned int fault_flags; ··· 4883 4883 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 4884 4884 return vcpu_post_run_addressing_exception(vcpu); 4885 4885 4886 - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; 4886 + fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; 4887 4887 if (vcpu->arch.gmap->pfault_enabled) 4888 - flags |= FOLL_NOWAIT; 4888 + foll |= FOLL_NOWAIT; 4889 4889 vmaddr = __gfn_to_hva_memslot(slot, gfn); 4890 4890 4891 4891 try_again: 4892 - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); 4892 + pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); 4893 4893 4894 4894 /* Access outside memory, inject addressing exception */ 4895 4895 if (is_noslot_pfn(pfn)) ··· 4905 4905 return 0; 4906 4906 vcpu->stat.pfault_sync++; 4907 4907 /* Could not setup async pfault, try again synchronously */ 4908 - flags &= ~FOLL_NOWAIT; 4908 + foll &= ~FOLL_NOWAIT; 4909 4909 goto try_again; 4910 4910 } 4911 4911 /* Any other error */ ··· 4925 4925 return rc; 4926 4926 } 4927 4927 4928 - static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) 4928 + static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) 4929 4929 { 4930 4930 unsigned long gaddr_tmp; 
4931 4931 gfn_t gfn; ··· 4950 4950 } 4951 4951 gfn = gpa_to_gfn(gaddr_tmp); 4952 4952 } 4953 - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); 4953 + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); 4954 4954 } 4955 4955 4956 4956 static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) 4957 4957 { 4958 - unsigned int flags = 0; 4958 + unsigned int foll = 0; 4959 4959 unsigned long gaddr; 4960 4960 int rc; 4961 4961 4962 4962 gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; 4963 4963 if (kvm_s390_cur_gmap_fault_is_write()) 4964 - flags = FAULT_FLAG_WRITE; 4964 + foll = FOLL_WRITE; 4965 4965 4966 4966 switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { 4967 4967 case 0: ··· 5003 5003 send_sig(SIGSEGV, current, 0); 5004 5004 if (rc != -ENXIO) 5005 5005 break; 5006 - flags = FAULT_FLAG_WRITE; 5006 + foll = FOLL_WRITE; 5007 5007 fallthrough; 5008 5008 case PGM_PROTECTION: 5009 5009 case PGM_SEGMENT_TRANSLATION: ··· 5013 5013 case PGM_REGION_SECOND_TRANS: 5014 5014 case PGM_REGION_THIRD_TRANS: 5015 5015 kvm_s390_assert_primary_as(vcpu); 5016 - return vcpu_dat_fault_handler(vcpu, gaddr, flags); 5016 + return vcpu_dat_fault_handler(vcpu, gaddr, foll); 5017 5017 default: 5018 5018 KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", 5019 5019 current->thread.gmap_int_code, current->thread.gmap_teid.val);
+11 -5
arch/s390/kvm/pv.c
··· 624 624 int cc, ret; 625 625 u16 dummy; 626 626 627 + /* Add the notifier only once. No races because we hold kvm->lock */ 628 + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { 629 + /* The notifier will be unregistered when the VM is destroyed */ 630 + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; 631 + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); 632 + if (ret) { 633 + kvm->arch.pv.mmu_notifier.ops = NULL; 634 + return ret; 635 + } 636 + } 637 + 627 638 ret = kvm_s390_pv_alloc_vm(kvm); 628 639 if (ret) 629 640 return ret; ··· 670 659 return -EIO; 671 660 } 672 661 kvm->arch.gmap->guest_handle = uvcb.guest_handle; 673 - /* Add the notifier only once. No races because we hold kvm->lock */ 674 - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { 675 - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; 676 - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); 677 - } 678 662 return 0; 679 663 } 680 664
+1 -2
arch/x86/kvm/svm/svm.c
··· 4046 4046 struct vcpu_svm *svm = to_svm(vcpu); 4047 4047 u64 cr8; 4048 4048 4049 - if (nested_svm_virtualize_tpr(vcpu) || 4050 - kvm_vcpu_apicv_active(vcpu)) 4049 + if (nested_svm_virtualize_tpr(vcpu)) 4051 4050 return; 4052 4051 4053 4052 cr8 = kvm_get_cr8(vcpu);
+6 -3
include/kvm/arm_vgic.h
··· 8 8 #include <linux/bits.h> 9 9 #include <linux/kvm.h> 10 10 #include <linux/irqreturn.h> 11 - #include <linux/kref.h> 12 11 #include <linux/mutex.h> 12 + #include <linux/refcount.h> 13 13 #include <linux/spinlock.h> 14 14 #include <linux/static_key.h> 15 15 #include <linux/types.h> ··· 139 139 bool pending_latch; /* The pending latch state used to calculate 140 140 * the pending state for both level 141 141 * and edge triggered IRQs. */ 142 - bool active; /* not used for LPIs */ 142 + bool active; 143 + bool pending_release; /* Used for LPIs only, unreferenced IRQ 144 + * pending a release */ 145 + 143 146 bool enabled; 144 147 bool hw; /* Tied to HW IRQ */ 145 - struct kref refcount; /* Used for LPIs */ 148 + refcount_t refcount; /* Used for LPIs */ 146 149 u32 hwintid; /* HW INTID number */ 147 150 unsigned int host_irq; /* linux irq corresponding to hwintid */ 148 151 union {