Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD

+8 -12

arch/arm64/kvm/arm.c

··· 2765 2765 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); 2766 2766 } 2767 2767 2768 - bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, 2769 - struct kvm_kernel_irq_routing_entry *new) 2768 + void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, 2769 + struct kvm_kernel_irq_routing_entry *old, 2770 + struct kvm_kernel_irq_routing_entry *new) 2770 2771 { 2771 - if (old->type != KVM_IRQ_ROUTING_MSI || 2772 - new->type != KVM_IRQ_ROUTING_MSI) 2773 - return true; 2772 + if (old->type == KVM_IRQ_ROUTING_MSI && 2773 + new->type == KVM_IRQ_ROUTING_MSI && 2774 + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) 2775 + return; 2774 2776 2775 - return memcmp(&old->msi, &new->msi, sizeof(new->msi)); 2776 - } 2777 - 2778 - int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, 2779 - uint32_t guest_irq, bool set) 2780 - { 2781 2777 /* 2782 2778 * Remapping the vLPI requires taking the its_lock mutex to resolve 2783 2779 * the new translation. We're in spinlock land at this point, so no ··· 2781 2785 * 2782 2786 * Unmap the vLPI and fall back to software LPI injection. 2783 2787 */ 2784 - return kvm_vgic_v4_unset_forwarding(kvm, host_irq); 2788 + return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq); 2785 2789 } 2786 2790 2787 2791 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)

+1 -1

arch/arm64/kvm/vgic/vgic-its.c

··· 758 758 if (irq) { 759 759 scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) { 760 760 if (irq->hw) 761 - WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); 761 + its_unmap_vlpi(ite->irq->host_irq); 762 762 763 763 irq->hw = false; 764 764 }

+4 -6

arch/arm64/kvm/vgic/vgic-v4.c

··· 527 527 return NULL; 528 528 } 529 529 530 - int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) 530 + void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq) 531 531 { 532 532 struct vgic_irq *irq; 533 533 unsigned long flags; 534 - int ret = 0; 535 534 536 535 if (!vgic_supports_direct_msis(kvm)) 537 - return 0; 536 + return; 538 537 539 538 irq = __vgic_host_irq_get_vlpi(kvm, host_irq); 540 539 if (!irq) 541 - return 0; 540 + return; 542 541 543 542 raw_spin_lock_irqsave(&irq->irq_lock, flags); 544 543 WARN_ON(irq->hw && irq->host_irq != host_irq); 545 544 if (irq->hw) { 546 545 atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count); 547 546 irq->hw = false; 548 - ret = its_unmap_vlpi(host_irq); 547 + its_unmap_vlpi(host_irq); 549 548 } 550 549 551 550 raw_spin_unlock_irqrestore(&irq->irq_lock, flags); 552 551 vgic_put_irq(kvm, irq); 553 - return ret; 554 552 }

+16 -1

arch/x86/include/asm/irq_remapping.h

··· 26 26 IRQ_REMAP_X2APIC_MODE, 27 27 }; 28 28 29 - struct vcpu_data { 29 + /* 30 + * This is mainly used to communicate information back-and-forth 31 + * between SVM and IOMMU for setting up and tearing down posted 32 + * interrupt 33 + */ 34 + struct amd_iommu_pi_data { 35 + u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */ 36 + u32 ga_tag; 37 + u32 vector; /* Guest vector of the interrupt */ 38 + int cpu; 39 + bool ga_log_intr; 40 + bool is_guest_mode; 41 + void *ir_data; 42 + }; 43 + 44 + struct intel_iommu_pi_data { 30 45 u64 pi_desc_addr; /* Physical address of PI Descriptor */ 31 46 u32 vector; /* Guest vector of the interrupt */ 32 47 };

+1 -1

arch/x86/include/asm/kvm-x86-ops.h

··· 112 112 KVM_X86_OP_OPTIONAL(vcpu_blocking) 113 113 KVM_X86_OP_OPTIONAL(vcpu_unblocking) 114 114 KVM_X86_OP_OPTIONAL(pi_update_irte) 115 - KVM_X86_OP_OPTIONAL(pi_start_assignment) 115 + KVM_X86_OP_OPTIONAL(pi_start_bypass) 116 116 KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) 117 117 KVM_X86_OP_OPTIONAL(apicv_post_state_restore) 118 118 KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)

+18 -27

arch/x86/include/asm/kvm_host.h

··· 297 297 */ 298 298 #define KVM_APIC_PV_EOI_PENDING 1 299 299 300 + struct kvm_kernel_irqfd; 300 301 struct kvm_kernel_irq_routing_entry; 301 302 302 303 /* ··· 1321 1320 */ 1322 1321 APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, 1323 1322 1323 + /* 1324 + * AVIC is disabled because the vCPU's APIC ID is beyond the max 1325 + * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable. 1326 + */ 1327 + APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG, 1328 + 1324 1329 NR_APICV_INHIBIT_REASONS, 1325 1330 }; 1326 1331 ··· 1345 1338 __APICV_INHIBIT_REASON(IRQWIN), \ 1346 1339 __APICV_INHIBIT_REASON(PIT_REINJ), \ 1347 1340 __APICV_INHIBIT_REASON(SEV), \ 1348 - __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) 1341 + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \ 1342 + __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG) 1349 1343 1350 1344 struct kvm_arch { 1351 1345 unsigned long n_used_mmu_pages; ··· 1389 1381 atomic_t noncoherent_dma_count; 1390 1382 #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE 1391 1383 atomic_t assigned_device_count; 1384 + unsigned long nr_possible_bypass_irqs; 1385 + 1386 + #ifdef CONFIG_KVM_IOAPIC 1392 1387 struct kvm_pic *vpic; 1393 1388 struct kvm_ioapic *vioapic; 1394 1389 struct kvm_pit *vpit; 1390 + #endif 1395 1391 atomic_t vapics_in_nmi_mode; 1396 1392 struct mutex apic_map_lock; 1397 1393 struct kvm_apic_map __rcu *apic_map; ··· 1415 1403 bool pause_in_guest; 1416 1404 bool cstate_in_guest; 1417 1405 1418 - unsigned long irq_sources_bitmap; 1419 1406 s64 kvmclock_offset; 1420 1407 1421 1408 /* ··· 1442 1431 u64 master_cycle_now; 1443 1432 struct delayed_work kvmclock_update_work; 1444 1433 struct delayed_work kvmclock_sync_work; 1445 - 1446 - /* reads protected by irq_srcu, writes by irq_lock */ 1447 - struct hlist_head mask_notifier_list; 1448 1434 1449 1435 #ifdef CONFIG_KVM_HYPERV 1450 1436 struct kvm_hv hyperv; ··· 1861 1853 void (*vcpu_blocking)(struct kvm_vcpu *vcpu); 1862 1854 void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); 1863 1855 1864 - int 
(*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, 1865 - uint32_t guest_irq, bool set); 1866 - void (*pi_start_assignment)(struct kvm *kvm); 1856 + int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 1857 + unsigned int host_irq, uint32_t guest_irq, 1858 + struct kvm_vcpu *vcpu, u32 vector); 1859 + void (*pi_start_bypass)(struct kvm *kvm); 1867 1860 void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); 1868 1861 void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); 1869 1862 bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); ··· 1959 1950 extern u32 __read_mostly kvm_nr_uret_msrs; 1960 1951 extern bool __read_mostly allow_smaller_maxphyaddr; 1961 1952 extern bool __read_mostly enable_apicv; 1953 + extern bool __read_mostly enable_ipiv; 1962 1954 extern bool __read_mostly enable_device_posted_irqs; 1963 1955 extern struct kvm_x86_ops kvm_x86_ops; 1964 1956 ··· 2053 2043 2054 2044 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 2055 2045 const void *val, int bytes); 2056 - 2057 - struct kvm_irq_mask_notifier { 2058 - void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); 2059 - int irq; 2060 - struct hlist_node link; 2061 - }; 2062 - 2063 - void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 2064 - struct kvm_irq_mask_notifier *kimn); 2065 - void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 2066 - struct kvm_irq_mask_notifier *kimn); 2067 - void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 2068 - bool mask); 2069 2046 2070 2047 extern bool tdp_enabled; 2071 2048 ··· 2211 2214 2212 2215 return !!(*irq_state); 2213 2216 } 2214 - 2215 - int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); 2216 - void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); 2217 2217 2218 2218 void kvm_inject_nmi(struct kvm_vcpu *vcpu); 2219 2219 int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); ··· 2387 2393 2388 2394 bool 
kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, 2389 2395 struct kvm_vcpu **dest_vcpu); 2390 - 2391 - void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 2392 - struct kvm_lapic_irq *irq); 2393 2396 2394 2397 static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) 2395 2398 {

+8 -5

arch/x86/include/asm/svm.h

··· 252 252 #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 253 253 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) 254 254 255 + /* 256 + * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible 257 + * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate 258 + * GA log interrupts to wake the vCPU (because it's blocking or about to block). 259 + */ 260 + #define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61) 261 + 255 262 #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) 256 - #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) 263 + #define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12) 257 264 #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) 258 265 #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) 259 266 #define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL) 260 267 261 268 #define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0) 262 - 263 - #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL 264 269 265 270 #define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 266 271 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 ··· 294 289 295 290 static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); 296 291 static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); 297 - 298 - #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) 299 292 300 293 #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) 301 294 #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)

+10

arch/x86/kvm/Kconfig

··· 166 166 Encrypted State (SEV-ES), and Secure Encrypted Virtualization with 167 167 Secure Nested Paging (SEV-SNP) technologies on AMD processors. 168 168 169 + config KVM_IOAPIC 170 + bool "I/O APIC, PIC, and PIT emulation" 171 + default y 172 + depends on KVM 173 + help 174 + Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. 175 + for full in-kernel APIC emulation. 176 + 177 + If unsure, say Y. 178 + 169 179 config KVM_SMM 170 180 bool "System Management Mode emulation" 171 181 default y

+3 -4

arch/x86/kvm/Makefile

··· 5 5 6 6 include $(srctree)/virt/kvm/Makefile.kvm 7 7 8 - kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ 9 - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ 10 - debugfs.o mmu/mmu.o mmu/page_track.o \ 11 - mmu/spte.o 8 + kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \ 9 + debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o 12 10 13 11 kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o 12 + kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o 14 13 kvm-$(CONFIG_KVM_HYPERV) += hyperv.o 15 14 kvm-$(CONFIG_KVM_XEN) += xen.o 16 15 kvm-$(CONFIG_KVM_SMM) += smm.o

+7 -3

arch/x86/kvm/hyperv.c

··· 497 497 return ret; 498 498 } 499 499 500 - int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) 500 + int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 501 + int irq_source_id, int level, bool line_status) 501 502 { 502 503 struct kvm_vcpu_hv_synic *synic; 503 504 504 - synic = synic_get(kvm, vpidx); 505 + if (!level) 506 + return -1; 507 + 508 + synic = synic_get(kvm, e->hv_sint.vcpu); 505 509 if (!synic) 506 510 return -EINVAL; 507 511 508 - return synic_set_irq(synic, sint); 512 + return synic_set_irq(synic, e->hv_sint.sint); 509 513 } 510 514 511 515 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)

+2 -1

arch/x86/kvm/hyperv.h

··· 103 103 int kvm_hv_hypercall(struct kvm_vcpu *vcpu); 104 104 105 105 void kvm_hv_irq_routing_update(struct kvm *kvm); 106 - int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); 106 + int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 107 + int irq_source_id, int level, bool line_status); 107 108 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); 108 109 int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); 109 110

+78 -12

arch/x86/kvm/i8254.c

··· 248 248 if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) 249 249 return; 250 250 251 - kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); 252 - kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); 251 + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false); 252 + kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false); 253 253 254 254 /* 255 255 * Provides NMI watchdog support via Virtual Wire mode. ··· 288 288 atomic_set(&pit->pit_state.irq_ack, 1); 289 289 } 290 290 291 - void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) 291 + static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) 292 292 { 293 293 struct kvm_kpit_state *ps = &pit->pit_state; 294 294 struct kvm *kvm = pit->kvm; ··· 400 400 } 401 401 } 402 402 403 - void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, 404 - int hpet_legacy_start) 403 + static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, 404 + int hpet_legacy_start) 405 405 { 406 406 u8 saved_mode; 407 407 ··· 649 649 kvm_pit_reset_reinject(pit); 650 650 } 651 651 652 + int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 653 + { 654 + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; 655 + 656 + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); 657 + 658 + mutex_lock(&kps->lock); 659 + memcpy(ps, &kps->channels, sizeof(*ps)); 660 + mutex_unlock(&kps->lock); 661 + return 0; 662 + } 663 + 664 + int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 665 + { 666 + int i; 667 + struct kvm_pit *pit = kvm->arch.vpit; 668 + 669 + mutex_lock(&pit->pit_state.lock); 670 + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); 671 + for (i = 0; i < 3; i++) 672 + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); 673 + mutex_unlock(&pit->pit_state.lock); 674 + return 0; 675 + } 676 + 677 + int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 678 + { 679 + mutex_lock(&kvm->arch.vpit->pit_state.lock); 680 + memcpy(ps->channels, 
&kvm->arch.vpit->pit_state.channels, 681 + sizeof(ps->channels)); 682 + ps->flags = kvm->arch.vpit->pit_state.flags; 683 + mutex_unlock(&kvm->arch.vpit->pit_state.lock); 684 + memset(&ps->reserved, 0, sizeof(ps->reserved)); 685 + return 0; 686 + } 687 + 688 + int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 689 + { 690 + int start = 0; 691 + int i; 692 + u32 prev_legacy, cur_legacy; 693 + struct kvm_pit *pit = kvm->arch.vpit; 694 + 695 + mutex_lock(&pit->pit_state.lock); 696 + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 697 + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 698 + if (!prev_legacy && cur_legacy) 699 + start = 1; 700 + memcpy(&pit->pit_state.channels, &ps->channels, 701 + sizeof(pit->pit_state.channels)); 702 + pit->pit_state.flags = ps->flags; 703 + for (i = 0; i < 3; i++) 704 + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, 705 + start && i == 0); 706 + mutex_unlock(&pit->pit_state.lock); 707 + return 0; 708 + } 709 + 710 + int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) 711 + { 712 + struct kvm_pit *pit = kvm->arch.vpit; 713 + 714 + /* pit->pit_state.lock was overloaded to prevent userspace from getting 715 + * an inconsistent state after running multiple KVM_REINJECT_CONTROL 716 + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
717 + */ 718 + mutex_lock(&pit->pit_state.lock); 719 + kvm_pit_set_reinject(pit, control->pit_reinject); 720 + mutex_unlock(&pit->pit_state.lock); 721 + 722 + return 0; 723 + } 724 + 652 725 static const struct kvm_io_device_ops pit_dev_ops = { 653 726 .read = pit_ioport_read, 654 727 .write = pit_ioport_write, ··· 743 670 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); 744 671 if (!pit) 745 672 return NULL; 746 - 747 - pit->irq_source_id = kvm_request_irq_source_id(kvm); 748 - if (pit->irq_source_id < 0) 749 - goto fail_request; 750 673 751 674 mutex_init(&pit->pit_state.lock); 752 675 ··· 795 726 kvm_pit_set_reinject(pit, false); 796 727 kthread_destroy_worker(pit->worker); 797 728 fail_kthread: 798 - kvm_free_irq_source_id(kvm, pit->irq_source_id); 799 - fail_request: 800 729 kfree(pit); 801 730 return NULL; 802 731 } ··· 811 744 kvm_pit_set_reinject(pit, false); 812 745 hrtimer_cancel(&pit->pit_state.timer); 813 746 kthread_destroy_worker(pit->worker); 814 - kvm_free_irq_source_id(kvm, pit->irq_source_id); 815 747 kfree(pit); 816 748 } 817 749 }

+12 -5

arch/x86/kvm/i8254.h

··· 6 6 7 7 #include <kvm/iodev.h> 8 8 9 + #include <uapi/asm/kvm.h> 10 + 11 + #include "ioapic.h" 12 + 13 + #ifdef CONFIG_KVM_IOAPIC 9 14 struct kvm_kpit_channel_state { 10 15 u32 count; /* can be 65536 */ 11 16 u16 latched_count; ··· 47 42 struct kvm_io_device speaker_dev; 48 43 struct kvm *kvm; 49 44 struct kvm_kpit_state pit_state; 50 - int irq_source_id; 51 45 struct kvm_irq_mask_notifier mask_notifier; 52 46 struct kthread_worker *worker; 53 47 struct kthread_work expired; ··· 59 55 #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 60 56 #define KVM_PIT_CHANNEL_MASK 0x3 61 57 58 + int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps); 59 + int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps); 60 + int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); 61 + int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps); 62 + int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control); 63 + 62 64 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); 63 65 void kvm_free_pit(struct kvm *kvm); 64 - 65 - void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, 66 - int hpet_legacy_start); 67 - void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); 66 + #endif /* CONFIG_KVM_IOAPIC */ 68 67 69 68 #endif

+6 -11

arch/x86/kvm/i8259.c

··· 31 31 #include <linux/mm.h> 32 32 #include <linux/slab.h> 33 33 #include <linux/bitops.h> 34 + 35 + #include "ioapic.h" 34 36 #include "irq.h" 35 37 36 38 #include <linux/kvm_host.h> ··· 187 185 pic_unlock(s); 188 186 } 189 187 190 - int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) 188 + int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 189 + int irq_source_id, int level, bool line_status) 191 190 { 191 + struct kvm_pic *s = kvm->arch.vpic; 192 + int irq = e->irqchip.pin; 192 193 int ret, irq_level; 193 194 194 195 BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); ··· 206 201 pic_unlock(s); 207 202 208 203 return ret; 209 - } 210 - 211 - void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) 212 - { 213 - int i; 214 - 215 - pic_lock(s); 216 - for (i = 0; i < PIC_NUM_PINS; i++) 217 - __clear_bit(irq_source_id, &s->irq_states[i]); 218 - pic_unlock(s); 219 204 } 220 205 221 206 /*

+42 -13

arch/x86/kvm/ioapic.c

··· 41 41 #include <asm/processor.h> 42 42 #include <asm/page.h> 43 43 #include <asm/current.h> 44 - #include <trace/events/kvm.h> 45 44 46 45 #include "ioapic.h" 47 46 #include "lapic.h" 48 47 #include "irq.h" 48 + #include "trace.h" 49 49 50 50 static int ioapic_service(struct kvm_ioapic *vioapic, int irq, 51 51 bool line_status); ··· 310 310 kvm_make_scan_ioapic_request(kvm); 311 311 } 312 312 313 + void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 314 + struct kvm_irq_mask_notifier *kimn) 315 + { 316 + struct kvm_ioapic *ioapic = kvm->arch.vioapic; 317 + 318 + mutex_lock(&kvm->irq_lock); 319 + kimn->irq = irq; 320 + hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list); 321 + mutex_unlock(&kvm->irq_lock); 322 + } 323 + 324 + void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 325 + struct kvm_irq_mask_notifier *kimn) 326 + { 327 + mutex_lock(&kvm->irq_lock); 328 + hlist_del_rcu(&kimn->link); 329 + mutex_unlock(&kvm->irq_lock); 330 + synchronize_srcu(&kvm->irq_srcu); 331 + } 332 + 333 + void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 334 + bool mask) 335 + { 336 + struct kvm_ioapic *ioapic = kvm->arch.vioapic; 337 + struct kvm_irq_mask_notifier *kimn; 338 + int idx, gsi; 339 + 340 + idx = srcu_read_lock(&kvm->irq_srcu); 341 + gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 342 + if (gsi != -1) 343 + hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link) 344 + if (kimn->irq == gsi) 345 + kimn->func(kimn, mask); 346 + srcu_read_unlock(&kvm->irq_srcu, idx); 347 + } 348 + 313 349 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) 314 350 { 315 351 unsigned index; ··· 515 479 return ret; 516 480 } 517 481 518 - int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 519 - int level, bool line_status) 482 + int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 483 + int irq_source_id, int level, bool line_status) 520 484 { 485 + 
struct kvm_ioapic *ioapic = kvm->arch.vioapic; 486 + int irq = e->irqchip.pin; 521 487 int ret, irq_level; 522 488 523 489 BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); ··· 532 494 spin_unlock(&ioapic->lock); 533 495 534 496 return ret; 535 - } 536 - 537 - void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) 538 - { 539 - int i; 540 - 541 - spin_lock(&ioapic->lock); 542 - for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) 543 - __clear_bit(irq_source_id, &ioapic->irq_states[i]); 544 - spin_unlock(&ioapic->lock); 545 497 } 546 498 547 499 static void kvm_ioapic_eoi_inject_work(struct work_struct *work) ··· 746 718 return -ENOMEM; 747 719 spin_lock_init(&ioapic->lock); 748 720 INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work); 721 + INIT_HLIST_HEAD(&ioapic->mask_notifier_list); 749 722 kvm->arch.vioapic = ioapic; 750 723 kvm_ioapic_reset(ioapic); 751 724 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);

+20 -4

arch/x86/kvm/ioapic.h

··· 86 86 struct delayed_work eoi_inject; 87 87 u32 irq_eoi[IOAPIC_NUM_PINS]; 88 88 u32 irr_delivered; 89 + 90 + /* reads protected by irq_srcu, writes by irq_lock */ 91 + struct hlist_head mask_notifier_list; 89 92 }; 93 + 94 + struct kvm_irq_mask_notifier { 95 + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); 96 + int irq; 97 + struct hlist_node link; 98 + }; 99 + 100 + void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 101 + struct kvm_irq_mask_notifier *kimn); 102 + void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 103 + struct kvm_irq_mask_notifier *kimn); 104 + void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 105 + bool mask); 90 106 91 107 #ifdef DEBUG 92 108 #define ASSERT(x) \ ··· 119 103 120 104 static inline int ioapic_in_kernel(struct kvm *kvm) 121 105 { 122 - return irqchip_kernel(kvm); 106 + return irqchip_full(kvm); 123 107 } 124 108 125 109 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); ··· 127 111 int trigger_mode); 128 112 int kvm_ioapic_init(struct kvm *kvm); 129 113 void kvm_ioapic_destroy(struct kvm *kvm); 130 - int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, 131 - int level, bool line_status); 132 - void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); 114 + int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 115 + int irq_source_id, int level, bool line_status); 116 + 133 117 void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 134 118 void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); 135 119 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,

+556 -11

arch/x86/kvm/irq.c

··· 11 11 12 12 #include <linux/export.h> 13 13 #include <linux/kvm_host.h> 14 + #include <linux/kvm_irqfd.h> 14 15 16 + #include "hyperv.h" 17 + #include "ioapic.h" 15 18 #include "irq.h" 16 - #include "i8254.h" 19 + #include "trace.h" 17 20 #include "x86.h" 18 21 #include "xen.h" 19 22 ··· 42 39 static int pending_userspace_extint(struct kvm_vcpu *v) 43 40 { 44 41 return v->arch.pending_external_vector != -1; 42 + } 43 + 44 + static int get_userspace_extint(struct kvm_vcpu *vcpu) 45 + { 46 + int vector = vcpu->arch.pending_external_vector; 47 + 48 + vcpu->arch.pending_external_vector = -1; 49 + return vector; 45 50 } 46 51 47 52 /* ··· 78 67 if (!kvm_apic_accept_pic_intr(v)) 79 68 return 0; 80 69 81 - if (irqchip_split(v->kvm)) 82 - return pending_userspace_extint(v); 83 - else 70 + #ifdef CONFIG_KVM_IOAPIC 71 + if (pic_in_kernel(v->kvm)) 84 72 return v->kvm->arch.vpic->output; 73 + #endif 74 + 75 + WARN_ON_ONCE(!irqchip_split(v->kvm)); 76 + return pending_userspace_extint(v); 85 77 } 86 78 87 79 /* ··· 140 126 return v->kvm->arch.xen.upcall_vector; 141 127 #endif 142 128 143 - if (irqchip_split(v->kvm)) { 144 - int vector = v->arch.pending_external_vector; 145 - 146 - v->arch.pending_external_vector = -1; 147 - return vector; 148 - } else 129 + #ifdef CONFIG_KVM_IOAPIC 130 + if (pic_in_kernel(v->kvm)) 149 131 return kvm_pic_read_irq(v->kvm); /* PIC */ 132 + #endif 133 + 134 + WARN_ON_ONCE(!irqchip_split(v->kvm)); 135 + return get_userspace_extint(v); 150 136 } 151 137 EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); 152 138 ··· 177 163 void __kvm_migrate_timers(struct kvm_vcpu *vcpu) 178 164 { 179 165 __kvm_migrate_apic_timer(vcpu); 166 + #ifdef CONFIG_KVM_IOAPIC 180 167 __kvm_migrate_pit_timer(vcpu); 168 + #endif 181 169 kvm_x86_call(migrate_timers)(vcpu); 182 170 } 183 171 ··· 187 171 { 188 172 bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE; 189 173 190 - return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm); 174 + return resample ? 
irqchip_full(kvm) : irqchip_in_kernel(kvm); 191 175 } 192 176 193 177 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) 194 178 { 195 179 return irqchip_in_kernel(kvm); 196 180 } 181 + 182 + int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 183 + struct kvm_lapic_irq *irq, struct dest_map *dest_map) 184 + { 185 + int r = -1; 186 + struct kvm_vcpu *vcpu, *lowest = NULL; 187 + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 188 + unsigned int dest_vcpus = 0; 189 + 190 + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 191 + return r; 192 + 193 + if (irq->dest_mode == APIC_DEST_PHYSICAL && 194 + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { 195 + pr_info("apic: phys broadcast and lowest prio\n"); 196 + irq->delivery_mode = APIC_DM_FIXED; 197 + } 198 + 199 + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); 200 + 201 + kvm_for_each_vcpu(i, vcpu, kvm) { 202 + if (!kvm_apic_present(vcpu)) 203 + continue; 204 + 205 + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 206 + irq->dest_id, irq->dest_mode)) 207 + continue; 208 + 209 + if (!kvm_lowest_prio_delivery(irq)) { 210 + if (r < 0) 211 + r = 0; 212 + r += kvm_apic_set_irq(vcpu, irq, dest_map); 213 + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { 214 + if (!kvm_vector_hashing_enabled()) { 215 + if (!lowest) 216 + lowest = vcpu; 217 + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) 218 + lowest = vcpu; 219 + } else { 220 + __set_bit(i, dest_vcpu_bitmap); 221 + dest_vcpus++; 222 + } 223 + } 224 + } 225 + 226 + if (dest_vcpus != 0) { 227 + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, 228 + dest_vcpu_bitmap, KVM_MAX_VCPUS); 229 + 230 + lowest = kvm_get_vcpu(kvm, idx); 231 + } 232 + 233 + if (lowest) 234 + r = kvm_apic_set_irq(lowest, irq, dest_map); 235 + 236 + return r; 237 + } 238 + 239 + static void kvm_msi_to_lapic_irq(struct kvm *kvm, 240 + struct kvm_kernel_irq_routing_entry *e, 241 + struct kvm_lapic_irq *irq) 242 + { 243 + struct 
msi_msg msg = { .address_lo = e->msi.address_lo, 244 + .address_hi = e->msi.address_hi, 245 + .data = e->msi.data }; 246 + 247 + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? 248 + (u64)msg.address_hi << 32 : 0), msg.data); 249 + 250 + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); 251 + irq->vector = msg.arch_data.vector; 252 + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); 253 + irq->trig_mode = msg.arch_data.is_level; 254 + irq->delivery_mode = msg.arch_data.delivery_mode << 8; 255 + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; 256 + irq->level = 1; 257 + irq->shorthand = APIC_DEST_NOSHORT; 258 + } 259 + 260 + static inline bool kvm_msi_route_invalid(struct kvm *kvm, 261 + struct kvm_kernel_irq_routing_entry *e) 262 + { 263 + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); 264 + } 265 + 266 + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 267 + struct kvm *kvm, int irq_source_id, int level, bool line_status) 268 + { 269 + struct kvm_lapic_irq irq; 270 + 271 + if (kvm_msi_route_invalid(kvm, e)) 272 + return -EINVAL; 273 + 274 + if (!level) 275 + return -1; 276 + 277 + kvm_msi_to_lapic_irq(kvm, e, &irq); 278 + 279 + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); 280 + } 281 + 282 + int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, 283 + struct kvm *kvm, int irq_source_id, int level, 284 + bool line_status) 285 + { 286 + struct kvm_lapic_irq irq; 287 + int r; 288 + 289 + switch (e->type) { 290 + #ifdef CONFIG_KVM_HYPERV 291 + case KVM_IRQ_ROUTING_HV_SINT: 292 + return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level, 293 + line_status); 294 + #endif 295 + 296 + case KVM_IRQ_ROUTING_MSI: 297 + if (kvm_msi_route_invalid(kvm, e)) 298 + return -EINVAL; 299 + 300 + kvm_msi_to_lapic_irq(kvm, e, &irq); 301 + 302 + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) 303 + return r; 304 + break; 305 + 306 + #ifdef CONFIG_KVM_XEN 
307 + case KVM_IRQ_ROUTING_XEN_EVTCHN: 308 + if (!level) 309 + return -1; 310 + 311 + return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); 312 + #endif 313 + default: 314 + break; 315 + } 316 + 317 + return -EWOULDBLOCK; 318 + } 319 + 320 + int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 321 + bool line_status) 322 + { 323 + if (!irqchip_in_kernel(kvm)) 324 + return -ENXIO; 325 + 326 + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 327 + irq_event->irq, irq_event->level, 328 + line_status); 329 + return 0; 330 + } 331 + 332 + bool kvm_arch_can_set_irq_routing(struct kvm *kvm) 333 + { 334 + return irqchip_in_kernel(kvm); 335 + } 336 + 337 + int kvm_set_routing_entry(struct kvm *kvm, 338 + struct kvm_kernel_irq_routing_entry *e, 339 + const struct kvm_irq_routing_entry *ue) 340 + { 341 + /* We can't check irqchip_in_kernel() here as some callers are 342 + * currently initializing the irqchip. Other callers should therefore 343 + * check kvm_arch_can_set_irq_routing() before calling this function. 
344 + */ 345 + switch (ue->type) { 346 + #ifdef CONFIG_KVM_IOAPIC 347 + case KVM_IRQ_ROUTING_IRQCHIP: 348 + if (irqchip_split(kvm)) 349 + return -EINVAL; 350 + e->irqchip.pin = ue->u.irqchip.pin; 351 + switch (ue->u.irqchip.irqchip) { 352 + case KVM_IRQCHIP_PIC_SLAVE: 353 + e->irqchip.pin += PIC_NUM_PINS / 2; 354 + fallthrough; 355 + case KVM_IRQCHIP_PIC_MASTER: 356 + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) 357 + return -EINVAL; 358 + e->set = kvm_pic_set_irq; 359 + break; 360 + case KVM_IRQCHIP_IOAPIC: 361 + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) 362 + return -EINVAL; 363 + e->set = kvm_ioapic_set_irq; 364 + break; 365 + default: 366 + return -EINVAL; 367 + } 368 + e->irqchip.irqchip = ue->u.irqchip.irqchip; 369 + break; 370 + #endif 371 + case KVM_IRQ_ROUTING_MSI: 372 + e->set = kvm_set_msi; 373 + e->msi.address_lo = ue->u.msi.address_lo; 374 + e->msi.address_hi = ue->u.msi.address_hi; 375 + e->msi.data = ue->u.msi.data; 376 + 377 + if (kvm_msi_route_invalid(kvm, e)) 378 + return -EINVAL; 379 + break; 380 + #ifdef CONFIG_KVM_HYPERV 381 + case KVM_IRQ_ROUTING_HV_SINT: 382 + e->set = kvm_hv_synic_set_irq; 383 + e->hv_sint.vcpu = ue->u.hv_sint.vcpu; 384 + e->hv_sint.sint = ue->u.hv_sint.sint; 385 + break; 386 + #endif 387 + #ifdef CONFIG_KVM_XEN 388 + case KVM_IRQ_ROUTING_XEN_EVTCHN: 389 + return kvm_xen_setup_evtchn(kvm, e, ue); 390 + #endif 391 + default: 392 + return -EINVAL; 393 + } 394 + 395 + return 0; 396 + } 397 + 398 + bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, 399 + struct kvm_vcpu **dest_vcpu) 400 + { 401 + int r = 0; 402 + unsigned long i; 403 + struct kvm_vcpu *vcpu; 404 + 405 + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) 406 + return true; 407 + 408 + kvm_for_each_vcpu(i, vcpu, kvm) { 409 + if (!kvm_apic_present(vcpu)) 410 + continue; 411 + 412 + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, 413 + irq->dest_id, irq->dest_mode)) 414 + continue; 415 + 416 + if (++r == 2) 417 + return false; 418 + 
419 + *dest_vcpu = vcpu; 420 + } 421 + 422 + return r == 1; 423 + } 424 + EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); 425 + 426 + void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, 427 + u8 vector, unsigned long *ioapic_handled_vectors) 428 + { 429 + /* 430 + * Intercept EOI if the vCPU is the target of the new IRQ routing, or 431 + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU 432 + * may receive a level-triggered IRQ in the future, or already received 433 + * level-triggered IRQ. The EOI needs to be intercepted and forwarded 434 + * to I/O APIC emulation so that the IRQ can be de-asserted. 435 + */ 436 + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { 437 + __set_bit(vector, ioapic_handled_vectors); 438 + } else if (kvm_apic_pending_eoi(vcpu, vector)) { 439 + __set_bit(vector, ioapic_handled_vectors); 440 + 441 + /* 442 + * Track the highest pending EOI for which the vCPU is NOT the 443 + * target in the new routing. Only the EOI for the IRQ that is 444 + * in-flight (for the old routing) needs to be intercepted, any 445 + * future IRQs that arrive on this vCPU will be coincidental to 446 + * the level-triggered routing and don't need to be intercepted. 
447 + */ 448 + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) 449 + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; 450 + } 451 + } 452 + 453 + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, 454 + ulong *ioapic_handled_vectors) 455 + { 456 + struct kvm *kvm = vcpu->kvm; 457 + struct kvm_kernel_irq_routing_entry *entry; 458 + struct kvm_irq_routing_table *table; 459 + u32 i, nr_ioapic_pins; 460 + int idx; 461 + 462 + idx = srcu_read_lock(&kvm->irq_srcu); 463 + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 464 + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, 465 + kvm->arch.nr_reserved_ioapic_pins); 466 + for (i = 0; i < nr_ioapic_pins; ++i) { 467 + hlist_for_each_entry(entry, &table->map[i], link) { 468 + struct kvm_lapic_irq irq; 469 + 470 + if (entry->type != KVM_IRQ_ROUTING_MSI) 471 + continue; 472 + 473 + kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq); 474 + 475 + if (!irq.trig_mode) 476 + continue; 477 + 478 + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, 479 + irq.vector, ioapic_handled_vectors); 480 + } 481 + } 482 + srcu_read_unlock(&kvm->irq_srcu, idx); 483 + } 484 + 485 + void kvm_arch_irq_routing_update(struct kvm *kvm) 486 + { 487 + #ifdef CONFIG_KVM_HYPERV 488 + kvm_hv_irq_routing_update(kvm); 489 + #endif 490 + 491 + if (irqchip_split(kvm)) 492 + kvm_make_scan_ioapic_request(kvm); 493 + } 494 + 495 + static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd, 496 + struct kvm_kernel_irq_routing_entry *entry) 497 + { 498 + unsigned int host_irq = irqfd->producer->irq; 499 + struct kvm *kvm = irqfd->kvm; 500 + struct kvm_vcpu *vcpu = NULL; 501 + struct kvm_lapic_irq irq; 502 + int r; 503 + 504 + if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass())) 505 + return -EINVAL; 506 + 507 + if (entry && entry->type == KVM_IRQ_ROUTING_MSI) { 508 + kvm_msi_to_lapic_irq(kvm, entry, &irq); 509 + 510 + /* 511 + * Force remapped mode if hardware doesn't support posting the 512 + * virtual interrupt 
to a vCPU. Only IRQs are postable (NMIs, 513 + * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support 514 + * posting multicast/broadcast IRQs. If the interrupt can't be 515 + * posted, the device MSI needs to be routed to the host so that 516 + * the guest's desired interrupt can be synthesized by KVM. 517 + * 518 + * This means that KVM can only post lowest-priority interrupts 519 + * if they have a single CPU as the destination, e.g. only if 520 + * the guest has affined the interrupt to a single vCPU. 521 + */ 522 + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 523 + !kvm_irq_is_postable(&irq)) 524 + vcpu = NULL; 525 + } 526 + 527 + if (!irqfd->irq_bypass_vcpu && !vcpu) 528 + return 0; 529 + 530 + r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi, 531 + vcpu, irq.vector); 532 + if (r) { 533 + WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu); 534 + irqfd->irq_bypass_vcpu = NULL; 535 + return r; 536 + } 537 + 538 + irqfd->irq_bypass_vcpu = vcpu; 539 + 540 + trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu); 541 + return 0; 542 + } 543 + 544 + int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, 545 + struct irq_bypass_producer *prod) 546 + { 547 + struct kvm_kernel_irqfd *irqfd = 548 + container_of(cons, struct kvm_kernel_irqfd, consumer); 549 + struct kvm *kvm = irqfd->kvm; 550 + int ret = 0; 551 + 552 + kvm_arch_start_assignment(irqfd->kvm); 553 + 554 + spin_lock_irq(&kvm->irqfds.lock); 555 + irqfd->producer = prod; 556 + 557 + if (!kvm->arch.nr_possible_bypass_irqs++) 558 + kvm_x86_call(pi_start_bypass)(kvm); 559 + 560 + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { 561 + ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry); 562 + if (ret) { 563 + kvm->arch.nr_possible_bypass_irqs--; 564 + kvm_arch_end_assignment(irqfd->kvm); 565 + } 566 + } 567 + spin_unlock_irq(&kvm->irqfds.lock); 568 + 569 + return ret; 570 + } 571 + 572 + void kvm_arch_irq_bypass_del_producer(struct 
irq_bypass_consumer *cons, 573 + struct irq_bypass_producer *prod) 574 + { 575 + struct kvm_kernel_irqfd *irqfd = 576 + container_of(cons, struct kvm_kernel_irqfd, consumer); 577 + struct kvm *kvm = irqfd->kvm; 578 + int ret; 579 + 580 + WARN_ON(irqfd->producer != prod); 581 + 582 + /* 583 + * If the producer of an IRQ that is currently being posted to a vCPU 584 + * is unregistered, change the associated IRTE back to remapped mode as 585 + * the IRQ has been released (or repurposed) by the device driver, i.e. 586 + * KVM must relinquish control of the IRTE. 587 + */ 588 + spin_lock_irq(&kvm->irqfds.lock); 589 + 590 + if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) { 591 + ret = kvm_pi_update_irte(irqfd, NULL); 592 + if (ret) 593 + pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n", 594 + irqfd->consumer.eventfd, ret); 595 + } 596 + irqfd->producer = NULL; 597 + 598 + kvm->arch.nr_possible_bypass_irqs--; 599 + 600 + spin_unlock_irq(&kvm->irqfds.lock); 601 + 602 + 603 + kvm_arch_end_assignment(irqfd->kvm); 604 + } 605 + 606 + void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, 607 + struct kvm_kernel_irq_routing_entry *old, 608 + struct kvm_kernel_irq_routing_entry *new) 609 + { 610 + if (new->type != KVM_IRQ_ROUTING_MSI && 611 + old->type != KVM_IRQ_ROUTING_MSI) 612 + return; 613 + 614 + if (old->type == KVM_IRQ_ROUTING_MSI && 615 + new->type == KVM_IRQ_ROUTING_MSI && 616 + !memcmp(&old->msi, &new->msi, sizeof(new->msi))) 617 + return; 618 + 619 + kvm_pi_update_irte(irqfd, new); 620 + } 621 + 622 + #ifdef CONFIG_KVM_IOAPIC 623 + #define IOAPIC_ROUTING_ENTRY(irq) \ 624 + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 625 + .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } 626 + #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) 627 + 628 + #define PIC_ROUTING_ENTRY(irq) \ 629 + { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 630 + .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } 631 + #define 
ROUTING_ENTRY2(irq) \ 632 + IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) 633 + 634 + static const struct kvm_irq_routing_entry default_routing[] = { 635 + ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), 636 + ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), 637 + ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), 638 + ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), 639 + ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), 640 + ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), 641 + ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), 642 + ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), 643 + ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), 644 + ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), 645 + ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), 646 + ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), 647 + }; 648 + 649 + int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm) 650 + { 651 + return kvm_set_irq_routing(kvm, default_routing, 652 + ARRAY_SIZE(default_routing), 0); 653 + } 654 + 655 + int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 656 + { 657 + struct kvm_pic *pic = kvm->arch.vpic; 658 + int r; 659 + 660 + r = 0; 661 + switch (chip->chip_id) { 662 + case KVM_IRQCHIP_PIC_MASTER: 663 + memcpy(&chip->chip.pic, &pic->pics[0], 664 + sizeof(struct kvm_pic_state)); 665 + break; 666 + case KVM_IRQCHIP_PIC_SLAVE: 667 + memcpy(&chip->chip.pic, &pic->pics[1], 668 + sizeof(struct kvm_pic_state)); 669 + break; 670 + case KVM_IRQCHIP_IOAPIC: 671 + kvm_get_ioapic(kvm, &chip->chip.ioapic); 672 + break; 673 + default: 674 + r = -EINVAL; 675 + break; 676 + } 677 + return r; 678 + } 679 + 680 + int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 681 + { 682 + struct kvm_pic *pic = kvm->arch.vpic; 683 + int r; 684 + 685 + r = 0; 686 + switch (chip->chip_id) { 687 + case KVM_IRQCHIP_PIC_MASTER: 688 + spin_lock(&pic->lock); 689 + memcpy(&pic->pics[0], &chip->chip.pic, 690 + sizeof(struct kvm_pic_state)); 691 + spin_unlock(&pic->lock); 692 + break; 693 + case KVM_IRQCHIP_PIC_SLAVE: 694 + spin_lock(&pic->lock); 695 + memcpy(&pic->pics[1], 
&chip->chip.pic, 696 + sizeof(struct kvm_pic_state)); 697 + spin_unlock(&pic->lock); 698 + break; 699 + case KVM_IRQCHIP_IOAPIC: 700 + kvm_set_ioapic(kvm, &chip->chip.ioapic); 701 + break; 702 + default: 703 + r = -EINVAL; 704 + break; 705 + } 706 + kvm_pic_update_irq(pic); 707 + return r; 708 + } 709 + #endif

+30 -15

arch/x86/kvm/irq.h

··· 18 18 #include <kvm/iodev.h> 19 19 #include "lapic.h" 20 20 21 + #ifdef CONFIG_KVM_IOAPIC 22 + 21 23 #define PIC_NUM_PINS 16 22 24 #define SELECT_PIC(irq) \ 23 25 ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) ··· 65 63 void kvm_pic_destroy(struct kvm *kvm); 66 64 int kvm_pic_read_irq(struct kvm *kvm); 67 65 void kvm_pic_update_irq(struct kvm_pic *s); 66 + int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, 67 + int irq_source_id, int level, bool line_status); 68 + 69 + int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm); 70 + 71 + int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); 72 + int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip); 73 + 74 + static inline int irqchip_full(struct kvm *kvm) 75 + { 76 + int mode = kvm->arch.irqchip_mode; 77 + 78 + /* Matches smp_wmb() when setting irqchip_mode */ 79 + smp_rmb(); 80 + return mode == KVM_IRQCHIP_KERNEL; 81 + } 82 + #else /* CONFIG_KVM_IOAPIC */ 83 + static __always_inline int irqchip_full(struct kvm *kvm) 84 + { 85 + return false; 86 + } 87 + #endif 88 + 89 + static inline int pic_in_kernel(struct kvm *kvm) 90 + { 91 + return irqchip_full(kvm); 92 + } 93 + 68 94 69 95 static inline int irqchip_split(struct kvm *kvm) 70 96 { ··· 101 71 /* Matches smp_wmb() when setting irqchip_mode */ 102 72 smp_rmb(); 103 73 return mode == KVM_IRQCHIP_SPLIT; 104 - } 105 - 106 - static inline int irqchip_kernel(struct kvm *kvm) 107 - { 108 - int mode = kvm->arch.irqchip_mode; 109 - 110 - /* Matches smp_wmb() when setting irqchip_mode */ 111 - smp_rmb(); 112 - return mode == KVM_IRQCHIP_KERNEL; 113 - } 114 - 115 - static inline int pic_in_kernel(struct kvm *kvm) 116 - { 117 - return irqchip_kernel(kvm); 118 74 } 119 75 120 76 static inline int irqchip_in_kernel(struct kvm *kvm) ··· 121 105 122 106 int apic_has_pending_timer(struct kvm_vcpu *vcpu); 123 107 124 - int kvm_setup_default_irq_routing(struct kvm *kvm); 125 108 int 
kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 126 109 struct kvm_lapic_irq *irq, 127 110 struct dest_map *dest_map);

-469

arch/x86/kvm/irq_comm.c

··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - /* 3 - * irq_comm.c: Common API for in kernel interrupt controller 4 - * Copyright (c) 2007, Intel Corporation. 5 - * 6 - * Authors: 7 - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> 8 - * 9 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 - */ 11 - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 - 13 - #include <linux/kvm_host.h> 14 - #include <linux/slab.h> 15 - #include <linux/export.h> 16 - #include <linux/rculist.h> 17 - 18 - #include <trace/events/kvm.h> 19 - 20 - #include "irq.h" 21 - 22 - #include "ioapic.h" 23 - 24 - #include "lapic.h" 25 - 26 - #include "hyperv.h" 27 - #include "x86.h" 28 - #include "xen.h" 29 - 30 - static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, 31 - struct kvm *kvm, int irq_source_id, int level, 32 - bool line_status) 33 - { 34 - struct kvm_pic *pic = kvm->arch.vpic; 35 - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); 36 - } 37 - 38 - static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, 39 - struct kvm *kvm, int irq_source_id, int level, 40 - bool line_status) 41 - { 42 - struct kvm_ioapic *ioapic = kvm->arch.vioapic; 43 - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, 44 - line_status); 45 - } 46 - 47 - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 48 - struct kvm_lapic_irq *irq, struct dest_map *dest_map) 49 - { 50 - int r = -1; 51 - struct kvm_vcpu *vcpu, *lowest = NULL; 52 - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 53 - unsigned int dest_vcpus = 0; 54 - 55 - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 56 - return r; 57 - 58 - if (irq->dest_mode == APIC_DEST_PHYSICAL && 59 - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { 60 - pr_info("apic: phys broadcast and lowest prio\n"); 61 - irq->delivery_mode = APIC_DM_FIXED; 62 - } 63 - 64 - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); 65 - 66 - kvm_for_each_vcpu(i, 
vcpu, kvm) { 67 - if (!kvm_apic_present(vcpu)) 68 - continue; 69 - 70 - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 71 - irq->dest_id, irq->dest_mode)) 72 - continue; 73 - 74 - if (!kvm_lowest_prio_delivery(irq)) { 75 - if (r < 0) 76 - r = 0; 77 - r += kvm_apic_set_irq(vcpu, irq, dest_map); 78 - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { 79 - if (!kvm_vector_hashing_enabled()) { 80 - if (!lowest) 81 - lowest = vcpu; 82 - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) 83 - lowest = vcpu; 84 - } else { 85 - __set_bit(i, dest_vcpu_bitmap); 86 - dest_vcpus++; 87 - } 88 - } 89 - } 90 - 91 - if (dest_vcpus != 0) { 92 - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, 93 - dest_vcpu_bitmap, KVM_MAX_VCPUS); 94 - 95 - lowest = kvm_get_vcpu(kvm, idx); 96 - } 97 - 98 - if (lowest) 99 - r = kvm_apic_set_irq(lowest, irq, dest_map); 100 - 101 - return r; 102 - } 103 - 104 - void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 105 - struct kvm_lapic_irq *irq) 106 - { 107 - struct msi_msg msg = { .address_lo = e->msi.address_lo, 108 - .address_hi = e->msi.address_hi, 109 - .data = e->msi.data }; 110 - 111 - trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? 
112 - (u64)msg.address_hi << 32 : 0), msg.data); 113 - 114 - irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); 115 - irq->vector = msg.arch_data.vector; 116 - irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); 117 - irq->trig_mode = msg.arch_data.is_level; 118 - irq->delivery_mode = msg.arch_data.delivery_mode << 8; 119 - irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; 120 - irq->level = 1; 121 - irq->shorthand = APIC_DEST_NOSHORT; 122 - } 123 - EXPORT_SYMBOL_GPL(kvm_set_msi_irq); 124 - 125 - static inline bool kvm_msi_route_invalid(struct kvm *kvm, 126 - struct kvm_kernel_irq_routing_entry *e) 127 - { 128 - return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); 129 - } 130 - 131 - int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 132 - struct kvm *kvm, int irq_source_id, int level, bool line_status) 133 - { 134 - struct kvm_lapic_irq irq; 135 - 136 - if (kvm_msi_route_invalid(kvm, e)) 137 - return -EINVAL; 138 - 139 - if (!level) 140 - return -1; 141 - 142 - kvm_set_msi_irq(kvm, e, &irq); 143 - 144 - return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); 145 - } 146 - 147 - #ifdef CONFIG_KVM_HYPERV 148 - static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, 149 - struct kvm *kvm, int irq_source_id, int level, 150 - bool line_status) 151 - { 152 - if (!level) 153 - return -1; 154 - 155 - return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); 156 - } 157 - #endif 158 - 159 - int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, 160 - struct kvm *kvm, int irq_source_id, int level, 161 - bool line_status) 162 - { 163 - struct kvm_lapic_irq irq; 164 - int r; 165 - 166 - switch (e->type) { 167 - #ifdef CONFIG_KVM_HYPERV 168 - case KVM_IRQ_ROUTING_HV_SINT: 169 - return kvm_hv_set_sint(e, kvm, irq_source_id, level, 170 - line_status); 171 - #endif 172 - 173 - case KVM_IRQ_ROUTING_MSI: 174 - if (kvm_msi_route_invalid(kvm, e)) 175 - return -EINVAL; 176 - 177 - 
kvm_set_msi_irq(kvm, e, &irq); 178 - 179 - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) 180 - return r; 181 - break; 182 - 183 - #ifdef CONFIG_KVM_XEN 184 - case KVM_IRQ_ROUTING_XEN_EVTCHN: 185 - if (!level) 186 - return -1; 187 - 188 - return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm); 189 - #endif 190 - default: 191 - break; 192 - } 193 - 194 - return -EWOULDBLOCK; 195 - } 196 - 197 - int kvm_request_irq_source_id(struct kvm *kvm) 198 - { 199 - unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 200 - int irq_source_id; 201 - 202 - mutex_lock(&kvm->irq_lock); 203 - irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); 204 - 205 - if (irq_source_id >= BITS_PER_LONG) { 206 - pr_warn("exhausted allocatable IRQ sources!\n"); 207 - irq_source_id = -EFAULT; 208 - goto unlock; 209 - } 210 - 211 - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 212 - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); 213 - set_bit(irq_source_id, bitmap); 214 - unlock: 215 - mutex_unlock(&kvm->irq_lock); 216 - 217 - return irq_source_id; 218 - } 219 - 220 - void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) 221 - { 222 - ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 223 - ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); 224 - 225 - mutex_lock(&kvm->irq_lock); 226 - if (irq_source_id < 0 || 227 - irq_source_id >= BITS_PER_LONG) { 228 - pr_err("IRQ source ID out of range!\n"); 229 - goto unlock; 230 - } 231 - clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 232 - if (!irqchip_kernel(kvm)) 233 - goto unlock; 234 - 235 - kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); 236 - kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); 237 - unlock: 238 - mutex_unlock(&kvm->irq_lock); 239 - } 240 - 241 - void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 242 - struct kvm_irq_mask_notifier *kimn) 243 - { 244 - mutex_lock(&kvm->irq_lock); 245 - kimn->irq = irq; 246 - hlist_add_head_rcu(&kimn->link, 
&kvm->arch.mask_notifier_list); 247 - mutex_unlock(&kvm->irq_lock); 248 - } 249 - 250 - void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 251 - struct kvm_irq_mask_notifier *kimn) 252 - { 253 - mutex_lock(&kvm->irq_lock); 254 - hlist_del_rcu(&kimn->link); 255 - mutex_unlock(&kvm->irq_lock); 256 - synchronize_srcu(&kvm->irq_srcu); 257 - } 258 - 259 - void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 260 - bool mask) 261 - { 262 - struct kvm_irq_mask_notifier *kimn; 263 - int idx, gsi; 264 - 265 - idx = srcu_read_lock(&kvm->irq_srcu); 266 - gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); 267 - if (gsi != -1) 268 - hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link) 269 - if (kimn->irq == gsi) 270 - kimn->func(kimn, mask); 271 - srcu_read_unlock(&kvm->irq_srcu, idx); 272 - } 273 - 274 - bool kvm_arch_can_set_irq_routing(struct kvm *kvm) 275 - { 276 - return irqchip_in_kernel(kvm); 277 - } 278 - 279 - int kvm_set_routing_entry(struct kvm *kvm, 280 - struct kvm_kernel_irq_routing_entry *e, 281 - const struct kvm_irq_routing_entry *ue) 282 - { 283 - /* We can't check irqchip_in_kernel() here as some callers are 284 - * currently initializing the irqchip. Other callers should therefore 285 - * check kvm_arch_can_set_irq_routing() before calling this function. 
286 - */ 287 - switch (ue->type) { 288 - case KVM_IRQ_ROUTING_IRQCHIP: 289 - if (irqchip_split(kvm)) 290 - return -EINVAL; 291 - e->irqchip.pin = ue->u.irqchip.pin; 292 - switch (ue->u.irqchip.irqchip) { 293 - case KVM_IRQCHIP_PIC_SLAVE: 294 - e->irqchip.pin += PIC_NUM_PINS / 2; 295 - fallthrough; 296 - case KVM_IRQCHIP_PIC_MASTER: 297 - if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) 298 - return -EINVAL; 299 - e->set = kvm_set_pic_irq; 300 - break; 301 - case KVM_IRQCHIP_IOAPIC: 302 - if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) 303 - return -EINVAL; 304 - e->set = kvm_set_ioapic_irq; 305 - break; 306 - default: 307 - return -EINVAL; 308 - } 309 - e->irqchip.irqchip = ue->u.irqchip.irqchip; 310 - break; 311 - case KVM_IRQ_ROUTING_MSI: 312 - e->set = kvm_set_msi; 313 - e->msi.address_lo = ue->u.msi.address_lo; 314 - e->msi.address_hi = ue->u.msi.address_hi; 315 - e->msi.data = ue->u.msi.data; 316 - 317 - if (kvm_msi_route_invalid(kvm, e)) 318 - return -EINVAL; 319 - break; 320 - #ifdef CONFIG_KVM_HYPERV 321 - case KVM_IRQ_ROUTING_HV_SINT: 322 - e->set = kvm_hv_set_sint; 323 - e->hv_sint.vcpu = ue->u.hv_sint.vcpu; 324 - e->hv_sint.sint = ue->u.hv_sint.sint; 325 - break; 326 - #endif 327 - #ifdef CONFIG_KVM_XEN 328 - case KVM_IRQ_ROUTING_XEN_EVTCHN: 329 - return kvm_xen_setup_evtchn(kvm, e, ue); 330 - #endif 331 - default: 332 - return -EINVAL; 333 - } 334 - 335 - return 0; 336 - } 337 - 338 - bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, 339 - struct kvm_vcpu **dest_vcpu) 340 - { 341 - int r = 0; 342 - unsigned long i; 343 - struct kvm_vcpu *vcpu; 344 - 345 - if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) 346 - return true; 347 - 348 - kvm_for_each_vcpu(i, vcpu, kvm) { 349 - if (!kvm_apic_present(vcpu)) 350 - continue; 351 - 352 - if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, 353 - irq->dest_id, irq->dest_mode)) 354 - continue; 355 - 356 - if (++r == 2) 357 - return false; 358 - 359 - *dest_vcpu = vcpu; 360 - } 361 - 362 - 
return r == 1; 363 - } 364 - EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); 365 - 366 - #define IOAPIC_ROUTING_ENTRY(irq) \ 367 - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 368 - .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } 369 - #define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq) 370 - 371 - #define PIC_ROUTING_ENTRY(irq) \ 372 - { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ 373 - .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } } 374 - #define ROUTING_ENTRY2(irq) \ 375 - IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq) 376 - 377 - static const struct kvm_irq_routing_entry default_routing[] = { 378 - ROUTING_ENTRY2(0), ROUTING_ENTRY2(1), 379 - ROUTING_ENTRY2(2), ROUTING_ENTRY2(3), 380 - ROUTING_ENTRY2(4), ROUTING_ENTRY2(5), 381 - ROUTING_ENTRY2(6), ROUTING_ENTRY2(7), 382 - ROUTING_ENTRY2(8), ROUTING_ENTRY2(9), 383 - ROUTING_ENTRY2(10), ROUTING_ENTRY2(11), 384 - ROUTING_ENTRY2(12), ROUTING_ENTRY2(13), 385 - ROUTING_ENTRY2(14), ROUTING_ENTRY2(15), 386 - ROUTING_ENTRY1(16), ROUTING_ENTRY1(17), 387 - ROUTING_ENTRY1(18), ROUTING_ENTRY1(19), 388 - ROUTING_ENTRY1(20), ROUTING_ENTRY1(21), 389 - ROUTING_ENTRY1(22), ROUTING_ENTRY1(23), 390 - }; 391 - 392 - int kvm_setup_default_irq_routing(struct kvm *kvm) 393 - { 394 - return kvm_set_irq_routing(kvm, default_routing, 395 - ARRAY_SIZE(default_routing), 0); 396 - } 397 - 398 - void kvm_arch_post_irq_routing_update(struct kvm *kvm) 399 - { 400 - if (!irqchip_split(kvm)) 401 - return; 402 - kvm_make_scan_ioapic_request(kvm); 403 - } 404 - 405 - void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, 406 - u8 vector, unsigned long *ioapic_handled_vectors) 407 - { 408 - /* 409 - * Intercept EOI if the vCPU is the target of the new IRQ routing, or 410 - * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU 411 - * may receive a level-triggered IRQ in the future, or already received 412 - * level-triggered IRQ. 
The EOI needs to be intercepted and forwarded 413 - * to I/O APIC emulation so that the IRQ can be de-asserted. 414 - */ 415 - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { 416 - __set_bit(vector, ioapic_handled_vectors); 417 - } else if (kvm_apic_pending_eoi(vcpu, vector)) { 418 - __set_bit(vector, ioapic_handled_vectors); 419 - 420 - /* 421 - * Track the highest pending EOI for which the vCPU is NOT the 422 - * target in the new routing. Only the EOI for the IRQ that is 423 - * in-flight (for the old routing) needs to be intercepted, any 424 - * future IRQs that arrive on this vCPU will be coincidental to 425 - * the level-triggered routing and don't need to be intercepted. 426 - */ 427 - if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) 428 - vcpu->arch.highest_stale_pending_ioapic_eoi = vector; 429 - } 430 - } 431 - 432 - void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, 433 - ulong *ioapic_handled_vectors) 434 - { 435 - struct kvm *kvm = vcpu->kvm; 436 - struct kvm_kernel_irq_routing_entry *entry; 437 - struct kvm_irq_routing_table *table; 438 - u32 i, nr_ioapic_pins; 439 - int idx; 440 - 441 - idx = srcu_read_lock(&kvm->irq_srcu); 442 - table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 443 - nr_ioapic_pins = min_t(u32, table->nr_rt_entries, 444 - kvm->arch.nr_reserved_ioapic_pins); 445 - for (i = 0; i < nr_ioapic_pins; ++i) { 446 - hlist_for_each_entry(entry, &table->map[i], link) { 447 - struct kvm_lapic_irq irq; 448 - 449 - if (entry->type != KVM_IRQ_ROUTING_MSI) 450 - continue; 451 - 452 - kvm_set_msi_irq(vcpu->kvm, entry, &irq); 453 - 454 - if (!irq.trig_mode) 455 - continue; 456 - 457 - kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, 458 - irq.vector, ioapic_handled_vectors); 459 - } 460 - } 461 - srcu_read_unlock(&kvm->irq_srcu, idx); 462 - } 463 - 464 - void kvm_arch_irq_routing_update(struct kvm *kvm) 465 - { 466 - #ifdef CONFIG_KVM_HYPERV 467 - kvm_hv_irq_routing_update(kvm); 468 - #endif 
469 - }

+6 -1

arch/x86/kvm/lapic.c

··· 1455 1455 1456 1456 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) 1457 1457 { 1458 - int trigger_mode; 1458 + int __maybe_unused trigger_mode; 1459 1459 1460 1460 /* Eoi the ioapic only if the ioapic doesn't own the vector. */ 1461 1461 if (!kvm_ioapic_handles_vector(apic, vector)) ··· 1476 1476 return; 1477 1477 } 1478 1478 1479 + #ifdef CONFIG_KVM_IOAPIC 1479 1480 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 1480 1481 trigger_mode = IOAPIC_LEVEL_TRIG; 1481 1482 else 1482 1483 trigger_mode = IOAPIC_EDGE_TRIG; 1483 1484 1484 1485 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); 1486 + #endif 1485 1487 } 1486 1488 1487 1489 static int apic_set_eoi(struct kvm_lapic *apic) ··· 3148 3146 kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); 3149 3147 } 3150 3148 kvm_make_request(KVM_REQ_EVENT, vcpu); 3149 + 3150 + #ifdef CONFIG_KVM_IOAPIC 3151 3151 if (ioapic_in_kernel(vcpu->kvm)) 3152 3152 kvm_rtc_eoi_tracking_restore_one(vcpu); 3153 + #endif 3153 3154 3154 3155 vcpu->arch.apic_arb_prio = 0; 3155 3156

+302 -390

arch/x86/kvm/svm/avic.c

··· 18 18 #include <linux/hashtable.h> 19 19 #include <linux/amd-iommu.h> 20 20 #include <linux/kvm_host.h> 21 + #include <linux/kvm_irqfd.h> 21 22 22 23 #include <asm/irq_remapping.h> 23 24 #include <asm/msr.h> ··· 30 29 #include "svm.h" 31 30 32 31 /* 33 - * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID, 34 - * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry 35 - * if an interrupt can't be delivered, e.g. because the vCPU isn't running. 32 + * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that 33 + * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't 34 + * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index 35 + * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast 36 + * lookup on the index, where as vCPUs whose index doesn't match their ID need 37 + * to walk the entire xarray of vCPUs in the worst case scenario. 36 38 * 37 - * For the vCPU ID, use however many bits are currently allowed for the max 39 + * For the vCPU index, use however many bits are currently allowed for the max 38 40 * guest physical APIC ID (limited by the size of the physical ID table), and 39 41 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the 40 42 * size of the GATag is defined by hardware (32 bits), but is an opaque value 41 43 * as far as hardware is concerned. 
42 44 */ 43 - #define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK 45 + #define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK 44 46 45 47 #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) 46 48 #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) 47 49 48 50 #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) 49 - #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) 51 + #define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK) 50 52 51 - #define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ 52 - ((vcpu_id) & AVIC_VCPU_ID_MASK)) 53 - #define AVIC_GATAG(vm_id, vcpu_id) \ 53 + #define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ 54 + ((vcpu_idx) & AVIC_VCPU_IDX_MASK)) 55 + #define AVIC_GATAG(vm_id, vcpu_idx) \ 54 56 ({ \ 55 - u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ 57 + u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \ 56 58 \ 57 - WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ 59 + WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \ 58 60 WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ 59 61 ga_tag; \ 60 62 }) 61 63 62 - static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); 64 + static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u); 63 65 64 66 static bool force_avic; 65 67 module_param_unsafe(force_avic, bool, 0444); ··· 78 74 static bool next_vm_id_wrapped = 0; 79 75 static DEFINE_SPINLOCK(svm_vm_data_hash_lock); 80 76 bool x2avic_enabled; 81 - 82 - /* 83 - * This is a wrapper of struct amd_iommu_ir_data. 
84 - */ 85 - struct amd_svm_iommu_ir { 86 - struct list_head node; /* Used by SVM for per-vcpu ir_list */ 87 - void *data; /* Storing pointer to struct amd_ir_data */ 88 - }; 89 77 90 78 static void avic_activate_vmcb(struct vcpu_svm *svm) 91 79 { ··· 143 147 struct kvm_svm *kvm_svm; 144 148 struct kvm_vcpu *vcpu = NULL; 145 149 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); 146 - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); 150 + u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag); 147 151 148 - pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); 149 - trace_kvm_avic_ga_log(vm_id, vcpu_id); 152 + pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx); 153 + trace_kvm_avic_ga_log(vm_id, vcpu_idx); 150 154 151 155 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 152 156 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { 153 157 if (kvm_svm->avic_vm_id != vm_id) 154 158 continue; 155 - vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); 159 + vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx); 156 160 break; 157 161 } 158 162 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); ··· 176 180 if (!enable_apicv) 177 181 return; 178 182 179 - if (kvm_svm->avic_logical_id_table_page) 180 - __free_page(kvm_svm->avic_logical_id_table_page); 181 - if (kvm_svm->avic_physical_id_table_page) 182 - __free_page(kvm_svm->avic_physical_id_table_page); 183 + free_page((unsigned long)kvm_svm->avic_logical_id_table); 184 + free_page((unsigned long)kvm_svm->avic_physical_id_table); 183 185 184 186 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 185 187 hash_del(&kvm_svm->hnode); ··· 190 196 int err = -ENOMEM; 191 197 struct kvm_svm *kvm_svm = to_kvm_svm(kvm); 192 198 struct kvm_svm *k2; 193 - struct page *p_page; 194 - struct page *l_page; 195 199 u32 vm_id; 196 200 197 201 if (!enable_apicv) 198 202 return 0; 199 203 200 - /* Allocating physical APIC ID table (4KB) */ 201 - p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 202 - if (!p_page) 
204 + kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 205 + if (!kvm_svm->avic_physical_id_table) 203 206 goto free_avic; 204 207 205 - kvm_svm->avic_physical_id_table_page = p_page; 206 - 207 - /* Allocating logical APIC ID table (4KB) */ 208 - l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 209 - if (!l_page) 208 + kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 209 + if (!kvm_svm->avic_logical_id_table) 210 210 goto free_avic; 211 - 212 - kvm_svm->avic_logical_id_table_page = l_page; 213 211 214 212 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 215 213 again: ··· 228 242 return err; 229 243 } 230 244 245 + static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm) 246 + { 247 + return __sme_set(__pa(svm->vcpu.arch.apic->regs)); 248 + } 249 + 231 250 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) 232 251 { 233 252 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); 234 - phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 235 - phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); 236 - phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); 237 253 238 - vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 239 - vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; 240 - vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; 241 - vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; 254 + vmcb->control.avic_backing_page = avic_get_backing_page_address(svm); 255 + vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table)); 256 + vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table)); 257 + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE; 242 258 243 259 if (kvm_apicv_activated(svm->vcpu.kvm)) 244 260 avic_activate_vmcb(svm); ··· 248 260 avic_deactivate_vmcb(svm); 249 261 } 250 262 251 - static u64 
*avic_get_physical_id_entry(struct kvm_vcpu *vcpu, 252 - unsigned int index) 253 - { 254 - u64 *avic_physical_id_table; 255 - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 256 - 257 - if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || 258 - (index > X2AVIC_MAX_PHYSICAL_ID)) 259 - return NULL; 260 - 261 - avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); 262 - 263 - return &avic_physical_id_table[index]; 264 - } 265 - 266 263 static int avic_init_backing_page(struct kvm_vcpu *vcpu) 267 264 { 268 - u64 *entry, new_entry; 269 - int id = vcpu->vcpu_id; 265 + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 270 266 struct vcpu_svm *svm = to_svm(vcpu); 267 + u32 id = vcpu->vcpu_id; 268 + u64 new_entry; 271 269 270 + /* 271 + * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC 272 + * hardware. Immediately clear apicv_active, i.e. don't wait until the 273 + * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as 274 + * avic_vcpu_load() expects to be called if and only if the vCPU has 275 + * fully initialized AVIC. 276 + */ 272 277 if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || 273 - (id > X2AVIC_MAX_PHYSICAL_ID)) 274 - return -EINVAL; 278 + (id > X2AVIC_MAX_PHYSICAL_ID)) { 279 + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); 280 + vcpu->arch.apic->apicv_active = false; 281 + return 0; 282 + } 275 283 276 - if (!vcpu->arch.apic->regs) 284 + BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE || 285 + (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE); 286 + 287 + if (WARN_ON_ONCE(!vcpu->arch.apic->regs)) 277 288 return -EINVAL; 278 289 279 290 if (kvm_apicv_activated(vcpu->kvm)) { ··· 289 302 return ret; 290 303 } 291 304 292 - svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); 305 + /* Note, fls64() returns the bit position, +1. 
*/ 306 + BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT > 307 + fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK)); 293 308 294 309 /* Setting AVIC backing page address in the phy APIC ID table */ 295 - entry = avic_get_physical_id_entry(vcpu, id); 296 - if (!entry) 297 - return -EINVAL; 310 + new_entry = avic_get_backing_page_address(svm) | 311 + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; 312 + svm->avic_physical_id_entry = new_entry; 298 313 299 - new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & 300 - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 301 - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); 302 - WRITE_ONCE(*entry, new_entry); 303 - 304 - svm->avic_physical_id_cache = entry; 314 + /* 315 + * Initialize the real table, as vCPUs must have a valid entry in order 316 + * for broadcast IPIs to function correctly (broadcast IPIs ignore 317 + * invalid entries, i.e. aren't guaranteed to generate a VM-Exit). 318 + */ 319 + WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry); 305 320 306 321 return 0; 307 322 } ··· 437 448 if (apic_x2apic_mode(source)) 438 449 avic_logical_id_table = NULL; 439 450 else 440 - avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); 451 + avic_logical_id_table = kvm_svm->avic_logical_id_table; 441 452 442 453 /* 443 454 * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical ··· 539 550 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 540 551 { 541 552 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 542 - u32 *logical_apic_id_table; 543 553 u32 cluster, index; 544 554 545 555 ldr = GET_APIC_LOGICAL_ID(ldr); ··· 559 571 return NULL; 560 572 index += (cluster << 2); 561 573 562 - logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); 563 - 564 - return &logical_apic_id_table[index]; 574 + return &kvm_svm->avic_logical_id_table[index]; 565 575 } 566 576 567 577 static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) ··· 708 722 int ret; 709 723 
struct kvm_vcpu *vcpu = &svm->vcpu; 710 724 725 + INIT_LIST_HEAD(&svm->ir_list); 726 + spin_lock_init(&svm->ir_list_lock); 727 + 711 728 if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) 712 729 return 0; 713 730 ··· 718 729 if (ret) 719 730 return ret; 720 731 721 - INIT_LIST_HEAD(&svm->ir_list); 722 - spin_lock_init(&svm->ir_list_lock); 723 732 svm->dfr_reg = APIC_DFR_FLAT; 724 733 725 734 return ret; ··· 729 742 avic_handle_ldr_update(vcpu); 730 743 } 731 744 732 - static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) 745 + static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) 733 746 { 734 - int ret = 0; 747 + struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu; 735 748 unsigned long flags; 736 - struct amd_svm_iommu_ir *ir; 737 - struct vcpu_svm *svm = to_svm(vcpu); 738 749 739 - if (!kvm_arch_has_assigned_device(vcpu->kvm)) 740 - return 0; 750 + if (!vcpu) 751 + return; 741 752 753 + spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); 754 + list_del(&irqfd->vcpu_list); 755 + spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); 756 + } 757 + 758 + int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 759 + unsigned int host_irq, uint32_t guest_irq, 760 + struct kvm_vcpu *vcpu, u32 vector) 761 + { 742 762 /* 743 - * Here, we go through the per-vcpu ir_list to update all existing 744 - * interrupt remapping table entry targeting this vcpu. 763 + * If the IRQ was affined to a different vCPU, remove the IRTE metadata 764 + * from the *previous* vCPU's list. 745 765 */ 746 - spin_lock_irqsave(&svm->ir_list_lock, flags); 766 + svm_ir_list_del(irqfd); 747 767 748 - if (list_empty(&svm->ir_list)) 749 - goto out; 768 + if (vcpu) { 769 + /* 770 + * Try to enable guest_mode in IRTE, unless AVIC is inhibited, 771 + * in which case configure the IRTE for legacy mode, but track 772 + * the IRTE metadata so that it can be converted to guest mode 773 + * if AVIC is enabled/uninhibited in the future. 
774 + */ 775 + struct amd_iommu_pi_data pi_data = { 776 + .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 777 + vcpu->vcpu_idx), 778 + .is_guest_mode = kvm_vcpu_apicv_active(vcpu), 779 + .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)), 780 + .vector = vector, 781 + }; 782 + struct vcpu_svm *svm = to_svm(vcpu); 783 + u64 entry; 784 + int ret; 750 785 751 - list_for_each_entry(ir, &svm->ir_list, node) { 752 - if (activate) 753 - ret = amd_iommu_activate_guest_mode(ir->data); 754 - else 755 - ret = amd_iommu_deactivate_guest_mode(ir->data); 786 + /* 787 + * Prevent the vCPU from being scheduled out or migrated until 788 + * the IRTE is updated and its metadata has been added to the 789 + * list of IRQs being posted to the vCPU, to ensure the IRTE 790 + * isn't programmed with stale pCPU/IsRunning information. 791 + */ 792 + guard(spinlock_irqsave)(&svm->ir_list_lock); 793 + 794 + /* 795 + * Update the target pCPU for IOMMU doorbells if the vCPU is 796 + * running. If the vCPU is NOT running, i.e. is blocking or 797 + * scheduled out, KVM will update the pCPU info when the vCPU 798 + * is awakened and/or scheduled in. See also avic_vcpu_load(). 
799 + */ 800 + entry = svm->avic_physical_id_entry; 801 + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) { 802 + pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 803 + } else { 804 + pi_data.cpu = -1; 805 + pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; 806 + } 807 + 808 + ret = irq_set_vcpu_affinity(host_irq, &pi_data); 756 809 if (ret) 757 - break; 758 - } 759 - out: 760 - spin_unlock_irqrestore(&svm->ir_list_lock, flags); 761 - return ret; 762 - } 810 + return ret; 763 811 764 - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 765 - { 766 - unsigned long flags; 767 - struct amd_svm_iommu_ir *cur; 768 - 769 - spin_lock_irqsave(&svm->ir_list_lock, flags); 770 - list_for_each_entry(cur, &svm->ir_list, node) { 771 - if (cur->data != pi->ir_data) 772 - continue; 773 - list_del(&cur->node); 774 - kfree(cur); 775 - break; 776 - } 777 - spin_unlock_irqrestore(&svm->ir_list_lock, flags); 778 - } 779 - 780 - static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) 781 - { 782 - int ret = 0; 783 - unsigned long flags; 784 - struct amd_svm_iommu_ir *ir; 785 - u64 entry; 786 - 787 - if (WARN_ON_ONCE(!pi->ir_data)) 788 - return -EINVAL; 789 - 790 - /** 791 - * In some cases, the existing irte is updated and re-set, 792 - * so we need to check here if it's already been * added 793 - * to the ir_list. 794 - */ 795 - if (pi->prev_ga_tag) { 796 - struct kvm *kvm = svm->vcpu.kvm; 797 - u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); 798 - struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); 799 - struct vcpu_svm *prev_svm; 800 - 801 - if (!prev_vcpu) { 802 - ret = -EINVAL; 803 - goto out; 812 + /* 813 + * Revert to legacy mode if the IOMMU didn't provide metadata 814 + * for the IRTE, which KVM needs to keep the IRTE up-to-date, 815 + * e.g. if the vCPU is migrated or AVIC is disabled. 
816 + */ 817 + if (WARN_ON_ONCE(!pi_data.ir_data)) { 818 + irq_set_vcpu_affinity(host_irq, NULL); 819 + return -EIO; 804 820 } 805 821 806 - prev_svm = to_svm(prev_vcpu); 807 - svm_ir_list_del(prev_svm, pi); 822 + irqfd->irq_bypass_data = pi_data.ir_data; 823 + list_add(&irqfd->vcpu_list, &svm->ir_list); 824 + return 0; 808 825 } 826 + return irq_set_vcpu_affinity(host_irq, NULL); 827 + } 809 828 810 - /** 811 - * Allocating new amd_iommu_pi_data, which will get 812 - * add to the per-vcpu ir_list. 829 + enum avic_vcpu_action { 830 + /* 831 + * There is no need to differentiate between activate and deactivate, 832 + * as KVM only refreshes AVIC state when the vCPU is scheduled in and 833 + * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is 834 + * being (de)activated. 813 835 */ 814 - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); 815 - if (!ir) { 816 - ret = -ENOMEM; 817 - goto out; 818 - } 819 - ir->data = pi->ir_data; 820 - 821 - spin_lock_irqsave(&svm->ir_list_lock, flags); 836 + AVIC_TOGGLE_ON_OFF = BIT(0), 837 + AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF, 838 + AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF, 822 839 823 840 /* 824 - * Update the target pCPU for IOMMU doorbells if the vCPU is running. 825 - * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM 826 - * will update the pCPU info when the vCPU awkened and/or scheduled in. 827 - * See also avic_vcpu_load(). 841 + * No unique action is required to deal with a vCPU that stops/starts 842 + * running. A vCPU that starts running by definition stops blocking as 843 + * well, and a vCPU that stops running can't have been blocking, i.e. 844 + * doesn't need to toggle GALogIntr. 
828 845 */ 829 - entry = READ_ONCE(*(svm->avic_physical_id_cache)); 830 - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) 831 - amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, 832 - true, pi->ir_data); 846 + AVIC_START_RUNNING = 0, 847 + AVIC_STOP_RUNNING = 0, 833 848 834 - list_add(&ir->node, &svm->ir_list); 835 - spin_unlock_irqrestore(&svm->ir_list_lock, flags); 836 - out: 837 - return ret; 838 - } 849 + /* 850 + * When a vCPU starts blocking, KVM needs to set the GALogIntr flag 851 + * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is 852 + * sent to the vCPU. 853 + */ 854 + AVIC_START_BLOCKING = BIT(1), 855 + }; 839 856 840 - /* 841 - * Note: 842 - * The HW cannot support posting multicast/broadcast 843 - * interrupts to a vCPU. So, we still use legacy interrupt 844 - * remapping for these kind of interrupts. 845 - * 846 - * For lowest-priority interrupts, we only support 847 - * those with single CPU as the destination, e.g. user 848 - * configures the interrupts via /proc/irq or uses 849 - * irqbalance to make the interrupts single-CPU. 
850 - */ 851 - static int 852 - get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, 853 - struct vcpu_data *vcpu_info, struct vcpu_svm **svm) 857 + static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, 858 + enum avic_vcpu_action action) 854 859 { 855 - struct kvm_lapic_irq irq; 856 - struct kvm_vcpu *vcpu = NULL; 857 - 858 - kvm_set_msi_irq(kvm, e, &irq); 859 - 860 - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 861 - !kvm_irq_is_postable(&irq)) { 862 - pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", 863 - __func__, irq.vector); 864 - return -1; 865 - } 866 - 867 - pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 868 - irq.vector); 869 - *svm = to_svm(vcpu); 870 - vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 871 - vcpu_info->vector = irq.vector; 872 - 873 - return 0; 874 - } 875 - 876 - /* 877 - * avic_pi_update_irte - set IRTE for Posted-Interrupts 878 - * 879 - * @kvm: kvm 880 - * @host_irq: host irq of the interrupt 881 - * @guest_irq: gsi of the interrupt 882 - * @set: set or unset PI 883 - * returns 0 on success, < 0 on failure 884 - */ 885 - int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 886 - uint32_t guest_irq, bool set) 887 - { 888 - struct kvm_kernel_irq_routing_entry *e; 889 - struct kvm_irq_routing_table *irq_rt; 890 - bool enable_remapped_mode = true; 891 - int idx, ret = 0; 892 - 893 - if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) 894 - return 0; 895 - 896 - pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", 897 - __func__, host_irq, guest_irq, set); 898 - 899 - idx = srcu_read_lock(&kvm->irq_srcu); 900 - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 901 - 902 - if (guest_irq >= irq_rt->nr_rt_entries || 903 - hlist_empty(&irq_rt->map[guest_irq])) { 904 - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 905 - guest_irq, irq_rt->nr_rt_entries); 906 - goto out; 907 - } 
908 - 909 - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 910 - struct vcpu_data vcpu_info; 911 - struct vcpu_svm *svm = NULL; 912 - 913 - if (e->type != KVM_IRQ_ROUTING_MSI) 914 - continue; 915 - 916 - /** 917 - * Here, we setup with legacy mode in the following cases: 918 - * 1. When cannot target interrupt to a specific vcpu. 919 - * 2. Unsetting posted interrupt. 920 - * 3. APIC virtualization is disabled for the vcpu. 921 - * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) 922 - */ 923 - if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && 924 - kvm_vcpu_apicv_active(&svm->vcpu)) { 925 - struct amd_iommu_pi_data pi; 926 - 927 - enable_remapped_mode = false; 928 - 929 - /* Try to enable guest_mode in IRTE */ 930 - pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 931 - AVIC_HPA_MASK); 932 - pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, 933 - svm->vcpu.vcpu_id); 934 - pi.is_guest_mode = true; 935 - pi.vcpu_data = &vcpu_info; 936 - ret = irq_set_vcpu_affinity(host_irq, &pi); 937 - 938 - /** 939 - * Here, we successfully setting up vcpu affinity in 940 - * IOMMU guest mode. Now, we need to store the posted 941 - * interrupt information in a per-vcpu ir_list so that 942 - * we can reference to them directly when we update vcpu 943 - * scheduling information in IOMMU irte. 944 - */ 945 - if (!ret && pi.is_guest_mode) 946 - svm_ir_list_add(svm, &pi); 947 - } 948 - 949 - if (!ret && svm) { 950 - trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, 951 - e->gsi, vcpu_info.vector, 952 - vcpu_info.pi_desc_addr, set); 953 - } 954 - 955 - if (ret < 0) { 956 - pr_err("%s: failed to update PI IRTE\n", __func__); 957 - goto out; 958 - } 959 - } 960 - 961 - ret = 0; 962 - if (enable_remapped_mode) { 963 - /* Use legacy mode in IRTE */ 964 - struct amd_iommu_pi_data pi; 965 - 966 - /** 967 - * Here, pi is used to: 968 - * - Tell IOMMU to use legacy mode for this interrupt. 969 - * - Retrieve ga_tag of prior interrupt remapping data. 
970 - */ 971 - pi.prev_ga_tag = 0; 972 - pi.is_guest_mode = false; 973 - ret = irq_set_vcpu_affinity(host_irq, &pi); 974 - 975 - /** 976 - * Check if the posted interrupt was previously 977 - * setup with the guest_mode by checking if the ga_tag 978 - * was cached. If so, we need to clean up the per-vcpu 979 - * ir_list. 980 - */ 981 - if (!ret && pi.prev_ga_tag) { 982 - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); 983 - struct kvm_vcpu *vcpu; 984 - 985 - vcpu = kvm_get_vcpu_by_id(kvm, id); 986 - if (vcpu) 987 - svm_ir_list_del(to_svm(vcpu), &pi); 988 - } 989 - } 990 - out: 991 - srcu_read_unlock(&kvm->irq_srcu, idx); 992 - return ret; 993 - } 994 - 995 - static inline int 996 - avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) 997 - { 998 - int ret = 0; 999 - struct amd_svm_iommu_ir *ir; 860 + bool ga_log_intr = (action & AVIC_START_BLOCKING); 1000 861 struct vcpu_svm *svm = to_svm(vcpu); 862 + struct kvm_kernel_irqfd *irqfd; 1001 863 1002 864 lockdep_assert_held(&svm->ir_list_lock); 1003 865 1004 - if (!kvm_arch_has_assigned_device(vcpu->kvm)) 1005 - return 0; 1006 - 1007 866 /* 1008 867 * Here, we go through the per-vcpu ir_list to update all existing 1009 868 * interrupt remapping table entry targeting this vcpu. 
1010 869 */ 1011 870 if (list_empty(&svm->ir_list)) 1012 - return 0; 871 + return; 1013 872 1014 - list_for_each_entry(ir, &svm->ir_list, node) { 1015 - ret = amd_iommu_update_ga(cpu, r, ir->data); 1016 - if (ret) 1017 - return ret; 873 + list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) { 874 + void *data = irqfd->irq_bypass_data; 875 + 876 + if (!(action & AVIC_TOGGLE_ON_OFF)) 877 + WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr)); 878 + else if (cpu >= 0) 879 + WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr)); 880 + else 881 + WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data)); 1018 882 } 1019 - return 0; 1020 883 } 1021 884 1022 - void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 885 + static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, 886 + enum avic_vcpu_action action) 1023 887 { 1024 - u64 entry; 888 + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 1025 889 int h_physical_id = kvm_cpu_get_apicid(cpu); 1026 890 struct vcpu_svm *svm = to_svm(vcpu); 1027 891 unsigned long flags; 892 + u64 entry; 1028 893 1029 894 lockdep_assert_preemption_disabled(); 1030 895 1031 896 if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) 1032 897 return; 1033 898 1034 - /* 1035 - * No need to update anything if the vCPU is blocking, i.e. if the vCPU 1036 - * is being scheduled in after being preempted. The CPU entries in the 1037 - * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. 1038 - * If the vCPU was migrated, its new CPU value will be stuffed when the 1039 - * vCPU unblocks. 
1040 - */ 1041 - if (kvm_vcpu_is_blocking(vcpu)) 899 + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) 1042 900 return; 1043 901 1044 902 /* ··· 895 1063 */ 896 1064 spin_lock_irqsave(&svm->ir_list_lock, flags); 897 1065 898 - entry = READ_ONCE(*(svm->avic_physical_id_cache)); 1066 + entry = svm->avic_physical_id_entry; 899 1067 WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 900 1068 901 - entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 1069 + entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK | 1070 + AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); 902 1071 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); 903 1072 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 904 1073 905 - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 906 - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); 1074 + svm->avic_physical_id_entry = entry; 1075 + 1076 + /* 1077 + * If IPI virtualization is disabled, clear IsRunning when updating the 1078 + * actual Physical ID table, so that the CPU never sees IsRunning=1. 1079 + * Keep the APIC ID up-to-date in the entry to minimize the chances of 1080 + * things going sideways if hardware peeks at the ID. 1081 + */ 1082 + if (!enable_ipiv) 1083 + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1084 + 1085 + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); 1086 + 1087 + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); 907 1088 908 1089 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 909 1090 } 910 1091 911 - void avic_vcpu_put(struct kvm_vcpu *vcpu) 1092 + void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 912 1093 { 913 - u64 entry; 1094 + /* 1095 + * No need to update anything if the vCPU is blocking, i.e. if the vCPU 1096 + * is being scheduled in after being preempted. The CPU entries in the 1097 + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. 
1098 + * If the vCPU was migrated, its new CPU value will be stuffed when the 1099 + * vCPU unblocks. 1100 + */ 1101 + if (kvm_vcpu_is_blocking(vcpu)) 1102 + return; 1103 + 1104 + __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING); 1105 + } 1106 + 1107 + static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) 1108 + { 1109 + struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); 914 1110 struct vcpu_svm *svm = to_svm(vcpu); 915 1111 unsigned long flags; 1112 + u64 entry = svm->avic_physical_id_entry; 916 1113 917 1114 lockdep_assert_preemption_disabled(); 918 1115 919 - /* 920 - * Note, reading the Physical ID entry outside of ir_list_lock is safe 921 - * as only the pCPU that has loaded (or is loading) the vCPU is allowed 922 - * to modify the entry, and preemption is disabled. I.e. the vCPU 923 - * can't be scheduled out and thus avic_vcpu_{put,load}() can't run 924 - * recursively. 925 - */ 926 - entry = READ_ONCE(*(svm->avic_physical_id_cache)); 927 - 928 - /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ 929 - if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) 1116 + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) 930 1117 return; 931 1118 932 1119 /* ··· 958 1107 */ 959 1108 spin_lock_irqsave(&svm->ir_list_lock, flags); 960 1109 961 - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); 1110 + avic_update_iommu_vcpu_affinity(vcpu, -1, action); 962 1111 1112 + WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR); 1113 + 1114 + /* 1115 + * Keep the previous APIC ID in the entry so that a rogue doorbell from 1116 + * hardware is at least restricted to a CPU associated with the vCPU. 1117 + */ 963 1118 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 964 - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 1119 + 1120 + if (enable_ipiv) 1121 + WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry); 1122 + 1123 + /* 1124 + * Note! 
Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as 1125 + * it's a synthetic flag that usurps an unused should-be-zero bit. 1126 + */ 1127 + if (action & AVIC_START_BLOCKING) 1128 + entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR; 1129 + 1130 + svm->avic_physical_id_entry = entry; 965 1131 966 1132 spin_unlock_irqrestore(&svm->ir_list_lock, flags); 1133 + } 967 1134 1135 + void avic_vcpu_put(struct kvm_vcpu *vcpu) 1136 + { 1137 + /* 1138 + * Note, reading the Physical ID entry outside of ir_list_lock is safe 1139 + * as only the pCPU that has loaded (or is loading) the vCPU is allowed 1140 + * to modify the entry, and preemption is disabled. I.e. the vCPU 1141 + * can't be scheduled out and thus avic_vcpu_{put,load}() can't run 1142 + * recursively. 1143 + */ 1144 + u64 entry = to_svm(vcpu)->avic_physical_id_entry; 1145 + 1146 + /* 1147 + * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the 1148 + * vCPU is preempted while its in the process of blocking. WARN if the 1149 + * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put 1150 + * the AVIC if it wasn't previously loaded. 1151 + */ 1152 + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) { 1153 + if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu))) 1154 + return; 1155 + 1156 + /* 1157 + * The vCPU was preempted while blocking, ensure its IRTEs are 1158 + * configured to generate GA Log Interrupts. 1159 + */ 1160 + if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR)))) 1161 + return; 1162 + } 1163 + 1164 + __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? 
AVIC_START_BLOCKING : 1165 + AVIC_STOP_RUNNING); 968 1166 } 969 1167 970 1168 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) ··· 1042 1142 1043 1143 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 1044 1144 { 1045 - bool activated = kvm_vcpu_apicv_active(vcpu); 1046 - 1047 1145 if (!enable_apicv) 1048 1146 return; 1049 1147 1148 + /* APICv should only be toggled on/off while the vCPU is running. */ 1149 + WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu)); 1150 + 1050 1151 avic_refresh_virtual_apic_mode(vcpu); 1051 1152 1052 - if (activated) 1053 - avic_vcpu_load(vcpu, vcpu->cpu); 1153 + if (kvm_vcpu_apicv_active(vcpu)) 1154 + __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE); 1054 1155 else 1055 - avic_vcpu_put(vcpu); 1056 - 1057 - avic_set_pi_irte_mode(vcpu, activated); 1156 + __avic_vcpu_put(vcpu, AVIC_DEACTIVATE); 1058 1157 } 1059 1158 1060 1159 void avic_vcpu_blocking(struct kvm_vcpu *vcpu) ··· 1061 1162 if (!kvm_vcpu_apicv_active(vcpu)) 1062 1163 return; 1063 1164 1064 - /* 1065 - * Unload the AVIC when the vCPU is about to block, _before_ 1066 - * the vCPU actually blocks. 1067 - * 1068 - * Any IRQs that arrive before IsRunning=0 will not cause an 1069 - * incomplete IPI vmexit on the source, therefore vIRR will also 1070 - * be checked by kvm_vcpu_check_block() before blocking. The 1071 - * memory barrier implicit in set_current_state orders writing 1072 - * IsRunning=0 before reading the vIRR. The processor needs a 1073 - * matching memory barrier on interrupt delivery between writing 1074 - * IRR and reading IsRunning; the lack of this barrier might be 1075 - * the cause of errata #1235). 1076 - */ 1077 - avic_vcpu_put(vcpu); 1165 + /* 1166 + * Unload the AVIC when the vCPU is about to block, _before_ the vCPU 1167 + * actually blocks. 
1168 + * 1169 + * Note, any IRQs that arrive before IsRunning=0 will not cause an 1170 + * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles 1171 + * this by checking vIRR one last time before blocking. The memory 1172 + * barrier implicit in set_current_state orders writing IsRunning=0 1173 + * before reading the vIRR. The processor needs a matching memory 1174 + * barrier on interrupt delivery between writing IRR and reading 1175 + * IsRunning; the lack of this barrier might be the cause of errata #1235). 1176 + * 1177 + * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM 1178 + * doesn't need to detect events for scheduling purposes. The doorbell 1179 + * used to signal running vCPUs cannot be blocked, i.e. will perturb the 1180 + * CPU and cause noisy neighbor problems if the VM is sending interrupts 1181 + * to the vCPU while it's scheduled out. 1182 + */ 1183 + __avic_vcpu_put(vcpu, AVIC_START_BLOCKING); 1078 1184 } 1079 1185 1080 1186 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) ··· 1131 1227 x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); 1132 1228 if (x2avic_enabled) 1133 1229 pr_info("x2AVIC enabled\n"); 1230 + 1231 + /* 1232 + * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) 1233 + * due to erratum 1235, which results in missed VM-Exits on the sender 1234 + * and thus missed wake events for blocking vCPUs due to the CPU 1235 + * failing to see a software update to clear IsRunning. 1236 + */ 1237 + enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17; 1134 1238 1135 1239 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1136 1240

+4

arch/x86/kvm/svm/svm.c

··· 232 232 */ 233 233 static bool avic; 234 234 module_param(avic, bool, 0444); 235 + module_param(enable_ipiv, bool, 0444); 235 236 236 237 module_param(enable_device_posted_irqs, bool, 0444); 237 238 ··· 1490 1489 static void svm_vcpu_free(struct kvm_vcpu *vcpu) 1491 1490 { 1492 1491 struct vcpu_svm *svm = to_svm(vcpu); 1492 + 1493 + WARN_ON_ONCE(!list_empty(&svm->ir_list)); 1493 1494 1494 1495 svm_leave_nested(vcpu); 1495 1496 svm_free_nested(svm); ··· 5584 5581 enable_apicv = avic = avic && avic_hardware_setup(); 5585 5582 5586 5583 if (!enable_apicv) { 5584 + enable_ipiv = false; 5587 5585 svm_x86_ops.vcpu_blocking = NULL; 5588 5586 svm_x86_ops.vcpu_unblocking = NULL; 5589 5587 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;

+21 -11

arch/x86/kvm/svm/svm.h

··· 123 123 124 124 /* Struct members for AVIC */ 125 125 u32 avic_vm_id; 126 - struct page *avic_logical_id_table_page; 127 - struct page *avic_physical_id_table_page; 126 + u32 *avic_logical_id_table; 127 + u64 *avic_physical_id_table; 128 128 struct hlist_node hnode; 129 129 130 130 struct kvm_sev_info sev_info; ··· 306 306 307 307 u32 ldr_reg; 308 308 u32 dfr_reg; 309 - struct page *avic_backing_page; 310 - u64 *avic_physical_id_cache; 309 + 310 + /* This is essentially a shadow of the vCPU's actual entry in the 311 + * Physical ID table that is programmed into the VMCB, i.e. that is 312 + * seen by the CPU. If IPI virtualization is disabled, IsRunning is 313 + * only ever set in the shadow, i.e. is never propagated to the "real" 314 + * table, so that hardware never sees IsRunning=1. 315 + */ 316 + u64 avic_physical_id_entry; 311 317 312 318 /* 313 - * Per-vcpu list of struct amd_svm_iommu_ir: 314 - * This is used mainly to store interrupt remapping information used 315 - * when update the vcpu affinity. This avoids the need to scan for 316 - * IRTE and try to match ga_tag in the IOMMU driver. 319 + * Per-vCPU list of irqfds that are eligible to post IRQs directly to 320 + * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list 321 + * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the 322 + * target pCPU), when AVIC is toggled on/off (to (de)activate bypass), 323 + * and if the irqfd becomes ineligible for posting (to put the IRTE 324 + * back into remapped mode). 
317 325 */ 318 326 struct list_head ir_list; 319 327 spinlock_t ir_list_lock; ··· 729 721 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ 730 722 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ 731 723 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \ 732 - BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \ 724 + BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \ 725 + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \ 733 726 ) 734 727 735 728 bool avic_hardware_setup(void); ··· 745 736 void avic_vcpu_put(struct kvm_vcpu *vcpu); 746 737 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); 747 738 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); 748 - int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 749 - uint32_t guest_irq, bool set); 739 + int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 740 + unsigned int host_irq, uint32_t guest_irq, 741 + struct kvm_vcpu *vcpu, u32 vector); 750 742 void avic_vcpu_blocking(struct kvm_vcpu *vcpu); 751 743 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); 752 744 void avic_ring_doorbell(struct kvm_vcpu *vcpu);

+87 -12

arch/x86/kvm/trace.h

··· 260 260 __entry->used_max_basic ? ", used max basic" : "") 261 261 ); 262 262 263 + #define kvm_deliver_mode \ 264 + {0x0, "Fixed"}, \ 265 + {0x1, "LowPrio"}, \ 266 + {0x2, "SMI"}, \ 267 + {0x3, "Res3"}, \ 268 + {0x4, "NMI"}, \ 269 + {0x5, "INIT"}, \ 270 + {0x6, "SIPI"}, \ 271 + {0x7, "ExtINT"} 272 + 273 + #ifdef CONFIG_KVM_IOAPIC 274 + TRACE_EVENT(kvm_ioapic_set_irq, 275 + TP_PROTO(__u64 e, int pin, bool coalesced), 276 + TP_ARGS(e, pin, coalesced), 277 + 278 + TP_STRUCT__entry( 279 + __field( __u64, e ) 280 + __field( int, pin ) 281 + __field( bool, coalesced ) 282 + ), 283 + 284 + TP_fast_assign( 285 + __entry->e = e; 286 + __entry->pin = pin; 287 + __entry->coalesced = coalesced; 288 + ), 289 + 290 + TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", 291 + __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, 292 + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), 293 + (__entry->e & (1<<11)) ? "logical" : "physical", 294 + (__entry->e & (1<<15)) ? "level" : "edge", 295 + (__entry->e & (1<<16)) ? "|masked" : "", 296 + __entry->coalesced ? " (coalesced)" : "") 297 + ); 298 + 299 + TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, 300 + TP_PROTO(__u64 e), 301 + TP_ARGS(e), 302 + 303 + TP_STRUCT__entry( 304 + __field( __u64, e ) 305 + ), 306 + 307 + TP_fast_assign( 308 + __entry->e = e; 309 + ), 310 + 311 + TP_printk("dst %x vec %u (%s|%s|%s%s)", 312 + (u8)(__entry->e >> 56), (u8)__entry->e, 313 + __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), 314 + (__entry->e & (1<<11)) ? "logical" : "physical", 315 + (__entry->e & (1<<15)) ? "level" : "edge", 316 + (__entry->e & (1<<16)) ? 
"|masked" : "") 317 + ); 318 + #endif 319 + 320 + TRACE_EVENT(kvm_msi_set_irq, 321 + TP_PROTO(__u64 address, __u64 data), 322 + TP_ARGS(address, data), 323 + 324 + TP_STRUCT__entry( 325 + __field( __u64, address ) 326 + __field( __u64, data ) 327 + ), 328 + 329 + TP_fast_assign( 330 + __entry->address = address; 331 + __entry->data = data; 332 + ), 333 + 334 + TP_printk("dst %llx vec %u (%s|%s|%s%s)", 335 + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), 336 + (u8)__entry->data, 337 + __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), 338 + (__entry->address & (1<<2)) ? "logical" : "physical", 339 + (__entry->data & (1<<15)) ? "level" : "edge", 340 + (__entry->address & (1<<3)) ? "|rh" : "") 341 + ); 342 + 263 343 #define AREG(x) { APIC_##x, "APIC_" #x } 264 344 265 345 #define kvm_trace_symbol_apic \ ··· 1176 1096 * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC. 1177 1097 */ 1178 1098 TRACE_EVENT(kvm_pi_irte_update, 1179 - TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, 1180 - unsigned int gsi, unsigned int gvec, 1181 - u64 pi_desc_addr, bool set), 1182 - TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), 1099 + TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu, 1100 + unsigned int gsi, unsigned int gvec, bool set), 1101 + TP_ARGS(host_irq, vcpu, gsi, gvec, set), 1183 1102 1184 1103 TP_STRUCT__entry( 1185 1104 __field( unsigned int, host_irq ) 1186 - __field( unsigned int, vcpu_id ) 1105 + __field( int, vcpu_id ) 1187 1106 __field( unsigned int, gsi ) 1188 1107 __field( unsigned int, gvec ) 1189 - __field( u64, pi_desc_addr ) 1190 1108 __field( bool, set ) 1191 1109 ), 1192 1110 1193 1111 TP_fast_assign( 1194 1112 __entry->host_irq = host_irq; 1195 - __entry->vcpu_id = vcpu_id; 1113 + __entry->vcpu_id = vcpu ? 
vcpu->vcpu_id : -1; 1196 1114 __entry->gsi = gsi; 1197 1115 __entry->gvec = gvec; 1198 - __entry->pi_desc_addr = pi_desc_addr; 1199 1116 __entry->set = set; 1200 1117 ), 1201 1118 1202 - TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, " 1203 - "gvec: 0x%x, pi_desc_addr: 0x%llx", 1119 + TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x", 1204 1120 __entry->set ? "enabled and being updated" : "disabled", 1205 1121 __entry->host_irq, 1206 1122 __entry->vcpu_id, 1207 1123 __entry->gsi, 1208 - __entry->gvec, 1209 - __entry->pi_desc_addr) 1124 + __entry->gvec) 1210 1125 ); 1211 1126 1212 1127 /*

-1

arch/x86/kvm/vmx/capabilities.h

··· 15 15 extern bool __read_mostly enable_unrestricted_guest; 16 16 extern bool __read_mostly enable_ept_ad_bits; 17 17 extern bool __read_mostly enable_pml; 18 - extern bool __read_mostly enable_ipiv; 19 18 extern int __read_mostly pt_mode; 20 19 21 20 #define PT_MODE_SYSTEM 0

+1 -1

arch/x86/kvm/vmx/main.c

··· 1014 1014 .nested_ops = &vmx_nested_ops, 1015 1015 1016 1016 .pi_update_irte = vmx_pi_update_irte, 1017 - .pi_start_assignment = vmx_pi_start_assignment, 1017 + .pi_start_bypass = vmx_pi_start_bypass, 1018 1018 1019 1019 #ifdef CONFIG_X86_64 1020 1020 .set_hv_timer = vt_op(set_hv_timer),

+39 -99

arch/x86/kvm/vmx/posted_intr.c

··· 2 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 3 4 4 #include <linux/kvm_host.h> 5 + #include <linux/kvm_irqfd.h> 5 6 6 7 #include <asm/irq_remapping.h> 7 8 #include <asm/cpu.h> ··· 73 72 /* 74 73 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the 75 74 * full update can be skipped as neither the vector nor the destination 76 - * needs to be changed. 75 + * needs to be changed. Clear SN even if there is no assigned device, 76 + * again for simplicity. 77 77 */ 78 78 if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { 79 - /* 80 - * Clear SN if it was set due to being preempted. Again, do 81 - * this even if there is no assigned device for simplicity. 82 - */ 83 79 if (pi_test_and_clear_sn(pi_desc)) 84 80 goto after_clear_sn; 85 81 return; ··· 146 148 147 149 static bool vmx_can_use_vtd_pi(struct kvm *kvm) 148 150 { 151 + /* 152 + * Note, reading the number of possible bypass IRQs can race with a 153 + * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures 154 + * blocking vCPUs will see an elevated count or get KVM_REQ_UNBLOCK. 155 + */ 149 156 return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && 150 - kvm_arch_has_assigned_device(kvm); 157 + READ_ONCE(kvm->arch.nr_possible_bypass_irqs); 151 158 } 152 159 153 160 /* ··· 227 224 if (!vmx_needs_pi_wakeup(vcpu)) 228 225 return; 229 226 230 - if (kvm_vcpu_is_blocking(vcpu) && 227 + /* 228 + * If the vCPU is blocking with IRQs enabled and ISN'T being preempted, 229 + * enable the wakeup handler so that notification IRQ wakes the vCPU as 230 + * expected. There is no need to enable the wakeup handler if the vCPU 231 + * is preempted between setting its wait state and manually scheduling 232 + * out, as the task is still runnable, i.e. doesn't need a wake event 233 + * from KVM to be scheduled in. 
234 + * 235 + * If the wakeup handler isn't being enabled, Suppress Notifications as 236 + * the cost of propagating PIR.IRR to PID.ON is negligible compared to 237 + * the cost of a spurious IRQ, and vCPU put/load is a slow path. 238 + */ 239 + if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && 231 240 ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || 232 241 (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) 233 242 pi_enable_wakeup_handler(vcpu); 234 - 235 - /* 236 - * Set SN when the vCPU is preempted. Note, the vCPU can both be seen 237 - * as blocking and preempted, e.g. if it's preempted between setting 238 - * its wait state and manually scheduling out. 239 - */ 240 - if (vcpu->preempted) 243 + else 241 244 pi_set_sn(pi_desc); 242 245 } 243 246 ··· 290 281 291 282 292 283 /* 293 - * Bail out of the block loop if the VM has an assigned 294 - * device, but the blocking vCPU didn't reconfigure the 295 - * PI.NV to the wakeup vector, i.e. the assigned device 296 - * came along after the initial check in vmx_vcpu_pi_put(). 284 + * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as 285 + * blocking vCPUs may be scheduled out without reconfiguring PID.NV to the wakeup 286 + * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put(). 
297 287 */ 298 - void vmx_pi_start_assignment(struct kvm *kvm) 288 + void vmx_pi_start_bypass(struct kvm *kvm) 299 289 { 300 - if (!kvm_arch_has_irq_bypass()) 290 + if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm))) 301 291 return; 302 292 303 293 kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); 304 294 } 305 295 306 - /* 307 - * vmx_pi_update_irte - set IRTE for Posted-Interrupts 308 - * 309 - * @kvm: kvm 310 - * @host_irq: host irq of the interrupt 311 - * @guest_irq: gsi of the interrupt 312 - * @set: set or unset PI 313 - * returns 0 on success, < 0 on failure 314 - */ 315 - int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 316 - uint32_t guest_irq, bool set) 296 + int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 297 + unsigned int host_irq, uint32_t guest_irq, 298 + struct kvm_vcpu *vcpu, u32 vector) 317 299 { 318 - struct kvm_kernel_irq_routing_entry *e; 319 - struct kvm_irq_routing_table *irq_rt; 320 - bool enable_remapped_mode = true; 321 - struct kvm_lapic_irq irq; 322 - struct kvm_vcpu *vcpu; 323 - struct vcpu_data vcpu_info; 324 - int idx, ret = 0; 300 + if (vcpu) { 301 + struct intel_iommu_pi_data pi_data = { 302 + .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), 303 + .vector = vector, 304 + }; 325 305 326 - if (!vmx_can_use_vtd_pi(kvm)) 327 - return 0; 328 - 329 - idx = srcu_read_lock(&kvm->irq_srcu); 330 - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 331 - if (guest_irq >= irq_rt->nr_rt_entries || 332 - hlist_empty(&irq_rt->map[guest_irq])) { 333 - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 334 - guest_irq, irq_rt->nr_rt_entries); 335 - goto out; 306 + return irq_set_vcpu_affinity(host_irq, &pi_data); 307 + } else { 308 + return irq_set_vcpu_affinity(host_irq, NULL); 336 309 } 337 - 338 - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 339 - if (e->type != KVM_IRQ_ROUTING_MSI) 340 - continue; 341 - /* 342 - * VT-d PI cannot support posting multicast/broadcast 343 - * 
interrupts to a vCPU, we still use interrupt remapping 344 - * for these kind of interrupts. 345 - * 346 - * For lowest-priority interrupts, we only support 347 - * those with single CPU as the destination, e.g. user 348 - * configures the interrupts via /proc/irq or uses 349 - * irqbalance to make the interrupts single-CPU. 350 - * 351 - * We will support full lowest-priority interrupt later. 352 - * 353 - * In addition, we can only inject generic interrupts using 354 - * the PI mechanism, refuse to route others through it. 355 - */ 356 - 357 - kvm_set_msi_irq(kvm, e, &irq); 358 - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 359 - !kvm_irq_is_postable(&irq)) 360 - continue; 361 - 362 - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); 363 - vcpu_info.vector = irq.vector; 364 - 365 - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, 366 - vcpu_info.vector, vcpu_info.pi_desc_addr, set); 367 - 368 - if (!set) 369 - continue; 370 - 371 - enable_remapped_mode = false; 372 - 373 - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 374 - if (ret < 0) { 375 - printk(KERN_INFO "%s: failed to update PI IRTE\n", 376 - __func__); 377 - goto out; 378 - } 379 - } 380 - 381 - if (enable_remapped_mode) 382 - ret = irq_set_vcpu_affinity(host_irq, NULL); 383 - 384 - ret = 0; 385 - out: 386 - srcu_read_unlock(&kvm->irq_srcu, idx); 387 - return ret; 388 310 }

+7 -3

arch/x86/kvm/vmx/posted_intr.h

··· 3 3 #define __KVM_X86_VMX_POSTED_INTR_H 4 4 5 5 #include <linux/bitmap.h> 6 + #include <linux/find.h> 7 + #include <linux/kvm_host.h> 8 + 6 9 #include <asm/posted_intr.h> 7 10 8 11 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); ··· 14 11 void __init pi_init_cpu(int cpu); 15 12 void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); 16 13 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); 17 - int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, 18 - uint32_t guest_irq, bool set); 19 - void vmx_pi_start_assignment(struct kvm *kvm); 14 + int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, 15 + unsigned int host_irq, uint32_t guest_irq, 16 + struct kvm_vcpu *vcpu, u32 vector); 17 + void vmx_pi_start_bypass(struct kvm *kvm); 20 18 21 19 static inline int pi_find_highest_vector(struct pi_desc *pi_desc) 22 20 {

-2

arch/x86/kvm/vmx/vmx.c

··· 113 113 module_param(fasteoi, bool, 0444); 114 114 115 115 module_param(enable_apicv, bool, 0444); 116 - 117 - bool __read_mostly enable_ipiv = true; 118 116 module_param(enable_ipiv, bool, 0444); 119 117 120 118 module_param(enable_device_posted_irqs, bool, 0444);

+25 -229

arch/x86/kvm/x86.c

··· 226 226 bool __read_mostly enable_apicv = true; 227 227 EXPORT_SYMBOL_GPL(enable_apicv); 228 228 229 + bool __read_mostly enable_ipiv = true; 230 + EXPORT_SYMBOL_GPL(enable_ipiv); 231 + 229 232 bool __read_mostly enable_device_posted_irqs = true; 230 233 EXPORT_SYMBOL_GPL(enable_device_posted_irqs); 231 234 ··· 4637 4634 case KVM_CAP_EXT_CPUID: 4638 4635 case KVM_CAP_EXT_EMUL_CPUID: 4639 4636 case KVM_CAP_CLOCKSOURCE: 4637 + #ifdef CONFIG_KVM_IOAPIC 4640 4638 case KVM_CAP_PIT: 4639 + case KVM_CAP_PIT2: 4640 + case KVM_CAP_PIT_STATE2: 4641 + case KVM_CAP_REINJECT_CONTROL: 4642 + #endif 4641 4643 case KVM_CAP_NOP_IO_DELAY: 4642 4644 case KVM_CAP_MP_STATE: 4643 4645 case KVM_CAP_SYNC_MMU: 4644 4646 case KVM_CAP_USER_NMI: 4645 - case KVM_CAP_REINJECT_CONTROL: 4646 4647 case KVM_CAP_IRQ_INJECT_STATUS: 4647 4648 case KVM_CAP_IOEVENTFD: 4648 4649 case KVM_CAP_IOEVENTFD_NO_LENGTH: 4649 - case KVM_CAP_PIT2: 4650 - case KVM_CAP_PIT_STATE2: 4650 + 4651 4651 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 4652 4652 case KVM_CAP_VCPU_EVENTS: 4653 4653 #ifdef CONFIG_KVM_HYPERV ··· 6407 6401 return 0; 6408 6402 } 6409 6403 6410 - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 6411 - { 6412 - struct kvm_pic *pic = kvm->arch.vpic; 6413 - int r; 6414 - 6415 - r = 0; 6416 - switch (chip->chip_id) { 6417 - case KVM_IRQCHIP_PIC_MASTER: 6418 - memcpy(&chip->chip.pic, &pic->pics[0], 6419 - sizeof(struct kvm_pic_state)); 6420 - break; 6421 - case KVM_IRQCHIP_PIC_SLAVE: 6422 - memcpy(&chip->chip.pic, &pic->pics[1], 6423 - sizeof(struct kvm_pic_state)); 6424 - break; 6425 - case KVM_IRQCHIP_IOAPIC: 6426 - kvm_get_ioapic(kvm, &chip->chip.ioapic); 6427 - break; 6428 - default: 6429 - r = -EINVAL; 6430 - break; 6431 - } 6432 - return r; 6433 - } 6434 - 6435 - static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 6436 - { 6437 - struct kvm_pic *pic = kvm->arch.vpic; 6438 - int r; 6439 - 6440 - r = 0; 6441 - switch (chip->chip_id) { 6442 - case 
KVM_IRQCHIP_PIC_MASTER: 6443 - spin_lock(&pic->lock); 6444 - memcpy(&pic->pics[0], &chip->chip.pic, 6445 - sizeof(struct kvm_pic_state)); 6446 - spin_unlock(&pic->lock); 6447 - break; 6448 - case KVM_IRQCHIP_PIC_SLAVE: 6449 - spin_lock(&pic->lock); 6450 - memcpy(&pic->pics[1], &chip->chip.pic, 6451 - sizeof(struct kvm_pic_state)); 6452 - spin_unlock(&pic->lock); 6453 - break; 6454 - case KVM_IRQCHIP_IOAPIC: 6455 - kvm_set_ioapic(kvm, &chip->chip.ioapic); 6456 - break; 6457 - default: 6458 - r = -EINVAL; 6459 - break; 6460 - } 6461 - kvm_pic_update_irq(pic); 6462 - return r; 6463 - } 6464 - 6465 - static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) 6466 - { 6467 - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; 6468 - 6469 - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); 6470 - 6471 - mutex_lock(&kps->lock); 6472 - memcpy(ps, &kps->channels, sizeof(*ps)); 6473 - mutex_unlock(&kps->lock); 6474 - return 0; 6475 - } 6476 - 6477 - static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) 6478 - { 6479 - int i; 6480 - struct kvm_pit *pit = kvm->arch.vpit; 6481 - 6482 - mutex_lock(&pit->pit_state.lock); 6483 - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); 6484 - for (i = 0; i < 3; i++) 6485 - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); 6486 - mutex_unlock(&pit->pit_state.lock); 6487 - return 0; 6488 - } 6489 - 6490 - static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 6491 - { 6492 - mutex_lock(&kvm->arch.vpit->pit_state.lock); 6493 - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, 6494 - sizeof(ps->channels)); 6495 - ps->flags = kvm->arch.vpit->pit_state.flags; 6496 - mutex_unlock(&kvm->arch.vpit->pit_state.lock); 6497 - memset(&ps->reserved, 0, sizeof(ps->reserved)); 6498 - return 0; 6499 - } 6500 - 6501 - static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) 6502 - { 6503 - int start = 0; 6504 - int i; 6505 - u32 prev_legacy, cur_legacy; 6506 - 
struct kvm_pit *pit = kvm->arch.vpit; 6507 - 6508 - mutex_lock(&pit->pit_state.lock); 6509 - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; 6510 - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; 6511 - if (!prev_legacy && cur_legacy) 6512 - start = 1; 6513 - memcpy(&pit->pit_state.channels, &ps->channels, 6514 - sizeof(pit->pit_state.channels)); 6515 - pit->pit_state.flags = ps->flags; 6516 - for (i = 0; i < 3; i++) 6517 - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, 6518 - start && i == 0); 6519 - mutex_unlock(&pit->pit_state.lock); 6520 - return 0; 6521 - } 6522 - 6523 - static int kvm_vm_ioctl_reinject(struct kvm *kvm, 6524 - struct kvm_reinject_control *control) 6525 - { 6526 - struct kvm_pit *pit = kvm->arch.vpit; 6527 - 6528 - /* pit->pit_state.lock was overloaded to prevent userspace from getting 6529 - * an inconsistent state after running multiple KVM_REINJECT_CONTROL 6530 - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. 
6531 - */ 6532 - mutex_lock(&pit->pit_state.lock); 6533 - kvm_pit_set_reinject(pit, control->pit_reinject); 6534 - mutex_unlock(&pit->pit_state.lock); 6535 - 6536 - return 0; 6537 - } 6538 - 6539 6404 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 6540 6405 { 6541 6406 ··· 6424 6547 6425 6548 kvm_for_each_vcpu(i, vcpu, kvm) 6426 6549 kvm_vcpu_kick(vcpu); 6427 - } 6428 - 6429 - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, 6430 - bool line_status) 6431 - { 6432 - if (!irqchip_in_kernel(kvm)) 6433 - return -ENXIO; 6434 - 6435 - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 6436 - irq_event->irq, irq_event->level, 6437 - line_status); 6438 - return 0; 6439 6550 } 6440 6551 6441 6552 int kvm_vm_ioctl_enable_cap(struct kvm *kvm, ··· 6937 7072 struct kvm *kvm = filp->private_data; 6938 7073 void __user *argp = (void __user *)arg; 6939 7074 int r = -ENOTTY; 7075 + 7076 + #ifdef CONFIG_KVM_IOAPIC 6940 7077 /* 6941 7078 * This union makes it completely explicit to gcc-3.x 6942 - * that these two variables' stack usage should be 7079 + * that these three variables' stack usage should be 6943 7080 * combined, not added together. 
6944 7081 */ 6945 7082 union { ··· 6949 7082 struct kvm_pit_state2 ps2; 6950 7083 struct kvm_pit_config pit_config; 6951 7084 } u; 7085 + #endif 6952 7086 6953 7087 switch (ioctl) { 6954 7088 case KVM_SET_TSS_ADDR: ··· 6973 7105 case KVM_SET_NR_MMU_PAGES: 6974 7106 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 6975 7107 break; 7108 + #ifdef CONFIG_KVM_IOAPIC 6976 7109 case KVM_CREATE_IRQCHIP: { 6977 7110 mutex_lock(&kvm->lock); 6978 7111 ··· 6995 7126 goto create_irqchip_unlock; 6996 7127 } 6997 7128 6998 - r = kvm_setup_default_irq_routing(kvm); 7129 + r = kvm_setup_default_ioapic_and_pic_routing(kvm); 6999 7130 if (r) { 7000 7131 kvm_ioapic_destroy(kvm); 7001 7132 kvm_pic_destroy(kvm); ··· 7043 7174 } 7044 7175 7045 7176 r = -ENXIO; 7046 - if (!irqchip_kernel(kvm)) 7177 + if (!irqchip_full(kvm)) 7047 7178 goto get_irqchip_out; 7048 7179 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 7049 7180 if (r) ··· 7067 7198 } 7068 7199 7069 7200 r = -ENXIO; 7070 - if (!irqchip_kernel(kvm)) 7201 + if (!irqchip_full(kvm)) 7071 7202 goto set_irqchip_out; 7072 7203 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 7073 7204 set_irqchip_out: ··· 7140 7271 r = kvm_vm_ioctl_reinject(kvm, &control); 7141 7272 break; 7142 7273 } 7274 + #endif 7143 7275 case KVM_SET_BOOT_CPU_ID: 7144 7276 r = 0; 7145 7277 mutex_lock(&kvm->lock); ··· 10600 10730 10601 10731 if (irqchip_split(vcpu->kvm)) 10602 10732 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); 10733 + #ifdef CONFIG_KVM_IOAPIC 10603 10734 else if (ioapic_in_kernel(vcpu->kvm)) 10604 10735 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); 10736 + #endif 10605 10737 10606 10738 if (is_guest_mode(vcpu)) 10607 10739 vcpu->arch.load_eoi_exitmap_pending = true; ··· 12673 12801 if (ret) 12674 12802 goto out_uninit_mmu; 12675 12803 12676 - INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); 12677 12804 atomic_set(&kvm->arch.noncoherent_dma_count, 0); 12678 - 12679 - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 
12680 - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 12681 - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ 12682 - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, 12683 - &kvm->arch.irq_sources_bitmap); 12684 12805 12685 12806 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 12686 12807 mutex_init(&kvm->arch.apic_map_lock); ··· 12805 12940 cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); 12806 12941 cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); 12807 12942 12943 + #ifdef CONFIG_KVM_IOAPIC 12808 12944 kvm_free_pit(kvm); 12945 + #endif 12809 12946 12810 12947 kvm_mmu_pre_destroy_vm(kvm); 12811 12948 static_call_cond(kvm_x86_vm_pre_destroy)(kvm); ··· 12831 12964 } 12832 12965 kvm_destroy_vcpus(kvm); 12833 12966 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); 12967 + #ifdef CONFIG_KVM_IOAPIC 12834 12968 kvm_pic_destroy(kvm); 12835 12969 kvm_ioapic_destroy(kvm); 12970 + #endif 12836 12971 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 12837 12972 kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); 12838 12973 kvm_mmu_uninit_vm(kvm); ··· 13446 13577 13447 13578 void kvm_arch_start_assignment(struct kvm *kvm) 13448 13579 { 13449 - if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) 13450 - kvm_x86_call(pi_start_assignment)(kvm); 13580 + atomic_inc(&kvm->arch.assigned_device_count); 13451 13581 } 13452 13582 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); 13453 13583 ··· 13496 13628 return atomic_read(&kvm->arch.noncoherent_dma_count); 13497 13629 } 13498 13630 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); 13499 - 13500 - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, 13501 - struct irq_bypass_producer *prod) 13502 - { 13503 - struct kvm_kernel_irqfd *irqfd = 13504 - container_of(cons, struct kvm_kernel_irqfd, consumer); 13505 - struct kvm *kvm = irqfd->kvm; 13506 - int ret; 13507 - 13508 - 
kvm_arch_start_assignment(irqfd->kvm); 13509 - 13510 - spin_lock_irq(&kvm->irqfds.lock); 13511 - irqfd->producer = prod; 13512 - 13513 - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, 13514 - prod->irq, irqfd->gsi, 1); 13515 - if (ret) 13516 - kvm_arch_end_assignment(irqfd->kvm); 13517 - 13518 - spin_unlock_irq(&kvm->irqfds.lock); 13519 - 13520 - 13521 - return ret; 13522 - } 13523 - 13524 - void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, 13525 - struct irq_bypass_producer *prod) 13526 - { 13527 - int ret; 13528 - struct kvm_kernel_irqfd *irqfd = 13529 - container_of(cons, struct kvm_kernel_irqfd, consumer); 13530 - struct kvm *kvm = irqfd->kvm; 13531 - 13532 - WARN_ON(irqfd->producer != prod); 13533 - 13534 - /* 13535 - * When producer of consumer is unregistered, we change back to 13536 - * remapped mode, so we can re-use the current implementation 13537 - * when the irq is masked/disabled or the consumer side (KVM 13538 - * int this case doesn't want to receive the interrupts. 
13539 - */ 13540 - spin_lock_irq(&kvm->irqfds.lock); 13541 - irqfd->producer = NULL; 13542 - 13543 - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, 13544 - prod->irq, irqfd->gsi, 0); 13545 - if (ret) 13546 - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" 13547 - " fails: %d\n", irqfd->consumer.token, ret); 13548 - 13549 - spin_unlock_irq(&kvm->irqfds.lock); 13550 - 13551 - 13552 - kvm_arch_end_assignment(irqfd->kvm); 13553 - } 13554 - 13555 - int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, 13556 - uint32_t guest_irq, bool set) 13557 - { 13558 - return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); 13559 - } 13560 - 13561 - bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, 13562 - struct kvm_kernel_irq_routing_entry *new) 13563 - { 13564 - if (old->type != KVM_IRQ_ROUTING_MSI || 13565 - new->type != KVM_IRQ_ROUTING_MSI) 13566 - return true; 13567 - 13568 - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); 13569 - } 13570 13631 13571 13632 bool kvm_vector_hashing_enabled(void) 13572 13633 { ··· 13896 14099 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); 13897 14100 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); 13898 14101 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); 13899 - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); 13900 14102 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); 13901 14103 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); 13902 14104 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);

+8

drivers/hv/mshv_eventfd.c

··· 368 368 container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); 369 369 370 370 irqfd->irqfd_wqh = wqh; 371 + 372 + /* 373 + * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. 374 + * that the irqfd isn't already bound to another partition. Only the 375 + * first exclusive waiter encountered will be notified, and 376 + * add_wait_queue_priority() doesn't enforce exclusivity. 377 + */ 378 + irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE; 371 379 add_wait_queue_priority(wqh, &irqfd->irqfd_wait); 372 380 } 373 381

-1

drivers/iommu/amd/amd_iommu_types.h

··· 1054 1054 }; 1055 1055 1056 1056 struct amd_ir_data { 1057 - u32 cached_ga_tag; 1058 1057 struct amd_iommu *iommu; 1059 1058 struct irq_2_irte irq_2_irte; 1060 1059 struct msi_msg msi_entry;

+77 -48

drivers/iommu/amd/iommu.c

··· 3804 3804 .deactivate = irq_remapping_deactivate, 3805 3805 }; 3806 3806 3807 - int amd_iommu_activate_guest_mode(void *data) 3807 + static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu, 3808 + bool ga_log_intr) 3809 + { 3810 + if (cpu >= 0) { 3811 + entry->lo.fields_vapic.destination = 3812 + APICID_TO_IRTE_DEST_LO(cpu); 3813 + entry->hi.fields.destination = 3814 + APICID_TO_IRTE_DEST_HI(cpu); 3815 + entry->lo.fields_vapic.is_run = true; 3816 + entry->lo.fields_vapic.ga_log_intr = false; 3817 + } else { 3818 + entry->lo.fields_vapic.is_run = false; 3819 + entry->lo.fields_vapic.ga_log_intr = ga_log_intr; 3820 + } 3821 + } 3822 + 3823 + /* 3824 + * Update the pCPU information for an IRTE that is configured to post IRQs to 3825 + * a vCPU, without issuing an IOMMU invalidation for the IRTE. 3826 + * 3827 + * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination 3828 + * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't 3829 + * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based 3830 + * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is 3831 + * blocking and requires a notification wake event). I.e. treat vCPUs that are 3832 + * associated with a pCPU as running. This API is intended to be used when a 3833 + * vCPU is scheduled in/out (or stops running for any reason), to do a fast 3834 + * update of IsRun, GALogIntr, and (conditionally) Destination. 3835 + * 3836 + * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached 3837 + * and thus don't require an invalidation to ensure the IOMMU consumes fresh 3838 + * information. 
3839 + */ 3840 + int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 3841 + { 3842 + struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3843 + struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3844 + 3845 + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3846 + return -EINVAL; 3847 + 3848 + if (!entry || !entry->lo.fields_vapic.guest_mode) 3849 + return 0; 3850 + 3851 + if (!ir_data->iommu) 3852 + return -ENODEV; 3853 + 3854 + __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3855 + 3856 + return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3857 + ir_data->irq_2_irte.index, entry); 3858 + } 3859 + EXPORT_SYMBOL(amd_iommu_update_ga); 3860 + 3861 + int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 3808 3862 { 3809 3863 struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 3810 3864 struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 3811 3865 u64 valid; 3812 3866 3813 - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry) 3867 + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3868 + return -EINVAL; 3869 + 3870 + if (!entry) 3814 3871 return 0; 3815 3872 3816 3873 valid = entry->lo.fields_vapic.valid; ··· 3877 3820 3878 3821 entry->lo.fields_vapic.valid = valid; 3879 3822 entry->lo.fields_vapic.guest_mode = 1; 3880 - entry->lo.fields_vapic.ga_log_intr = 1; 3881 3823 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr; 3882 3824 entry->hi.fields.vector = ir_data->ga_vector; 3883 3825 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag; 3826 + 3827 + __amd_iommu_update_ga(entry, cpu, ga_log_intr); 3884 3828 3885 3829 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 3886 3830 ir_data->irq_2_irte.index, entry); ··· 3895 3837 struct irq_cfg *cfg = ir_data->cfg; 3896 3838 u64 valid; 3897 3839 3898 - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || 3899 - !entry || !entry->lo.fields_vapic.guest_mode) 3840 + if 
(WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) 3841 + return -EINVAL; 3842 + 3843 + if (!entry || !entry->lo.fields_vapic.guest_mode) 3900 3844 return 0; 3901 3845 3902 3846 valid = entry->lo.fields_remap.valid; ··· 3920 3860 } 3921 3861 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode); 3922 3862 3923 - static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) 3863 + static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info) 3924 3864 { 3925 3865 int ret; 3926 - struct amd_iommu_pi_data *pi_data = vcpu_info; 3927 - struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; 3866 + struct amd_iommu_pi_data *pi_data = info; 3928 3867 struct amd_ir_data *ir_data = data->chip_data; 3929 3868 struct irq_2_irte *irte_info = &ir_data->irq_2_irte; 3930 3869 struct iommu_dev_data *dev_data; ··· 3944 3885 return -EINVAL; 3945 3886 3946 3887 ir_data->cfg = irqd_cfg(data); 3947 - pi_data->ir_data = ir_data; 3948 3888 3949 - pi_data->prev_ga_tag = ir_data->cached_ga_tag; 3950 - if (pi_data->is_guest_mode) { 3951 - ir_data->ga_root_ptr = (pi_data->base >> 12); 3952 - ir_data->ga_vector = vcpu_pi_info->vector; 3889 + if (pi_data) { 3890 + pi_data->ir_data = ir_data; 3891 + 3892 + ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12); 3893 + ir_data->ga_vector = pi_data->vector; 3953 3894 ir_data->ga_tag = pi_data->ga_tag; 3954 - ret = amd_iommu_activate_guest_mode(ir_data); 3955 - if (!ret) 3956 - ir_data->cached_ga_tag = pi_data->ga_tag; 3895 + if (pi_data->is_guest_mode) 3896 + ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu, 3897 + pi_data->ga_log_intr); 3898 + else 3899 + ret = amd_iommu_deactivate_guest_mode(ir_data); 3957 3900 } else { 3958 3901 ret = amd_iommu_deactivate_guest_mode(ir_data); 3959 - 3960 - /* 3961 - * This communicates the ga_tag back to the caller 3962 - * so that it can do all the necessary clean up. 
3963 - */ 3964 - if (!ret) 3965 - ir_data->cached_ga_tag = 0; 3966 3902 } 3967 3903 3968 3904 return ret; ··· 4049 3995 4050 3996 return 0; 4051 3997 } 4052 - 4053 - int amd_iommu_update_ga(int cpu, bool is_run, void *data) 4054 - { 4055 - struct amd_ir_data *ir_data = (struct amd_ir_data *)data; 4056 - struct irte_ga *entry = (struct irte_ga *) ir_data->entry; 4057 - 4058 - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || 4059 - !entry || !entry->lo.fields_vapic.guest_mode) 4060 - return 0; 4061 - 4062 - if (!ir_data->iommu) 4063 - return -ENODEV; 4064 - 4065 - if (cpu >= 0) { 4066 - entry->lo.fields_vapic.destination = 4067 - APICID_TO_IRTE_DEST_LO(cpu); 4068 - entry->hi.fields.destination = 4069 - APICID_TO_IRTE_DEST_HI(cpu); 4070 - } 4071 - entry->lo.fields_vapic.is_run = is_run; 4072 - 4073 - return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid, 4074 - ir_data->irq_2_irte.index, entry); 4075 - } 4076 - EXPORT_SYMBOL(amd_iommu_update_ga); 4077 3998 #endif

+5 -5

drivers/iommu/intel/irq_remapping.c

··· 1244 1244 static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) 1245 1245 { 1246 1246 struct intel_ir_data *ir_data = data->chip_data; 1247 - struct vcpu_data *vcpu_pi_info = info; 1247 + struct intel_iommu_pi_data *pi_data = info; 1248 1248 1249 1249 /* stop posting interrupts, back to the default mode */ 1250 - if (!vcpu_pi_info) { 1250 + if (!pi_data) { 1251 1251 __intel_ir_reconfigure_irte(data, true); 1252 1252 } else { 1253 1253 struct irte irte_pi; ··· 1265 1265 /* Update the posted mode fields */ 1266 1266 irte_pi.p_pst = 1; 1267 1267 irte_pi.p_urgent = 0; 1268 - irte_pi.p_vector = vcpu_pi_info->vector; 1269 - irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> 1268 + irte_pi.p_vector = pi_data->vector; 1269 + irte_pi.pda_l = (pi_data->pi_desc_addr >> 1270 1270 (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); 1271 - irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & 1271 + irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) & 1272 1272 ~(-1UL << PDA_HIGH_BIT); 1273 1273 1274 1274 ir_data->irq_2_iommu.posted_vcpu = true;

+2 -2

drivers/irqchip/irq-gic-v4.c

··· 342 342 return irq_set_vcpu_affinity(irq, &info); 343 343 } 344 344 345 - int its_unmap_vlpi(int irq) 345 + void its_unmap_vlpi(int irq) 346 346 { 347 347 irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY); 348 - return irq_set_vcpu_affinity(irq, NULL); 348 + WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL)); 349 349 } 350 350 351 351 int its_prop_update_vlpi(int irq, u8 config, bool inv)

+3 -7

drivers/vfio/pci/vfio_pci_intrs.c

··· 505 505 if (ret) 506 506 goto out_put_eventfd_ctx; 507 507 508 - ctx->producer.token = trigger; 509 - ctx->producer.irq = irq; 510 - ret = irq_bypass_register_producer(&ctx->producer); 508 + ret = irq_bypass_register_producer(&ctx->producer, trigger, irq); 511 509 if (unlikely(ret)) { 512 510 dev_info(&pdev->dev, 513 - "irq bypass producer (token %p) registration fails: %d\n", 514 - ctx->producer.token, ret); 515 - 516 - ctx->producer.token = NULL; 511 + "irq bypass producer (eventfd %p) registration fails: %d\n", 512 + trigger, ret); 517 513 } 518 514 ctx->trigger = trigger; 519 515

+4 -6

drivers/vhost/vdpa.c

··· 212 212 if (!vq->call_ctx.ctx) 213 213 return; 214 214 215 - vq->call_ctx.producer.irq = irq; 216 - ret = irq_bypass_register_producer(&vq->call_ctx.producer); 215 + ret = irq_bypass_register_producer(&vq->call_ctx.producer, 216 + vq->call_ctx.ctx, irq); 217 217 if (unlikely(ret)) 218 - dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n", 219 - qid, vq->call_ctx.producer.token, ret); 218 + dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n", 219 + qid, vq->call_ctx.ctx, ret); 220 220 } 221 221 222 222 static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid) ··· 712 712 if (ops->get_status(vdpa) & 713 713 VIRTIO_CONFIG_S_DRIVER_OK) 714 714 vhost_vdpa_unsetup_vq_irq(v, idx); 715 - vq->call_ctx.producer.token = NULL; 716 715 } 717 716 break; 718 717 } ··· 752 753 cb.callback = vhost_vdpa_virtqueue_cb; 753 754 cb.private = vq; 754 755 cb.trigger = vq->call_ctx.ctx; 755 - vq->call_ctx.producer.token = vq->call_ctx.ctx; 756 756 if (ops->get_status(vdpa) & 757 757 VIRTIO_CONFIG_S_DRIVER_OK) 758 758 vhost_vdpa_setup_vq_irq(v, idx);

+1 -1

include/kvm/arm_vgic.h

··· 434 434 int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, 435 435 struct kvm_kernel_irq_routing_entry *irq_entry); 436 436 437 - int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); 437 + void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq); 438 438 439 439 int vgic_v4_load(struct kvm_vcpu *vcpu); 440 440 void vgic_v4_commit(struct kvm_vcpu *vcpu);

+4 -21

include/linux/amd-iommu.h

··· 12 12 13 13 struct amd_iommu; 14 14 15 - /* 16 - * This is mainly used to communicate information back-and-forth 17 - * between SVM and IOMMU for setting up and tearing down posted 18 - * interrupt 19 - */ 20 - struct amd_iommu_pi_data { 21 - u32 ga_tag; 22 - u32 prev_ga_tag; 23 - u64 base; 24 - bool is_guest_mode; 25 - struct vcpu_data *vcpu_data; 26 - void *ir_data; 27 - }; 28 - 29 15 #ifdef CONFIG_AMD_IOMMU 30 16 31 17 struct task_struct; ··· 30 44 /* IOMMU AVIC Function */ 31 45 extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); 32 46 33 - extern int 34 - amd_iommu_update_ga(int cpu, bool is_run, void *data); 35 - 36 - extern int amd_iommu_activate_guest_mode(void *data); 47 + extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr); 48 + extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr); 37 49 extern int amd_iommu_deactivate_guest_mode(void *data); 38 50 39 51 #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ ··· 42 58 return 0; 43 59 } 44 60 45 - static inline int 46 - amd_iommu_update_ga(int cpu, bool is_run, void *data) 61 + static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr) 47 62 { 48 63 return 0; 49 64 } 50 65 51 - static inline int amd_iommu_activate_guest_mode(void *data) 66 + static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr) 52 67 { 53 68 return 0; 54 69 }

+26 -20

include/linux/irqbypass.h

··· 10 10 11 11 #include <linux/list.h> 12 12 13 + struct eventfd_ctx; 13 14 struct irq_bypass_consumer; 14 15 15 16 /* ··· 19 18 * The IRQ bypass manager is a simple set of lists and callbacks that allows 20 19 * IRQ producers (ex. physical interrupt sources) to be matched to IRQ 21 20 * consumers (ex. virtualization hardware that allows IRQ bypass or offload) 22 - * via a shared token (ex. eventfd_ctx). Producers and consumers register 23 - * independently. When a token match is found, the optional @stop callback 24 - * will be called for each participant. The pair will then be connected via 25 - * the @add_* callbacks, and finally the optional @start callback will allow 26 - * any final coordination. When either participant is unregistered, the 27 - * process is repeated using the @del_* callbacks in place of the @add_* 28 - * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings 29 - * are not supported. 21 + * via a shared eventfd_ctx. Producers and consumers register independently. 22 + * When a producer and consumer are paired, i.e. an eventfd match is found, the 23 + * optional @stop callback will be called for each participant. The pair will 24 + * then be connected via the @add_* callbacks, and finally the optional @start 25 + * callback will allow any final coordination. When either participant is 26 + * unregistered, the process is repeated using the @del_* callbacks in place of 27 + * the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N 28 + * pairings are not supported. 
30 29 */ 30 + 31 + struct irq_bypass_consumer; 31 32 32 33 /** 33 34 * struct irq_bypass_producer - IRQ bypass producer definition 34 - * @node: IRQ bypass manager private list management 35 - * @token: opaque token to match between producer and consumer (non-NULL) 35 + * @eventfd: eventfd context used to match producers and consumers 36 + * @consumer: The connected consumer (NULL if no connection) 36 37 * @irq: Linux IRQ number for the producer device 37 38 * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional) 38 39 * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional) ··· 46 43 * for a physical device assigned to a VM. 47 44 */ 48 45 struct irq_bypass_producer { 49 - struct list_head node; 50 - void *token; 46 + struct eventfd_ctx *eventfd; 47 + struct irq_bypass_consumer *consumer; 51 48 int irq; 52 49 int (*add_consumer)(struct irq_bypass_producer *, 53 50 struct irq_bypass_consumer *); ··· 59 56 60 57 /** 61 58 * struct irq_bypass_consumer - IRQ bypass consumer definition 62 - * @node: IRQ bypass manager private list management 63 - * @token: opaque token to match between producer and consumer (non-NULL) 59 + * @eventfd: eventfd context used to match producers and consumers 60 + * @producer: The connected producer (NULL if no connection) 64 61 * @add_producer: Connect the IRQ consumer to an IRQ producer 65 62 * @del_producer: Disconnect the IRQ consumer from an IRQ producer 66 63 * @stop: Perform any quiesce operations necessary prior to add/del (optional) ··· 72 69 * portions of the interrupt handling to the VM. 
73 70 */ 74 71 struct irq_bypass_consumer { 75 - struct list_head node; 76 - void *token; 72 + struct eventfd_ctx *eventfd; 73 + struct irq_bypass_producer *producer; 74 + 77 75 int (*add_producer)(struct irq_bypass_consumer *, 78 76 struct irq_bypass_producer *); 79 77 void (*del_producer)(struct irq_bypass_consumer *, ··· 83 79 void (*start)(struct irq_bypass_consumer *); 84 80 }; 85 81 86 - int irq_bypass_register_producer(struct irq_bypass_producer *); 87 - void irq_bypass_unregister_producer(struct irq_bypass_producer *); 88 - int irq_bypass_register_consumer(struct irq_bypass_consumer *); 89 - void irq_bypass_unregister_consumer(struct irq_bypass_consumer *); 82 + int irq_bypass_register_producer(struct irq_bypass_producer *producer, 83 + struct eventfd_ctx *eventfd, int irq); 84 + void irq_bypass_unregister_producer(struct irq_bypass_producer *producer); 85 + int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, 86 + struct eventfd_ctx *eventfd); 87 + void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer); 90 88 91 89 #endif /* IRQBYPASS_H */

+1 -1

include/linux/irqchip/arm-gic-v4.h

··· 146 146 int its_invall_vpe(struct its_vpe *vpe); 147 147 int its_map_vlpi(int irq, struct its_vlpi_map *map); 148 148 int its_get_vlpi(int irq, struct its_vlpi_map *map); 149 - int its_unmap_vlpi(int irq); 149 + void its_unmap_vlpi(int irq); 150 150 int its_prop_update_vlpi(int irq, u8 config, bool inv); 151 151 int its_prop_update_vsgi(int irq, u8 priority, bool group); 152 152

+7 -11

include/linux/kvm_host.h

··· 190 190 191 191 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 192 192 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 193 + #define KVM_PIT_IRQ_SOURCE_ID 2 193 194 194 195 extern struct mutex kvm_lock; 195 196 extern struct list_head vm_list; ··· 1023 1022 void vcpu_load(struct kvm_vcpu *vcpu); 1024 1023 void vcpu_put(struct kvm_vcpu *vcpu); 1025 1024 1026 - #ifdef __KVM_HAVE_IOAPIC 1025 + #ifdef CONFIG_KVM_IOAPIC 1027 1026 void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); 1028 - void kvm_arch_post_irq_routing_update(struct kvm *kvm); 1029 1027 #else 1030 1028 static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) 1031 - { 1032 - } 1033 - static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) 1034 1029 { 1035 1030 } 1036 1031 #endif ··· 1785 1788 struct kvm_irq_ack_notifier *kian); 1786 1789 void kvm_unregister_irq_ack_notifier(struct kvm *kvm, 1787 1790 struct kvm_irq_ack_notifier *kian); 1788 - int kvm_request_irq_source_id(struct kvm *kvm); 1789 - void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); 1790 1791 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); 1791 1792 1792 1793 /* ··· 2401 2406 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); 2402 2407 2403 2408 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 2409 + struct kvm_kernel_irqfd; 2410 + 2404 2411 bool kvm_arch_has_irq_bypass(void); 2405 2412 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, 2406 2413 struct irq_bypass_producer *); ··· 2410 2413 struct irq_bypass_producer *); 2411 2414 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); 2412 2415 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); 2413 - int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, 2414 - uint32_t guest_irq, bool set); 2415 - bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, 2416 - struct kvm_kernel_irq_routing_entry *); 2416 + void kvm_arch_update_irqfd_routing(struct 
kvm_kernel_irqfd *irqfd, 2417 + struct kvm_kernel_irq_routing_entry *old, 2418 + struct kvm_kernel_irq_routing_entry *new); 2417 2419 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ 2418 2420 2419 2421 #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS

+4 -1

include/linux/kvm_irqfd.h

··· 55 55 /* Used for setup/shutdown */ 56 56 struct eventfd_ctx *eventfd; 57 57 struct list_head list; 58 - poll_table pt; 59 58 struct work_struct shutdown; 60 59 struct irq_bypass_consumer consumer; 61 60 struct irq_bypass_producer *producer; 61 + 62 + struct kvm_vcpu *irq_bypass_vcpu; 63 + struct list_head vcpu_list; 64 + void *irq_bypass_data; 62 65 }; 63 66 64 67 #endif /* __LINUX_KVM_IRQFD_H */

+2

include/linux/wait.h

··· 164 164 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); 165 165 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); 166 166 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); 167 + extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, 168 + struct wait_queue_entry *wq_entry); 167 169 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); 168 170 169 171 static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)

+2 -82

include/trace/events/kvm.h

··· 82 82 TP_printk("gsi %u level %d source %d", 83 83 __entry->gsi, __entry->level, __entry->irq_source_id) 84 84 ); 85 - #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ 86 85 87 - #if defined(__KVM_HAVE_IOAPIC) 88 - #define kvm_deliver_mode \ 89 - {0x0, "Fixed"}, \ 90 - {0x1, "LowPrio"}, \ 91 - {0x2, "SMI"}, \ 92 - {0x3, "Res3"}, \ 93 - {0x4, "NMI"}, \ 94 - {0x5, "INIT"}, \ 95 - {0x6, "SIPI"}, \ 96 - {0x7, "ExtINT"} 97 - 98 - TRACE_EVENT(kvm_ioapic_set_irq, 99 - TP_PROTO(__u64 e, int pin, bool coalesced), 100 - TP_ARGS(e, pin, coalesced), 101 - 102 - TP_STRUCT__entry( 103 - __field( __u64, e ) 104 - __field( int, pin ) 105 - __field( bool, coalesced ) 106 - ), 107 - 108 - TP_fast_assign( 109 - __entry->e = e; 110 - __entry->pin = pin; 111 - __entry->coalesced = coalesced; 112 - ), 113 - 114 - TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s", 115 - __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e, 116 - __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), 117 - (__entry->e & (1<<11)) ? "logical" : "physical", 118 - (__entry->e & (1<<15)) ? "level" : "edge", 119 - (__entry->e & (1<<16)) ? "|masked" : "", 120 - __entry->coalesced ? " (coalesced)" : "") 121 - ); 122 - 123 - TRACE_EVENT(kvm_ioapic_delayed_eoi_inj, 124 - TP_PROTO(__u64 e), 125 - TP_ARGS(e), 126 - 127 - TP_STRUCT__entry( 128 - __field( __u64, e ) 129 - ), 130 - 131 - TP_fast_assign( 132 - __entry->e = e; 133 - ), 134 - 135 - TP_printk("dst %x vec %u (%s|%s|%s%s)", 136 - (u8)(__entry->e >> 56), (u8)__entry->e, 137 - __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode), 138 - (__entry->e & (1<<11)) ? "logical" : "physical", 139 - (__entry->e & (1<<15)) ? "level" : "edge", 140 - (__entry->e & (1<<16)) ? 
"|masked" : "") 141 - ); 142 - 143 - TRACE_EVENT(kvm_msi_set_irq, 144 - TP_PROTO(__u64 address, __u64 data), 145 - TP_ARGS(address, data), 146 - 147 - TP_STRUCT__entry( 148 - __field( __u64, address ) 149 - __field( __u64, data ) 150 - ), 151 - 152 - TP_fast_assign( 153 - __entry->address = address; 154 - __entry->data = data; 155 - ), 156 - 157 - TP_printk("dst %llx vec %u (%s|%s|%s%s)", 158 - (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), 159 - (u8)__entry->data, 160 - __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), 161 - (__entry->address & (1<<2)) ? "logical" : "physical", 162 - (__entry->data & (1<<15)) ? "level" : "edge", 163 - (__entry->address & (1<<3)) ? "|rh" : "") 164 - ); 86 + #ifdef CONFIG_KVM_IOAPIC 165 87 166 88 #define kvm_irqchips \ 167 89 {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ 168 90 {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ 169 91 {KVM_IRQCHIP_IOAPIC, "IOAPIC"} 170 92 171 - #endif /* defined(__KVM_HAVE_IOAPIC) */ 172 - 173 - #if defined(CONFIG_HAVE_KVM_IRQCHIP) 93 + #endif /* CONFIG_KVM_IOAPIC */ 174 94 175 95 #ifdef kvm_irqchips 176 96 #define kvm_ack_irq_string "irqchip %s pin %u"

+20 -2

kernel/sched/wait.c

··· 40 40 { 41 41 unsigned long flags; 42 42 43 - wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; 43 + wq_entry->flags |= WQ_FLAG_PRIORITY; 44 44 spin_lock_irqsave(&wq_head->lock, flags); 45 45 __add_wait_queue(wq_head, wq_entry); 46 46 spin_unlock_irqrestore(&wq_head->lock, flags); 47 47 } 48 48 EXPORT_SYMBOL_GPL(add_wait_queue_priority); 49 + 50 + int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, 51 + struct wait_queue_entry *wq_entry) 52 + { 53 + struct list_head *head = &wq_head->head; 54 + 55 + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; 56 + 57 + guard(spinlock_irqsave)(&wq_head->lock); 58 + 59 + if (!list_empty(head) && 60 + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) 61 + return -EBUSY; 62 + 63 + list_add(&wq_entry->entry, head); 64 + return 0; 65 + } 66 + EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); 49 67 50 68 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) 51 69 { ··· 82 64 * the non-exclusive tasks. Normally, exclusive tasks will be at the end of 83 65 * the list and any non-exclusive tasks will be woken first. A priority task 84 66 * may be at the head of the list, and can consume the event without any other 85 - * tasks being woken. 67 + * tasks being woken if it's also an exclusive task. 86 68 * 87 69 * There are circumstances in which we can try to wake a task which has already 88 70 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

+1

tools/testing/selftests/kvm/Makefile.kvm

··· 59 59 TEST_GEN_PROGS_COMMON = demand_paging_test 60 60 TEST_GEN_PROGS_COMMON += dirty_log_test 61 61 TEST_GEN_PROGS_COMMON += guest_print_test 62 + TEST_GEN_PROGS_COMMON += irqfd_test 62 63 TEST_GEN_PROGS_COMMON += kvm_binary_stats_test 63 64 TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus 64 65 TEST_GEN_PROGS_COMMON += kvm_page_table_test

+3 -9

tools/testing/selftests/kvm/arm64/vgic_irq.c

··· 620 620 * that no actual interrupt was injected for those cases. 621 621 */ 622 622 623 - for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { 624 - fd[f] = eventfd(0, 0); 625 - TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); 626 - } 623 + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) 624 + fd[f] = kvm_new_eventfd(); 627 625 628 626 for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { 629 - struct kvm_irqfd irqfd = { 630 - .fd = fd[f], 631 - .gsi = i - MIN_SPI, 632 - }; 633 627 assert(i <= (uint64_t)UINT_MAX); 634 - vm_ioctl(vm, KVM_IRQFD, &irqfd); 628 + kvm_assign_irqfd(vm, i - MIN_SPI, fd[f]); 635 629 } 636 630 637 631 for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {

+1

tools/testing/selftests/kvm/config

··· 1 1 CONFIG_KVM=y 2 2 CONFIG_KVM_INTEL=y 3 3 CONFIG_KVM_AMD=y 4 + CONFIG_EVENTFD=y 4 5 CONFIG_USERFAULTFD=y 5 6 CONFIG_IDLE_PAGE_TRACKING=y

+40

tools/testing/selftests/kvm/include/kvm_util.h

··· 18 18 #include <asm/atomic.h> 19 19 #include <asm/kvm.h> 20 20 21 + #include <sys/eventfd.h> 21 22 #include <sys/ioctl.h> 22 23 23 24 #include "kvm_util_arch.h" ··· 500 499 int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); 501 500 502 501 TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); 502 + return fd; 503 + } 504 + 505 + static inline int __kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd, 506 + uint32_t flags) 507 + { 508 + struct kvm_irqfd irqfd = { 509 + .fd = eventfd, 510 + .gsi = gsi, 511 + .flags = flags, 512 + .resamplefd = -1, 513 + }; 514 + 515 + return __vm_ioctl(vm, KVM_IRQFD, &irqfd); 516 + } 517 + 518 + static inline void kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd, 519 + uint32_t flags) 520 + { 521 + int ret = __kvm_irqfd(vm, gsi, eventfd, flags); 522 + 523 + TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_IRQFD, ret, vm); 524 + } 525 + 526 + static inline void kvm_assign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd) 527 + { 528 + kvm_irqfd(vm, gsi, eventfd, 0); 529 + } 530 + 531 + static inline void kvm_deassign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd) 532 + { 533 + kvm_irqfd(vm, gsi, eventfd, KVM_IRQFD_FLAG_DEASSIGN); 534 + } 535 + 536 + static inline int kvm_new_eventfd(void) 537 + { 538 + int fd = eventfd(0, 0); 539 + 540 + TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("eventfd()", fd)); 503 541 return fd; 504 542 } 505 543

+135

tools/testing/selftests/kvm/irqfd_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <errno.h> 3 + #include <pthread.h> 4 + #include <stdio.h> 5 + #include <stdlib.h> 6 + #include <string.h> 7 + #include <signal.h> 8 + #include <stdint.h> 9 + #include <sys/sysinfo.h> 10 + 11 + #include "kvm_util.h" 12 + 13 + static struct kvm_vm *vm1; 14 + static struct kvm_vm *vm2; 15 + static int __eventfd; 16 + static bool done; 17 + 18 + /* 19 + * KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when 20 + * assigning (the API isn't symmetrical). Abuse the oddity and use a per-task 21 + * GSI base to avoid false failures due to cross-task de-assign, i.e. so that 22 + * the secondary doesn't de-assign the primary's eventfd and cause assign to 23 + * unexpectedly succeed on the primary. 24 + */ 25 + #define GSI_BASE_PRIMARY 0x20 26 + #define GSI_BASE_SECONDARY 0x30 27 + 28 + static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd) 29 + { 30 + int r, i; 31 + 32 + /* 33 + * The secondary task can encounter EBADF since the primary can close 34 + * the eventfd at any time. And because the primary can recreate the 35 + * eventfd, at the same fd in the file table, the secondary can also 36 + * encounter "unexpected" success, e.g. if the close+recreate happens 37 + * between the first and second assignments. The secondary's role is 38 + * mostly to antagonize KVM, not to detect bugs. 39 + */ 40 + for (i = 0; i < 2; i++) { 41 + r = __kvm_irqfd(vm, GSI_BASE_SECONDARY, eventfd, 0); 42 + TEST_ASSERT(!r || errno == EBUSY || errno == EBADF, 43 + "Wanted success, EBUSY, or EBADF, r = %d, errno = %d", 44 + r, errno); 45 + 46 + /* De-assign should succeed unless the eventfd was closed. 
*/ 47 + r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, KVM_IRQFD_FLAG_DEASSIGN); 48 + TEST_ASSERT(!r || errno == EBADF, 49 + "De-assign should succeed unless the fd was closed"); 50 + } 51 + } 52 + 53 + static void *secondary_irqfd_juggler(void *ign) 54 + { 55 + while (!READ_ONCE(done)) { 56 + juggle_eventfd_secondary(vm1, READ_ONCE(__eventfd)); 57 + juggle_eventfd_secondary(vm2, READ_ONCE(__eventfd)); 58 + } 59 + 60 + return NULL; 61 + } 62 + 63 + static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd) 64 + { 65 + int r1, r2; 66 + 67 + /* 68 + * At least one of the assigns should fail. KVM disallows assigning a 69 + * single eventfd to multiple GSIs (or VMs), so it's possible that both 70 + * assignments can fail, too. 71 + */ 72 + r1 = __kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, 0); 73 + TEST_ASSERT(!r1 || errno == EBUSY, 74 + "Wanted success or EBUSY, r = %d, errno = %d", r1, errno); 75 + 76 + r2 = __kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, 0); 77 + TEST_ASSERT(r1 || (r2 && errno == EBUSY), 78 + "Wanted failure (EBUSY), r1 = %d, r2 = %d, errno = %d", 79 + r1, r2, errno); 80 + 81 + /* 82 + * De-assign should always succeed, even if the corresponding assign 83 + * failed. 84 + */ 85 + kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, KVM_IRQFD_FLAG_DEASSIGN); 86 + kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, KVM_IRQFD_FLAG_DEASSIGN); 87 + } 88 + 89 + int main(int argc, char *argv[]) 90 + { 91 + pthread_t racing_thread; 92 + int r, i; 93 + 94 + /* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. 
*/ 95 + vm1 = vm_create(1); 96 + vm2 = vm_create(1); 97 + 98 + WRITE_ONCE(__eventfd, kvm_new_eventfd()); 99 + 100 + kvm_irqfd(vm1, 10, __eventfd, 0); 101 + 102 + r = __kvm_irqfd(vm1, 11, __eventfd, 0); 103 + TEST_ASSERT(r && errno == EBUSY, 104 + "Wanted EBUSY, r = %d, errno = %d", r, errno); 105 + 106 + r = __kvm_irqfd(vm2, 12, __eventfd, 0); 107 + TEST_ASSERT(r && errno == EBUSY, 108 + "Wanted EBUSY, r = %d, errno = %d", r, errno); 109 + 110 + /* 111 + * De-assign all eventfds, along with multiple eventfds that were never 112 + * assigned. KVM's ABI is that de-assign is allowed so long as the 113 + * eventfd itself is valid. 114 + */ 115 + kvm_irqfd(vm1, 11, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN); 116 + kvm_irqfd(vm1, 12, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN); 117 + kvm_irqfd(vm1, 13, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN); 118 + kvm_irqfd(vm1, 14, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN); 119 + kvm_irqfd(vm1, 10, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN); 120 + 121 + close(__eventfd); 122 + 123 + pthread_create(&racing_thread, NULL, secondary_irqfd_juggler, vm2); 124 + 125 + for (i = 0; i < 10000; i++) { 126 + WRITE_ONCE(__eventfd, kvm_new_eventfd()); 127 + 128 + juggle_eventfd_primary(vm1, __eventfd); 129 + juggle_eventfd_primary(vm2, __eventfd); 130 + close(__eventfd); 131 + } 132 + 133 + WRITE_ONCE(done, true); 134 + pthread_join(racing_thread, NULL); 135 + }

+12 -1

tools/testing/selftests/kvm/lib/kvm_util.c

··· 1716 1716 /* Create an interrupt controller chip for the specified VM. */ 1717 1717 void vm_create_irqchip(struct kvm_vm *vm) 1718 1718 { 1719 - vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); 1719 + int r; 1720 + 1721 + /* 1722 + * Allocate a fully in-kernel IRQ chip by default, but fall back to a 1723 + * split model (x86 only) if that fails (KVM x86 allows compiling out 1724 + * support for KVM_CREATE_IRQCHIP). 1725 + */ 1726 + r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); 1727 + if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP)) 1728 + vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24); 1729 + else 1730 + TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm); 1720 1731 1721 1732 vm->has_irqchip = true; 1722 1733 }

+4 -17

tools/testing/selftests/kvm/x86/xen_shinfo_test.c

··· 547 547 int irq_fd[2] = { -1, -1 }; 548 548 549 549 if (do_eventfd_tests) { 550 - irq_fd[0] = eventfd(0, 0); 551 - irq_fd[1] = eventfd(0, 0); 550 + irq_fd[0] = kvm_new_eventfd(); 551 + irq_fd[1] = kvm_new_eventfd(); 552 552 553 - /* Unexpected, but not a KVM failure */ 554 - if (irq_fd[0] == -1 || irq_fd[1] == -1) 555 - do_evtchn_tests = do_eventfd_tests = false; 556 - } 557 - 558 - if (do_eventfd_tests) { 559 553 irq_routes.info.nr = 2; 560 554 561 555 irq_routes.entries[0].gsi = 32; ··· 566 572 567 573 vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); 568 574 569 - struct kvm_irqfd ifd = { }; 570 - 571 - ifd.fd = irq_fd[0]; 572 - ifd.gsi = 32; 573 - vm_ioctl(vm, KVM_IRQFD, &ifd); 574 - 575 - ifd.fd = irq_fd[1]; 576 - ifd.gsi = 33; 577 - vm_ioctl(vm, KVM_IRQFD, &ifd); 575 + kvm_assign_irqfd(vm, 32, irq_fd[0]); 576 + kvm_assign_irqfd(vm, 33, irq_fd[1]); 578 577 579 578 struct sigaction sa = { }; 580 579 sa.sa_handler = handle_alrm;

+102 -57

virt/kvm/eventfd.c

··· 204 204 int ret = 0; 205 205 206 206 if (flags & EPOLLIN) { 207 + /* 208 + * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP, 209 + * as KVM holds irqfds.lock when registering the irqfd with the 210 + * eventfd. 211 + */ 207 212 u64 cnt; 208 213 eventfd_ctx_do_read(irqfd->eventfd, &cnt); 209 214 ··· 230 225 /* The eventfd is closing, detach from KVM */ 231 226 unsigned long iflags; 232 227 228 + /* 229 + * Taking irqfds.lock is safe here, as KVM holds a reference to 230 + * the eventfd when registering the irqfd, i.e. this path can't 231 + * be reached while kvm_irqfd_add() is running. 232 + */ 233 233 spin_lock_irqsave(&kvm->irqfds.lock, iflags); 234 234 235 235 /* ··· 255 245 return ret; 256 246 } 257 247 258 - static void 259 - irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, 260 - poll_table *pt) 261 - { 262 - struct kvm_kernel_irqfd *irqfd = 263 - container_of(pt, struct kvm_kernel_irqfd, pt); 264 - add_wait_queue_priority(wqh, &irqfd->wait); 265 - } 266 - 267 - /* Must be called under irqfds.lock */ 268 248 static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) 269 249 { 270 250 struct kvm_kernel_irq_routing_entry *e; 271 251 struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; 272 252 int n_entries; 253 + 254 + lockdep_assert_held(&kvm->irqfds.lock); 273 255 274 256 n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); 275 257 ··· 276 274 write_seqcount_end(&irqfd->irq_entry_sc); 277 275 } 278 276 277 + struct kvm_irqfd_pt { 278 + struct kvm_kernel_irqfd *irqfd; 279 + struct kvm *kvm; 280 + poll_table pt; 281 + int ret; 282 + }; 283 + 284 + static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh, 285 + poll_table *pt) 286 + { 287 + struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt); 288 + struct kvm_kernel_irqfd *irqfd = p->irqfd; 289 + struct kvm *kvm = p->kvm; 290 + 291 + /* 292 + * Note, irqfds.lock protects the irqfd's irq_entry, i.e. 
its routing, 293 + * and irqfds.items. It does NOT protect registering with the eventfd. 294 + */ 295 + spin_lock_irq(&kvm->irqfds.lock); 296 + 297 + /* 298 + * Initialize the routing information prior to adding the irqfd to the 299 + * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the 300 + * irqfd is registered. 301 + */ 302 + irqfd_update(kvm, irqfd); 303 + 304 + /* 305 + * Add the irqfd as a priority waiter on the eventfd, with a custom 306 + * wake-up handler, so that KVM *and only KVM* is notified whenever the 307 + * underlying eventfd is signaled. 308 + */ 309 + init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); 310 + 311 + /* 312 + * Temporarily lie to lockdep about holding irqfds.lock to avoid a 313 + * false positive regarding potential deadlock with irqfd_wakeup() 314 + * (see irqfd_wakeup() for details). 315 + * 316 + * Adding to the wait queue will fail if there is already a priority 317 + * waiter, i.e. if the eventfd is associated with another irqfd (in any 318 + * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown 319 + * jobs to complete, i.e. ensures the irqfd has been removed from the 320 + * eventfd's waitqueue before returning to userspace. 
321 + */ 322 + spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_); 323 + p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait); 324 + spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_); 325 + if (p->ret) 326 + goto out; 327 + 328 + list_add_tail(&irqfd->list, &kvm->irqfds.items); 329 + 330 + out: 331 + spin_unlock_irq(&kvm->irqfds.lock); 332 + } 333 + 279 334 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 280 335 void __attribute__((weak)) kvm_arch_irq_bypass_stop( 281 336 struct irq_bypass_consumer *cons) ··· 344 285 { 345 286 } 346 287 347 - int __attribute__((weak)) kvm_arch_update_irqfd_routing( 348 - struct kvm *kvm, unsigned int host_irq, 349 - uint32_t guest_irq, bool set) 288 + void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd, 289 + struct kvm_kernel_irq_routing_entry *old, 290 + struct kvm_kernel_irq_routing_entry *new) 350 291 { 351 - return 0; 352 - } 353 292 354 - bool __attribute__((weak)) kvm_arch_irqfd_route_changed( 355 - struct kvm_kernel_irq_routing_entry *old, 356 - struct kvm_kernel_irq_routing_entry *new) 357 - { 358 - return true; 359 293 } 360 294 #endif 361 295 362 296 static int 363 297 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) 364 298 { 365 - struct kvm_kernel_irqfd *irqfd, *tmp; 299 + struct kvm_kernel_irqfd *irqfd; 366 300 struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; 301 + struct kvm_irqfd_pt irqfd_pt; 367 302 int ret; 368 303 __poll_t events; 369 304 int idx; ··· 443 390 } 444 391 445 392 /* 446 - * Install our own custom wake-up handling so we are notified via 447 - * a callback whenever someone signals the underlying eventfd 393 + * Set the irqfd routing and add it to KVM's list before registering 394 + * the irqfd with the eventfd, so that the routing information is valid 395 + * and stays valid, e.g. if there are GSI routing changes, prior to 396 + * making the irqfd visible, i.e. before it might be signaled. 
397 + * 398 + * Note, holding SRCU ensures a stable read of routing information, and 399 + * also prevents irqfd_shutdown() from freeing the irqfd before it's 400 + * fully initialized. 448 401 */ 449 - init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); 450 - init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); 451 - 452 - spin_lock_irq(&kvm->irqfds.lock); 453 - 454 - ret = 0; 455 - list_for_each_entry(tmp, &kvm->irqfds.items, list) { 456 - if (irqfd->eventfd != tmp->eventfd) 457 - continue; 458 - /* This fd is used for another irq already. */ 459 - ret = -EBUSY; 460 - spin_unlock_irq(&kvm->irqfds.lock); 461 - goto fail; 462 - } 463 - 464 402 idx = srcu_read_lock(&kvm->irq_srcu); 465 - irqfd_update(kvm, irqfd); 466 - 467 - list_add_tail(&irqfd->list, &kvm->irqfds.items); 468 - 469 - spin_unlock_irq(&kvm->irqfds.lock); 470 403 471 404 /* 472 - * Check if there was an event already pending on the eventfd 473 - * before we registered, and trigger it as if we didn't miss it. 405 + * Register the irqfd with the eventfd by polling on the eventfd, and 406 + * simultaneously add the irqfd to KVM's list. If there was an event 407 + * pending on the eventfd prior to registering, manually trigger IRQ 408 + * injection. 
474 409 */ 475 - events = vfs_poll(fd_file(f), &irqfd->pt); 410 + irqfd_pt.irqfd = irqfd; 411 + irqfd_pt.kvm = kvm; 412 + init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register); 413 + 414 + events = vfs_poll(fd_file(f), &irqfd_pt.pt); 415 + 416 + ret = irqfd_pt.ret; 417 + if (ret) 418 + goto fail_poll; 476 419 477 420 if (events & EPOLLIN) 478 421 schedule_work(&irqfd->inject); 479 422 480 423 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 481 424 if (kvm_arch_has_irq_bypass()) { 482 - irqfd->consumer.token = (void *)irqfd->eventfd; 483 425 irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; 484 426 irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; 485 427 irqfd->consumer.stop = kvm_arch_irq_bypass_stop; 486 428 irqfd->consumer.start = kvm_arch_irq_bypass_start; 487 - ret = irq_bypass_register_consumer(&irqfd->consumer); 429 + ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd); 488 430 if (ret) 489 - pr_info("irq bypass consumer (token %p) registration fails: %d\n", 490 - irqfd->consumer.token, ret); 431 + pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n", 432 + irqfd->eventfd, ret); 491 433 } 492 434 #endif 493 435 494 436 srcu_read_unlock(&kvm->irq_srcu, idx); 495 437 return 0; 496 438 439 + fail_poll: 440 + srcu_read_unlock(&kvm->irq_srcu, idx); 497 441 fail: 498 442 if (irqfd->resampler) 499 443 irqfd_resampler_shutdown(irqfd); ··· 667 617 irqfd_update(kvm, irqfd); 668 618 669 619 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS) 670 - if (irqfd->producer && 671 - kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { 672 - int ret = kvm_arch_update_irqfd_routing( 673 - irqfd->kvm, irqfd->producer->irq, 674 - irqfd->gsi, 1); 675 - WARN_ON(ret); 676 - } 620 + if (irqfd->producer) 621 + kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry); 677 622 #endif 678 623 } 679 624

-2

virt/kvm/irqchip.c

··· 222 222 kvm_arch_irq_routing_update(kvm); 223 223 mutex_unlock(&kvm->irq_lock); 224 224 225 - kvm_arch_post_irq_routing_update(kvm); 226 - 227 225 synchronize_srcu_expedited(&kvm->irq_srcu); 228 226 229 227 new = old;

+69 -121

virt/lib/irqbypass.c

··· 22 22 MODULE_LICENSE("GPL v2"); 23 23 MODULE_DESCRIPTION("IRQ bypass manager utility module"); 24 24 25 - static LIST_HEAD(producers); 26 - static LIST_HEAD(consumers); 25 + static DEFINE_XARRAY(producers); 26 + static DEFINE_XARRAY(consumers); 27 27 static DEFINE_MUTEX(lock); 28 28 29 29 /* @lock must be held when calling connect */ ··· 51 51 if (prod->start) 52 52 prod->start(prod); 53 53 54 + if (!ret) { 55 + prod->consumer = cons; 56 + cons->producer = prod; 57 + } 54 58 return ret; 55 59 } 56 60 ··· 76 72 cons->start(cons); 77 73 if (prod->start) 78 74 prod->start(prod); 75 + 76 + prod->consumer = NULL; 77 + cons->producer = NULL; 79 78 } 80 79 81 80 /** 82 81 * irq_bypass_register_producer - register IRQ bypass producer 83 82 * @producer: pointer to producer structure 83 + * @eventfd: pointer to the eventfd context associated with the producer 84 + * @irq: Linux IRQ number of the underlying producer device 84 85 * 85 - * Add the provided IRQ producer to the list of producers and connect 86 - * with any matching token found on the IRQ consumers list. 86 + * Add the provided IRQ producer to the set of producers and connect with the 87 + * consumer with a matching eventfd, if one exists. 
87 88 */ 88 - int irq_bypass_register_producer(struct irq_bypass_producer *producer) 89 + int irq_bypass_register_producer(struct irq_bypass_producer *producer, 90 + struct eventfd_ctx *eventfd, int irq) 89 91 { 90 - struct irq_bypass_producer *tmp; 92 + unsigned long index = (unsigned long)eventfd; 91 93 struct irq_bypass_consumer *consumer; 92 94 int ret; 93 95 94 - if (!producer->token) 96 + if (WARN_ON_ONCE(producer->eventfd)) 95 97 return -EINVAL; 96 98 97 - might_sleep(); 99 + producer->irq = irq; 98 100 99 - if (!try_module_get(THIS_MODULE)) 100 - return -ENODEV; 101 + guard(mutex)(&lock); 101 102 102 - mutex_lock(&lock); 103 + ret = xa_insert(&producers, index, producer, GFP_KERNEL); 104 + if (ret) 105 + return ret; 103 106 104 - list_for_each_entry(tmp, &producers, node) { 105 - if (tmp->token == producer->token) { 106 - ret = -EBUSY; 107 - goto out_err; 107 + consumer = xa_load(&consumers, index); 108 + if (consumer) { 109 + ret = __connect(producer, consumer); 110 + if (ret) { 111 + WARN_ON_ONCE(xa_erase(&producers, index) != producer); 112 + return ret; 108 113 } 109 114 } 110 115 111 - list_for_each_entry(consumer, &consumers, node) { 112 - if (consumer->token == producer->token) { 113 - ret = __connect(producer, consumer); 114 - if (ret) 115 - goto out_err; 116 - break; 117 - } 118 - } 119 - 120 - list_add(&producer->node, &producers); 121 - 122 - mutex_unlock(&lock); 123 - 116 + producer->eventfd = eventfd; 124 117 return 0; 125 - out_err: 126 - mutex_unlock(&lock); 127 - module_put(THIS_MODULE); 128 - return ret; 129 118 } 130 119 EXPORT_SYMBOL_GPL(irq_bypass_register_producer); 131 120 ··· 126 129 * irq_bypass_unregister_producer - unregister IRQ bypass producer 127 130 * @producer: pointer to producer structure 128 131 * 129 - * Remove a previously registered IRQ producer from the list of producers 130 - * and disconnect it from any connected IRQ consumer. 
132 + * Remove a previously registered IRQ producer (note, it's safe to call this 133 + * even if registration was unsuccessful). Disconnect from the associated 134 + * consumer, if one exists. 131 135 */ 132 136 void irq_bypass_unregister_producer(struct irq_bypass_producer *producer) 133 137 { 134 - struct irq_bypass_producer *tmp; 135 - struct irq_bypass_consumer *consumer; 138 + unsigned long index = (unsigned long)producer->eventfd; 136 139 137 - if (!producer->token) 140 + if (!producer->eventfd) 138 141 return; 139 142 140 - might_sleep(); 143 + guard(mutex)(&lock); 141 144 142 - if (!try_module_get(THIS_MODULE)) 143 - return; /* nothing in the list anyway */ 145 + if (producer->consumer) 146 + __disconnect(producer, producer->consumer); 144 147 145 - mutex_lock(&lock); 146 - 147 - list_for_each_entry(tmp, &producers, node) { 148 - if (tmp->token != producer->token) 149 - continue; 150 - 151 - list_for_each_entry(consumer, &consumers, node) { 152 - if (consumer->token == producer->token) { 153 - __disconnect(producer, consumer); 154 - break; 155 - } 156 - } 157 - 158 - list_del(&producer->node); 159 - module_put(THIS_MODULE); 160 - break; 161 - } 162 - 163 - mutex_unlock(&lock); 164 - 165 - module_put(THIS_MODULE); 148 + WARN_ON_ONCE(xa_erase(&producers, index) != producer); 149 + producer->eventfd = NULL; 166 150 } 167 151 EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer); 168 152 169 153 /** 170 154 * irq_bypass_register_consumer - register IRQ bypass consumer 171 155 * @consumer: pointer to consumer structure 156 + * @eventfd: pointer to the eventfd context associated with the consumer 172 157 * 173 - * Add the provided IRQ consumer to the list of consumers and connect 174 - * with any matching token found on the IRQ producer list. 158 + * Add the provided IRQ consumer to the set of consumers and connect with the 159 + * producer with a matching eventfd, if one exists. 
175 160 */ 176 - int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer) 161 + int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer, 162 + struct eventfd_ctx *eventfd) 177 163 { 178 - struct irq_bypass_consumer *tmp; 164 + unsigned long index = (unsigned long)eventfd; 179 165 struct irq_bypass_producer *producer; 180 166 int ret; 181 167 182 - if (!consumer->token || 183 - !consumer->add_producer || !consumer->del_producer) 168 + if (WARN_ON_ONCE(consumer->eventfd)) 184 169 return -EINVAL; 185 170 186 - might_sleep(); 171 + if (!consumer->add_producer || !consumer->del_producer) 172 + return -EINVAL; 187 173 188 - if (!try_module_get(THIS_MODULE)) 189 - return -ENODEV; 174 + guard(mutex)(&lock); 190 175 191 - mutex_lock(&lock); 176 + ret = xa_insert(&consumers, index, consumer, GFP_KERNEL); 177 + if (ret) 178 + return ret; 192 179 193 - list_for_each_entry(tmp, &consumers, node) { 194 - if (tmp->token == consumer->token || tmp == consumer) { 195 - ret = -EBUSY; 196 - goto out_err; 180 + producer = xa_load(&producers, index); 181 + if (producer) { 182 + ret = __connect(producer, consumer); 183 + if (ret) { 184 + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); 185 + return ret; 197 186 } 198 187 } 199 188 200 - list_for_each_entry(producer, &producers, node) { 201 - if (producer->token == consumer->token) { 202 - ret = __connect(producer, consumer); 203 - if (ret) 204 - goto out_err; 205 - break; 206 - } 207 - } 208 - 209 - list_add(&consumer->node, &consumers); 210 - 211 - mutex_unlock(&lock); 212 - 189 + consumer->eventfd = eventfd; 213 190 return 0; 214 - out_err: 215 - mutex_unlock(&lock); 216 - module_put(THIS_MODULE); 217 - return ret; 218 191 } 219 192 EXPORT_SYMBOL_GPL(irq_bypass_register_consumer); 220 193 ··· 192 225 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer 193 226 * @consumer: pointer to consumer structure 194 227 * 195 - * Remove a previously registered IRQ consumer from the list of consumers 
196 - * and disconnect it from any connected IRQ producer. 228 + * Remove a previously registered IRQ consumer (note, it's safe to call this 229 + * even if registration was unsuccessful). Disconnect from the associated 230 + * producer, if one exists. 197 231 */ 198 232 void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer) 199 233 { 200 - struct irq_bypass_consumer *tmp; 201 - struct irq_bypass_producer *producer; 234 + unsigned long index = (unsigned long)consumer->eventfd; 202 235 203 - if (!consumer->token) 236 + if (!consumer->eventfd) 204 237 return; 205 238 206 - might_sleep(); 239 + guard(mutex)(&lock); 207 240 208 - if (!try_module_get(THIS_MODULE)) 209 - return; /* nothing in the list anyway */ 241 + if (consumer->producer) 242 + __disconnect(consumer->producer, consumer); 210 243 211 - mutex_lock(&lock); 212 - 213 - list_for_each_entry(tmp, &consumers, node) { 214 - if (tmp != consumer) 215 - continue; 216 - 217 - list_for_each_entry(producer, &producers, node) { 218 - if (producer->token == consumer->token) { 219 - __disconnect(producer, consumer); 220 - break; 221 - } 222 - } 223 - 224 - list_del(&consumer->node); 225 - module_put(THIS_MODULE); 226 - break; 227 - } 228 - 229 - mutex_unlock(&lock); 230 - 231 - module_put(THIS_MODULE); 244 + WARN_ON_ONCE(xa_erase(&consumers, index) != consumer); 245 + consumer->eventfd = NULL; 232 246 } 233 247 EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);

Configure Feed

Configure Feed