Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'irq-urgent-2021-08-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull irq fixes from Thomas Gleixner:
"A set of fixes for PCI/MSI and x86 interrupt startup:

- Mask all MSI-X entries when enabling MSI-X; otherwise stale unmasked
entries stay around, e.g. when a crashkernel is booted.

- Enforce masking of an MSI-X table entry when updating it, which is
mandatory according to the specification (see the sketch after this
list).

- Ensure that writes to MSI[-X] tables are flushed.

- Prevent invalid bits being set in the MSI mask register

- Properly serialize modifications to the mask cache and the mask
register for multi-MSI.

- Cure the violation of the affinity setting rules on x86 during
interrupt startup, which can cause lost and stale interrupts. Move
the initial affinity setting ahead of actually enabling the
interrupt.

- Ensure that MSI interrupts are completely torn down before freeing
them in the error handling case.

- Prevent an array out of bounds access in the irq timings code"
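
To illustrate the masking rule above, here is a minimal sketch of a
spec-conforming MSI-X entry update. This is not code from the pull
request: the helper msix_write_entry() and its parameters are
hypothetical, while the PCI_MSIX_ENTRY_* constants and the
readl()/writel() accessors are the kernel's own. Compare the
__pci_write_msi_msg() hunk in drivers/pci/msi.c below.

/* Sketch only: the spec leaves the result undefined if the Address or
 * Data of an unmasked entry is changed, so mask around the update and
 * flush the posted writes with a readback. */
static void msix_write_entry(void __iomem *entry_base,
			     u32 address_lo, u32 address_hi, u32 data)
{
	u32 ctrl = readl(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
	bool unmasked = !(ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT);

	/* Mask the entry first if it is currently unmasked */
	if (unmasked)
		writel(ctrl | PCI_MSIX_ENTRY_CTRL_MASKBIT,
		       entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);

	writel(address_lo, entry_base + PCI_MSIX_ENTRY_LOWER_ADDR);
	writel(address_hi, entry_base + PCI_MSIX_ENTRY_UPPER_ADDR);
	writel(data, entry_base + PCI_MSIX_ENTRY_DATA);

	/* Restore the previous mask state */
	if (unmasked)
		writel(ctrl, entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);

	/* Read back so the posted writes reach the device */
	readl(entry_base + PCI_MSIX_ENTRY_DATA);
}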

* tag 'irq-urgent-2021-08-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
driver core: Add missing kernel doc for device::msi_lock
genirq/msi: Ensure deactivation on teardown
genirq/timings: Prevent potential array overflow in __irq_timings_store()
x86/msi: Force affinity setup before startup
x86/ioapic: Force affinity setup before startup
genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP
PCI/MSI: Protect msi_desc::masked for multi-MSI
PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown()
PCI/MSI: Correct misleading comments
PCI/MSI: Do not set invalid bits in MSI mask
PCI/MSI: Enforce MSI[X] entry updates to be visible
PCI/MSI: Enforce that MSI-X table entry is masked for update
PCI/MSI: Mask all unused MSI-X entries
PCI/MSI: Enable and mask MSI-X early

+114 -62
+4 -2
arch/x86/kernel/apic/io_apic.c
···
 	.irq_set_affinity	= ioapic_set_affinity,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_get_irqchip_state	= ioapic_irq_get_chip_state,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 static struct irq_chip ioapic_ir_chip __read_mostly = {
···
 	.irq_set_affinity	= ioapic_set_affinity,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_get_irqchip_state	= ioapic_irq_get_chip_state,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 static inline void init_IO_APIC_traps(void)
+8 -3
arch/x86/kernel/apic/msi.c
···
  * The quirk bit is not set in this case.
  * - The new vector is the same as the old vector
  * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
  * - The new destination CPU is the same as the old destination CPU
  */
 if (!irqd_msi_nomask_quirk(irqd) ||
     cfg->vector == old_cfg.vector ||
     old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+    !irqd_is_started(irqd) ||
     cfg->dest_apicid == old_cfg.dest_apicid) {
 	irq_msi_update_msg(irqd, cfg);
 	return ret;
···
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_set_affinity	= msi_set_affinity,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
···
 	.irq_mask		= pci_msi_mask_irq,
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 static struct msi_domain_info pci_msi_ir_domain_info = {
···
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_compose_msi_msg	= dmar_msi_compose_msg,
 	.irq_write_msi_msg	= dmar_msi_write_msg,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 static int dmar_msi_init(struct irq_domain *domain,
+1 -1
arch/x86/kernel/hpet.c
···
 	.irq_set_affinity	= msi_domain_set_affinity,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_write_msi_msg	= hpet_msi_write_msg,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
 };
 
 static int hpet_msi_init(struct irq_domain *domain,
+1
drivers/base/core.c
···
 	device_pm_init(dev);
 	set_dev_node(dev, -1);
 #ifdef CONFIG_GENERIC_MSI_IRQ
+	raw_spin_lock_init(&dev->msi_lock);
 	INIT_LIST_HEAD(&dev->msi_list);
 #endif
 	INIT_LIST_HEAD(&dev->links.consumers);
+78 -49
drivers/pci/msi.c
···
  * reliably as devices without an INTx disable bit will then generate a
  * level IRQ which will never be cleared.
  */
-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
-	u32 mask_bits = desc->masked;
+	raw_spinlock_t *lock = &desc->dev->msi_lock;
+	unsigned long flags;
 
 	if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
-		return 0;
+		return;
 
-	mask_bits &= ~mask;
-	mask_bits |= flag;
+	raw_spin_lock_irqsave(lock, flags);
+	desc->masked &= ~mask;
+	desc->masked |= flag;
 	pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
-			       mask_bits);
-
-	return mask_bits;
+			       desc->masked);
+	raw_spin_unlock_irqrestore(lock, flags);
 }
 
 static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
-	desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
+	__pci_msi_desc_mask_irq(desc, mask, flag);
 }
 
 static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
···
 		/* Don't touch the hardware now */
 	} else if (entry->msi_attrib.is_msix) {
 		void __iomem *base = pci_msix_desc_addr(entry);
+		bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);
 
 		if (!base)
 			goto skip;
 
+		/*
+		 * The specification mandates that the entry is masked
+		 * when the message is modified:
+		 *
+		 * "If software changes the Address or Data value of an
+		 *  entry while the entry is unmasked, the result is
+		 *  undefined."
+		 */
+		if (unmasked)
+			__pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
+
 		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
 		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
 		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
+
+		if (unmasked)
+			__pci_msix_desc_mask_irq(entry, 0);
+
+		/* Ensure that the writes are visible in the device */
+		readl(base + PCI_MSIX_ENTRY_DATA);
 	} else {
 		int pos = dev->msi_cap;
 		u16 msgctl;
···
 			pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
 					      msg->data);
 		}
+		/* Ensure that the writes are visible in the device */
+		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
 	}
 
 skip:
···
 	/* Configure MSI capability structure */
 	ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
 	if (ret) {
-		msi_mask_irq(entry, mask, ~mask);
+		msi_mask_irq(entry, mask, 0);
 		free_msi_irqs(dev);
 		return ret;
 	}
 
 	ret = msi_verify_entries(dev);
 	if (ret) {
-		msi_mask_irq(entry, mask, ~mask);
+		msi_mask_irq(entry, mask, 0);
 		free_msi_irqs(dev);
 		return ret;
 	}
 
 	ret = populate_msi_sysfs(dev);
 	if (ret) {
-		msi_mask_irq(entry, mask, ~mask);
+		msi_mask_irq(entry, mask, 0);
 		free_msi_irqs(dev);
 		return ret;
 	}
···
 {
 	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
+	void __iomem *addr;
 	int ret, i;
 	int vec_count = pci_msix_vec_count(dev);
···
 		entry->msi_attrib.is_msix	= 1;
 		entry->msi_attrib.is_64		= 1;
+
 		if (entries)
 			entry->msi_attrib.entry_nr = entries[i].entry;
 		else
···
 		entry->msi_attrib.default_irq	= dev->irq;
 		entry->mask_base		= base;
 
+		addr = pci_msix_desc_addr(entry);
+		if (addr)
+			entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
 		list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
 		if (masks)
 			curmsk++;
···
 	return ret;
 }
 
-static void msix_program_entries(struct pci_dev *dev,
-				 struct msix_entry *entries)
+static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries)
 {
 	struct msi_desc *entry;
-	int i = 0;
-	void __iomem *desc_addr;
 
 	for_each_pci_msi_entry(entry, dev) {
-		if (entries)
-			entries[i++].vector = entry->irq;
-
-		desc_addr = pci_msix_desc_addr(entry);
-		if (desc_addr)
-			entry->masked = readl(desc_addr +
-					      PCI_MSIX_ENTRY_VECTOR_CTRL);
-		else
-			entry->masked = 0;
-
-		msix_mask_irq(entry, 1);
+		if (entries) {
+			entries->vector = entry->irq;
+			entries++;
+		}
 	}
+}
+
+static void msix_mask_all(void __iomem *base, int tsize)
+{
+	u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT;
+	int i;
+
+	for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE)
+		writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL);
 }
 
 /**
···
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 				int nvec, struct irq_affinity *affd)
 {
-	int ret;
-	u16 control;
 	void __iomem *base;
+	int ret, tsize;
+	u16 control;
 
-	/* Ensure MSI-X is disabled while it is set up */
-	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+	/*
+	 * Some devices require MSI-X to be enabled before the MSI-X
+	 * registers can be accessed. Mask all the vectors to prevent
+	 * interrupts coming in before they're fully set up.
+	 */
+	pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
+				    PCI_MSIX_FLAGS_ENABLE);
 
 	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control);
 	/* Request & Map MSI-X table region */
-	base = msix_map_region(dev, msix_table_size(control));
-	if (!base)
-		return -ENOMEM;
+	tsize = msix_table_size(control);
+	base = msix_map_region(dev, tsize);
+	if (!base) {
+		ret = -ENOMEM;
+		goto out_disable;
+	}
+
+	/* Ensure that all table entries are masked. */
+	msix_mask_all(base, tsize);
 
 	ret = msix_setup_entries(dev, base, entries, nvec, affd);
 	if (ret)
-		return ret;
+		goto out_disable;
 
 	ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
 	if (ret)
···
 	if (ret)
 		goto out_free;
 
-	/*
-	 * Some devices require MSI-X to be enabled before we can touch the
-	 * MSI-X registers. We need to mask all the vectors to prevent
-	 * interrupts coming in before they're fully set up.
-	 */
-	pci_msix_clear_and_set_ctrl(dev, 0,
-				    PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE);
-
-	msix_program_entries(dev, entries);
+	msix_update_entries(dev, entries);
 
 	ret = populate_msi_sysfs(dev);
 	if (ret)
···
 out_free:
 	free_msi_irqs(dev);
+
+out_disable:
+	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 
 	return ret;
 }
···
 	/* Return the device with MSI unmasked as initial states */
 	mask = msi_mask(desc->msi_attrib.multi_cap);
-	/* Keep cached state to be restored */
-	__pci_msi_desc_mask_irq(desc, mask, ~mask);
+	msi_mask_irq(desc, mask, 0);
 
 	/* Restore dev->irq to its default pin-assertion IRQ */
 	dev->irq = desc->msi_attrib.default_irq;
···
 	}
 
 	/* Return the device with MSI-X masked as initial states */
-	for_each_pci_msi_entry(entry, dev) {
-		/* Keep cached states to be restored */
+	for_each_pci_msi_entry(entry, dev)
 		__pci_msix_desc_mask_irq(entry, 1);
-	}
 
 	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 	pci_intx_for_msi(dev, 1);
+2
include/linux/device.h
···
  * @em_pd:	device's energy model performance domain
  * @pins:	For device pin management.
  *		See Documentation/driver-api/pin-control.rst for details.
+ * @msi_lock:	Lock to protect MSI mask cache and mask register
  * @msi_list:	Hosts MSI descriptors
  * @msi_domain:	The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
···
 	struct dev_pin_info	*pins;
 #endif
 #ifdef CONFIG_GENERIC_MSI_IRQ
+	raw_spinlock_t		msi_lock;
 	struct list_head	msi_list;
 #endif
 #ifdef CONFIG_DMA_OPS
+2
include/linux/irq.h
···
  * IRQCHIP_SUPPORTS_NMI:		Chip can deliver NMIs, only for root irqchips
  * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND:	Invokes __enable_irq()/__disable_irq() for wake irqs
  *					in the suspend path if they are in disabled state
+ * IRQCHIP_AFFINITY_PRE_STARTUP:	Default affinity update before startup
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED			= (1 << 0),
···
 	IRQCHIP_SUPPORTS_LEVEL_MSI		= (1 << 7),
 	IRQCHIP_SUPPORTS_NMI			= (1 << 8),
 	IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND	= (1 << 9),
+	IRQCHIP_AFFINITY_PRE_STARTUP		= (1 << 10),
 };
 
 #include <linux/irqdesc.h>
+1 -1
include/linux/msi.h
···
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
+void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
+4 -1
kernel/irq/chip.c
···
 	} else {
 		switch (__irq_startup_managed(desc, aff, force)) {
 		case IRQ_STARTUP_NORMAL:
+			if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+				irq_setup_affinity(desc);
 			ret = __irq_startup(desc);
-			irq_setup_affinity(desc);
+			if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+				irq_setup_affinity(desc);
 			break;
 		case IRQ_STARTUP_MANAGED:
 			irq_do_set_affinity(d, aff, false);
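
The kernel/irq/chip.c change above is the mechanism; an irq_chip opts
in by OR-ing the new flag into its ->flags, exactly as the x86 chips
earlier in this diff do. A minimal sketch with a hypothetical chip
name, using parent-chip helpers that exist in the kernel:

static struct irq_chip example_msi_chip = {
	.name			= "example-MSI",	/* hypothetical */
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	/* Apply the default affinity before irq_startup() enables the
	 * interrupt, closing the lost/stale interrupt window. */
	.flags			= IRQCHIP_SKIP_SET_WAKE |
				  IRQCHIP_AFFINITY_PRE_STARTUP,
};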
+8 -5
kernel/irq/msi.c
···
 	return 0;
 
 cleanup:
-	for_each_msi_vector(desc, i, dev) {
-		irq_data = irq_domain_get_irq_data(domain, i);
-		if (irqd_is_activated(irq_data))
-			irq_domain_deactivate_irq(irq_data);
-	}
 	msi_domain_free_irqs(domain, dev);
 	return ret;
 }
···
 
 void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
 {
+	struct irq_data *irq_data;
 	struct msi_desc *desc;
+	int i;
+
+	for_each_msi_vector(desc, i, dev) {
+		irq_data = irq_domain_get_irq_data(domain, i);
+		if (irqd_is_activated(irq_data))
+			irq_domain_deactivate_irq(irq_data);
+	}
 
 	for_each_msi_entry(desc, dev) {
 		/*
+5
kernel/irq/timings.c
···
 	 */
 	index = irq_timings_interval_index(interval);
 
+	if (index > PREDICTION_BUFFER_SIZE - 1) {
+		irqs->count = 0;
+		return;
+	}
+
 	/*
 	 * Store the index as an element of the pattern in another
 	 * circular array.