Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-urgent-2020-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
"A set of fixes for X86:

- Ensure that the PIT is set up when the local APIC is disabled or
configured in legacy mode. This is caused by an ordering issue
introduced in the recent changes which skip PIT initialization when
the TSC and APIC frequencies are already known.

- Handle malformed SRAT tables during early ACPI parsing which caused
an infinite loop and a boot hang.

- Fix a long standing race in the affinity setting code which affects
PCI devices with non-maskable MSI interrupts. The problem is caused
by the non-atomic writes of the MSI address (destination APIC id)
and data (vector) fields which the device uses to construct the MSI
message. The non-atomic writes are mandated by PCI.

If both fields change and the device raises an interrupt after
writing address and before writing data, then the MSI block
constructs an inconsistent message which causes interrupts to be
lost and subsequent malfunction of the device.

The fix is to redirect the interrupt to the new vector on the
current CPU first and then switch it over to the new target CPU.
This allows an interrupt raised in the transitional stage (old
CPU, new vector) to be observed in the APIC IRR and retriggered
on the new target CPU with the new vector.

The potential spurious interrupts caused by this are harmless and
can in the worst case expose a buggy driver (all handlers have to
be able to deal with spurious interrupts as they can and do happen
for various reasons).

- Add the missing suspend/resume mechanism for the HYPERV hypercall
page, the lack of which prevented resume from hibernation on
HYPERV guests. This change got lost before the merge window.

- Mask the IOAPIC before disabling the local APIC to prevent
potentially stale IOAPIC remote IRR bits which cause stale
interrupt lines after resume"

* tag 'x86-urgent-2020-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/apic: Mask IOAPIC entries when disabling the local APIC
x86/hyperv: Suspend/resume the hypercall page for hibernation
x86/apic/msi: Plug non-maskable MSI affinity race
x86/boot: Handle malformed SRAT tables during early ACPI parsing
x86/timer: Don't skip PIT setup when APIC is disabled or in legacy mode

+261 -12
+6
arch/x86/boot/compressed/acpi.c
··· 393 393 table = table_addr + sizeof(struct acpi_table_srat); 394 394 395 395 while (table + sizeof(struct acpi_subtable_header) < table_end) { 396 + 396 397 sub_table = (struct acpi_subtable_header *)table; 398 + if (!sub_table->length) { 399 + debug_putstr("Invalid zero length SRAT subtable.\n"); 400 + return 0; 401 + } 402 + 397 403 if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) { 398 404 struct acpi_srat_mem_affinity *ma; 399 405
+50
arch/x86/hyperv/hv_init.c
··· 21 21 #include <linux/hyperv.h> 22 22 #include <linux/slab.h> 23 23 #include <linux/cpuhotplug.h> 24 + #include <linux/syscore_ops.h> 24 25 #include <clocksource/hyperv_timer.h> 25 26 26 27 void *hv_hypercall_pg; 27 28 EXPORT_SYMBOL_GPL(hv_hypercall_pg); 29 + 30 + /* Storage to save the hypercall page temporarily for hibernation */ 31 + static void *hv_hypercall_pg_saved; 28 32 29 33 u32 *hv_vp_index; 30 34 EXPORT_SYMBOL_GPL(hv_vp_index); ··· 250 246 return 1; 251 247 } 252 248 249 + static int hv_suspend(void) 250 + { 251 + union hv_x64_msr_hypercall_contents hypercall_msr; 252 + 253 + /* 254 + * Reset the hypercall page as it is going to be invalidated 255 + * accross hibernation. Setting hv_hypercall_pg to NULL ensures 256 + * that any subsequent hypercall operation fails safely instead of 257 + * crashing due to an access of an invalid page. The hypercall page 258 + * pointer is restored on resume. 259 + */ 260 + hv_hypercall_pg_saved = hv_hypercall_pg; 261 + hv_hypercall_pg = NULL; 262 + 263 + /* Disable the hypercall page in the hypervisor */ 264 + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 265 + hypercall_msr.enable = 0; 266 + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 267 + 268 + return 0; 269 + } 270 + 271 + static void hv_resume(void) 272 + { 273 + union hv_x64_msr_hypercall_contents hypercall_msr; 274 + 275 + /* Re-enable the hypercall page */ 276 + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 277 + hypercall_msr.enable = 1; 278 + hypercall_msr.guest_physical_address = 279 + vmalloc_to_pfn(hv_hypercall_pg_saved); 280 + wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 281 + 282 + hv_hypercall_pg = hv_hypercall_pg_saved; 283 + hv_hypercall_pg_saved = NULL; 284 + } 285 + 286 + static struct syscore_ops hv_syscore_ops = { 287 + .suspend = hv_suspend, 288 + .resume = hv_resume, 289 + }; 290 + 253 291 /* 254 292 * This function is to be invoked early in the boot sequence after the 255 293 * hypervisor has been 
detected. ··· 376 330 377 331 x86_init.pci.arch_init = hv_pci_init; 378 332 333 + register_syscore_ops(&hv_syscore_ops); 334 + 379 335 return; 380 336 381 337 remove_cpuhp_state: ··· 396 348 void hyperv_cleanup(void) 397 349 { 398 350 union hv_x64_msr_hypercall_contents hypercall_msr; 351 + 352 + unregister_syscore_ops(&hv_syscore_ops); 399 353 400 354 /* Reset our OS id */ 401 355 wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+10
arch/x86/include/asm/apic.h
··· 140 140 extern void lapic_shutdown(void); 141 141 extern void sync_Arb_IDs(void); 142 142 extern void init_bsp_APIC(void); 143 + extern void apic_intr_mode_select(void); 143 144 extern void apic_intr_mode_init(void); 144 145 extern void init_apic_mappings(void); 145 146 void register_lapic_address(unsigned long address); ··· 189 188 # define setup_secondary_APIC_clock x86_init_noop 190 189 static inline void lapic_update_tsc_freq(void) { } 191 190 static inline void init_bsp_APIC(void) { } 191 + static inline void apic_intr_mode_select(void) { } 192 192 static inline void apic_intr_mode_init(void) { } 193 193 static inline void lapic_assign_system_vectors(void) { } 194 194 static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } ··· 452 450 * ... yummie. 453 451 */ 454 452 apic_eoi(); 453 + } 454 + 455 + 456 + static inline bool lapic_vector_set_in_irr(unsigned int vector) 457 + { 458 + u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 459 + 460 + return !!(irr & (1U << (vector % 32))); 455 461 } 456 462 457 463 static inline unsigned default_get_apic_id(unsigned long x)
+2
arch/x86/include/asm/x86_init.h
··· 51 51 * are set up. 52 52 * @intr_init: interrupt init code 53 53 * @trap_init: platform specific trap setup 54 + * @intr_mode_select: interrupt delivery mode selection 54 55 * @intr_mode_init: interrupt delivery mode setup 55 56 */ 56 57 struct x86_init_irqs { 57 58 void (*pre_vector_init)(void); 58 59 void (*intr_init)(void); 59 60 void (*trap_init)(void); 61 + void (*intr_mode_select)(void); 60 62 void (*intr_mode_init)(void); 61 63 }; 62 64
+25 -5
arch/x86/kernel/apic/apic.c
··· 830 830 if (!tsc_khz || !cpu_khz) 831 831 return true; 832 832 833 - /* Is there an APIC at all? */ 834 - if (!boot_cpu_has(X86_FEATURE_APIC)) 833 + /* Is there an APIC at all or is it disabled? */ 834 + if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) 835 + return true; 836 + 837 + /* 838 + * If interrupt delivery mode is legacy PIC or virtual wire without 839 + * configuration, the local APIC timer wont be set up. Make sure 840 + * that the PIT is initialized. 841 + */ 842 + if (apic_intr_mode == APIC_PIC || 843 + apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG) 835 844 return true; 836 845 837 846 /* Virt guests may lack ARAT, but still have DEADLINE */ ··· 1331 1322 1332 1323 enum apic_intr_mode_id apic_intr_mode __ro_after_init; 1333 1324 1334 - static int __init apic_intr_mode_select(void) 1325 + static int __init __apic_intr_mode_select(void) 1335 1326 { 1336 1327 /* Check kernel option */ 1337 1328 if (disable_apic) { ··· 1393 1384 return APIC_SYMMETRIC_IO; 1394 1385 } 1395 1386 1387 + /* Select the interrupt delivery mode for the BSP */ 1388 + void __init apic_intr_mode_select(void) 1389 + { 1390 + apic_intr_mode = __apic_intr_mode_select(); 1391 + } 1392 + 1396 1393 /* 1397 1394 * An initial setup of the virtual wire mode. 1398 1395 */ ··· 1454 1439 void __init apic_intr_mode_init(void) 1455 1440 { 1456 1441 bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); 1457 - 1458 - apic_intr_mode = apic_intr_mode_select(); 1459 1442 1460 1443 switch (apic_intr_mode) { 1461 1444 case APIC_PIC: ··· 2639 2626 #endif 2640 2627 2641 2628 local_irq_save(flags); 2629 + 2630 + /* 2631 + * Mask IOAPIC before disabling the local APIC to prevent stale IRR 2632 + * entries on some implementations. 2633 + */ 2634 + mask_ioapic_entries(); 2635 + 2642 2636 disable_local_APIC(); 2643 2637 2644 2638 irq_remapping_disable();
+125 -3
arch/x86/kernel/apic/msi.c
··· 23 23 24 24 static struct irq_domain *msi_default_domain; 25 25 26 - static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) 26 + static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) 27 27 { 28 - struct irq_cfg *cfg = irqd_cfg(data); 29 - 30 28 msg->address_hi = MSI_ADDR_BASE_HI; 31 29 32 30 if (x2apic_enabled()) ··· 45 47 MSI_DATA_VECTOR(cfg->vector); 46 48 } 47 49 50 + static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) 51 + { 52 + __irq_msi_compose_msg(irqd_cfg(data), msg); 53 + } 54 + 55 + static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) 56 + { 57 + struct msi_msg msg[2] = { [1] = { }, }; 58 + 59 + __irq_msi_compose_msg(cfg, msg); 60 + irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); 61 + } 62 + 63 + static int 64 + msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) 65 + { 66 + struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd); 67 + struct irq_data *parent = irqd->parent_data; 68 + unsigned int cpu; 69 + int ret; 70 + 71 + /* Save the current configuration */ 72 + cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd)); 73 + old_cfg = *cfg; 74 + 75 + /* Allocate a new target vector */ 76 + ret = parent->chip->irq_set_affinity(parent, mask, force); 77 + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) 78 + return ret; 79 + 80 + /* 81 + * For non-maskable and non-remapped MSI interrupts the migration 82 + * to a different destination CPU and a different vector has to be 83 + * done careful to handle the possible stray interrupt which can be 84 + * caused by the non-atomic update of the address/data pair. 85 + * 86 + * Direct update is possible when: 87 + * - The MSI is maskable (remapped MSI does not use this code path)). 88 + * The quirk bit is not set in this case. 
89 + * - The new vector is the same as the old vector 90 + * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) 91 + * - The new destination CPU is the same as the old destination CPU 92 + */ 93 + if (!irqd_msi_nomask_quirk(irqd) || 94 + cfg->vector == old_cfg.vector || 95 + old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || 96 + cfg->dest_apicid == old_cfg.dest_apicid) { 97 + irq_msi_update_msg(irqd, cfg); 98 + return ret; 99 + } 100 + 101 + /* 102 + * Paranoia: Validate that the interrupt target is the local 103 + * CPU. 104 + */ 105 + if (WARN_ON_ONCE(cpu != smp_processor_id())) { 106 + irq_msi_update_msg(irqd, cfg); 107 + return ret; 108 + } 109 + 110 + /* 111 + * Redirect the interrupt to the new vector on the current CPU 112 + * first. This might cause a spurious interrupt on this vector if 113 + * the device raises an interrupt right between this update and the 114 + * update to the final destination CPU. 115 + * 116 + * If the vector is in use then the installed device handler will 117 + * denote it as spurious which is no harm as this is a rare event 118 + * and interrupt handlers have to cope with spurious interrupts 119 + * anyway. If the vector is unused, then it is marked so it won't 120 + * trigger the 'No irq handler for vector' warning in do_IRQ(). 121 + * 122 + * This requires to hold vector lock to prevent concurrent updates to 123 + * the affected vector. 124 + */ 125 + lock_vector_lock(); 126 + 127 + /* 128 + * Mark the new target vector on the local CPU if it is currently 129 + * unused. Reuse the VECTOR_RETRIGGERED state which is also used in 130 + * the CPU hotplug path for a similar purpose. This cannot be 131 + * undone here as the current CPU has interrupts disabled and 132 + * cannot handle the interrupt before the whole set_affinity() 133 + * section is done. In the CPU unplug case, the current CPU is 134 + * about to vanish and will not handle any interrupts anymore. 
The 135 + * vector is cleaned up when the CPU comes online again. 136 + */ 137 + if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector]))) 138 + this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED); 139 + 140 + /* Redirect it to the new vector on the local CPU temporarily */ 141 + old_cfg.vector = cfg->vector; 142 + irq_msi_update_msg(irqd, &old_cfg); 143 + 144 + /* Now transition it to the target CPU */ 145 + irq_msi_update_msg(irqd, cfg); 146 + 147 + /* 148 + * All interrupts after this point are now targeted at the new 149 + * vector/CPU. 150 + * 151 + * Drop vector lock before testing whether the temporary assignment 152 + * to the local CPU was hit by an interrupt raised in the device, 153 + * because the retrigger function acquires vector lock again. 154 + */ 155 + unlock_vector_lock(); 156 + 157 + /* 158 + * Check whether the transition raced with a device interrupt and 159 + * is pending in the local APICs IRR. It is safe to do this outside 160 + * of vector lock as the irq_desc::lock of this interrupt is still 161 + * held and interrupts are disabled: The check is not accessing the 162 + * underlying vector store. It's just checking the local APIC's 163 + * IRR. 164 + */ 165 + if (lapic_vector_set_in_irr(cfg->vector)) 166 + irq_data_get_irq_chip(irqd)->irq_retrigger(irqd); 167 + 168 + return ret; 169 + } 170 + 48 171 /* 49 172 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 50 173 * which implement the MSI or MSI-X Capability Structure. ··· 177 58 .irq_ack = irq_chip_ack_parent, 178 59 .irq_retrigger = irq_chip_retrigger_hierarchy, 179 60 .irq_compose_msi_msg = irq_msi_compose_msg, 61 + .irq_set_affinity = msi_set_affinity, 180 62 .flags = IRQCHIP_SKIP_SET_WAKE, 181 63 }; 182 64 ··· 266 146 } 267 147 if (!msi_default_domain) 268 148 pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); 149 + else 150 + msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; 269 151 } 270 152 271 153 #ifdef CONFIG_IRQ_REMAP
+11 -3
arch/x86/kernel/time.c
··· 91 91 92 92 static __init void x86_late_time_init(void) 93 93 { 94 - x86_init.timers.timer_init(); 95 94 /* 96 - * After PIT/HPET timers init, select and setup 97 - * the final interrupt mode for delivering IRQs. 95 + * Before PIT/HPET init, select the interrupt mode. This is required 96 + * to make the decision whether PIT should be initialized correct. 97 + */ 98 + x86_init.irqs.intr_mode_select(); 99 + 100 + /* Setup the legacy timers */ 101 + x86_init.timers.timer_init(); 102 + 103 + /* 104 + * After PIT/HPET timers init, set up the final interrupt mode for 105 + * delivering IRQs. 98 106 */ 99 107 x86_init.irqs.intr_mode_init(); 100 108 tsc_init();
+1
arch/x86/kernel/x86_init.c
··· 80 80 .pre_vector_init = init_ISA_irqs, 81 81 .intr_init = native_init_IRQ, 82 82 .trap_init = x86_init_noop, 83 + .intr_mode_select = apic_intr_mode_select, 83 84 .intr_mode_init = apic_intr_mode_init 84 85 }, 85 86
+1
arch/x86/xen/enlighten_pv.c
··· 1205 1205 x86_platform.get_nmi_reason = xen_get_nmi_reason; 1206 1206 1207 1207 x86_init.resources.memory_setup = xen_memory_setup; 1208 + x86_init.irqs.intr_mode_select = x86_init_noop; 1208 1209 x86_init.irqs.intr_mode_init = x86_init_noop; 1209 1210 x86_init.oem.arch_setup = xen_arch_setup; 1210 1211 x86_init.oem.banner = xen_banner;
+18
include/linux/irq.h
··· 209 209 * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target 210 210 * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set 211 211 * IRQD_CAN_RESERVE - Can use reservation mode 212 + * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change 213 + * required 212 214 */ 213 215 enum { 214 216 IRQD_TRIGGER_MASK = 0xf, ··· 233 231 IRQD_SINGLE_TARGET = (1 << 24), 234 232 IRQD_DEFAULT_TRIGGER_SET = (1 << 25), 235 233 IRQD_CAN_RESERVE = (1 << 26), 234 + IRQD_MSI_NOMASK_QUIRK = (1 << 27), 236 235 }; 237 236 238 237 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) ··· 391 388 static inline bool irqd_can_reserve(struct irq_data *d) 392 389 { 393 390 return __irqd_to_state(d) & IRQD_CAN_RESERVE; 391 + } 392 + 393 + static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) 394 + { 395 + __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; 396 + } 397 + 398 + static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) 399 + { 400 + __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; 401 + } 402 + 403 + static inline bool irqd_msi_nomask_quirk(struct irq_data *d) 404 + { 405 + return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; 394 406 } 395 407 396 408 #undef __irqd_to_state
+7
include/linux/irqdomain.h
··· 207 207 IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), 208 208 209 209 /* 210 + * Quirk to handle MSI implementations which do not provide 211 + * masking. Currently known to affect x86, but partially 212 + * handled in core code. 213 + */ 214 + IRQ_DOMAIN_MSI_NOMASK_QUIRK = (1 << 6), 215 + 216 + /* 210 217 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved 211 218 * for implementation specific purposes and ignored by the 212 219 * core code.
+1
kernel/irq/debugfs.c
··· 114 114 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), 115 115 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), 116 116 BIT_MASK_DESCR(IRQD_CAN_RESERVE), 117 + BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), 117 118 118 119 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), 119 120
+4 -1
kernel/irq/msi.c
··· 453 453 continue; 454 454 455 455 irq_data = irq_domain_get_irq_data(domain, desc->irq); 456 - if (!can_reserve) 456 + if (!can_reserve) { 457 457 irqd_clr_can_reserve(irq_data); 458 + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) 459 + irqd_set_msi_nomask_quirk(irq_data); 460 + } 458 461 ret = irq_domain_activate_irq(irq_data, can_reserve); 459 462 if (ret) 460 463 goto cleanup;