Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'nmi.2023.02.14a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu

Pull x86 NMI diagnostics from Paul McKenney:
"Add diagnostics to the x86 NMI handler to help detect NMI-handler bugs
on the one hand and failing hardware on the other"

* tag 'nmi.2023.02.14a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu:
x86/nmi: Print reasons why backtrace NMIs are ignored
x86/nmi: Accumulate NMI-progress evidence in exc_nmi()

+128 -1
+107 -1
arch/x86/kernel/nmi.c
··· 69 69 unsigned int unknown; 70 70 unsigned int external; 71 71 unsigned int swallow; 72 + unsigned long recv_jiffies; 73 + unsigned long idt_seq; 74 + unsigned long idt_nmi_seq; 75 + unsigned long idt_ignored; 76 + atomic_long_t idt_calls; 77 + unsigned long idt_seq_snap; 78 + unsigned long idt_nmi_seq_snap; 79 + unsigned long idt_ignored_snap; 80 + long idt_calls_snap; 72 81 }; 73 82 74 83 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); ··· 488 479 DEFINE_IDTENTRY_RAW(exc_nmi) 489 480 { 490 481 irqentry_state_t irq_state; 482 + struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats); 491 483 492 484 /* 493 485 * Re-enable NMIs right here when running as an SEV-ES guest. This might 494 486 * cause nested NMIs, but those can be handled safely. 495 487 */ 496 488 sev_es_nmi_complete(); 489 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) 490 + arch_atomic_long_inc(&nsp->idt_calls); 497 491 498 492 if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) 499 493 return; ··· 507 495 } 508 496 this_cpu_write(nmi_state, NMI_EXECUTING); 509 497 this_cpu_write(nmi_cr2, read_cr2()); 498 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { 499 + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); 500 + WARN_ON_ONCE(!(nsp->idt_seq & 0x1)); 501 + WRITE_ONCE(nsp->recv_jiffies, jiffies); 502 + } 510 503 nmi_restart: 511 504 512 505 /* ··· 526 509 527 510 inc_irq_stat(__nmi_count); 528 511 529 - if (!ignore_nmis) 512 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) { 513 + WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1); 514 + } else if (!ignore_nmis) { 515 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { 516 + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); 517 + WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1)); 518 + } 530 519 default_do_nmi(regs); 520 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { 521 + WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1); 522 + WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1); 523 + } 524 + } 531 525 532 526 irqentry_nmi_exit(regs, irq_state); 533 527 ··· 553 525 554 526 if 
(user_mode(regs)) 555 527 mds_user_clear_cpu_buffers(); 528 + if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) { 529 + WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1); 530 + WARN_ON_ONCE(nsp->idt_seq & 0x1); 531 + WRITE_ONCE(nsp->recv_jiffies, jiffies); 532 + } 556 533 } 557 534 558 535 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) ··· 568 535 #endif 569 536 #if IS_MODULE(CONFIG_KVM_INTEL) 570 537 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); 538 + #endif 539 + 540 + #ifdef CONFIG_NMI_CHECK_CPU 541 + 542 + static char *nmi_check_stall_msg[] = { 543 + /* */ 544 + /* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ 545 + /* | +------ cpu_is_offline(cpu) */ 546 + /* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ 547 + /* | | | NMI handler has been invoked. */ 548 + /* | | | */ 549 + /* V V V */ 550 + /* 0 0 0 */ "NMIs are not reaching exc_nmi() handler", 551 + /* 0 0 1 */ "exc_nmi() handler is ignoring NMIs", 552 + /* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler", 553 + /* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs", 554 + /* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler", 555 + /* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs", 556 + /* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler", 557 + /* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs", 558 + }; 559 + 560 + void nmi_backtrace_stall_snap(const struct cpumask *btp) 561 + { 562 + int cpu; 563 + struct nmi_stats *nsp; 564 + 565 + for_each_cpu(cpu, btp) { 566 + nsp = per_cpu_ptr(&nmi_stats, cpu); 567 + nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq); 568 + nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq); 569 + nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored); 570 + nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls); 571 + } 572 + } 573 + 574 + void nmi_backtrace_stall_check(const struct 
cpumask *btp) 575 + { 576 + int cpu; 577 + int idx; 578 + unsigned long nmi_seq; 579 + unsigned long j = jiffies; 580 + char *modp; 581 + char *msgp; 582 + char *msghp; 583 + struct nmi_stats *nsp; 584 + 585 + for_each_cpu(cpu, btp) { 586 + nsp = per_cpu_ptr(&nmi_stats, cpu); 587 + modp = ""; 588 + msghp = ""; 589 + nmi_seq = READ_ONCE(nsp->idt_nmi_seq); 590 + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { 591 + msgp = "CPU entered NMI handler function, but has not exited"; 592 + } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { 593 + msgp = "CPU is handling NMIs"; 594 + } else { 595 + idx = ((nsp->idt_seq_snap & 0x1) << 2) | 596 + (cpu_is_offline(cpu) << 1) | 597 + (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); 598 + msgp = nmi_check_stall_msg[idx]; 599 + if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) 600 + modp = ", but OK because ignore_nmis was set"; 601 + if (nmi_seq & ~0x1) 602 + msghp = " (CPU currently in NMI handler function)"; 603 + else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) 604 + msghp = " (CPU exited one NMI handler function)"; 605 + } 606 + pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", 607 + __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); 608 + } 609 + } 610 + 571 611 #endif 572 612 573 613 void stop_nmi(void)
+8
include/linux/nmi.h
··· 214 214 #include <asm/nmi.h> 215 215 #endif 216 216 217 + #ifdef CONFIG_NMI_CHECK_CPU 218 + void nmi_backtrace_stall_snap(const struct cpumask *btp); 219 + void nmi_backtrace_stall_check(const struct cpumask *btp); 220 + #else 221 + static inline void nmi_backtrace_stall_snap(const struct cpumask *btp) {} 222 + static inline void nmi_backtrace_stall_check(const struct cpumask *btp) {} 223 + #endif 224 + 217 225 #endif
+11
lib/Kconfig.debug
··· 1562 1562 depends on TRACE_IRQFLAGS 1563 1563 depends on TRACE_IRQFLAGS_NMI_SUPPORT 1564 1564 1565 + config NMI_CHECK_CPU 1566 + bool "Debugging for CPUs failing to respond to backtrace requests" 1567 + depends on DEBUG_KERNEL 1568 + depends on X86 1569 + default n 1570 + help 1571 + Enables debug prints when a CPU fails to respond to a given 1572 + backtrace NMI. These prints provide some reasons why a CPU 1573 + might legitimately be failing to respond, for example, if it 1574 + is offline or if ignore_nmis is set. 1575 + 1565 1576 config DEBUG_IRQFLAGS 1566 1577 bool "Debug IRQ flag manipulation" 1567 1578 help
+2
lib/nmi_backtrace.c
··· 64 64 if (!cpumask_empty(to_cpumask(backtrace_mask))) { 65 65 pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", 66 66 this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); 67 + nmi_backtrace_stall_snap(to_cpumask(backtrace_mask)); 67 68 raise(to_cpumask(backtrace_mask)); 68 69 } 69 70 ··· 75 74 mdelay(1); 76 75 touch_softlockup_watchdog(); 77 76 } 77 + nmi_backtrace_stall_check(to_cpumask(backtrace_mask)); 78 78 79 79 /* 80 80 * Force flush any remote buffers that might be stuck in IRQ context