Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Six kernel side fixes: three related to NMI handling on AMD systems, a
race fix, a kexec initialization fix and a PEBS sampling fix"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/core: Fix perf_event_disable_inatomic() race
x86/perf/amd: Remove need to check "running" bit in NMI handler
x86/perf/amd: Resolve NMI latency issues for active PMCs
x86/perf/amd: Resolve race condition when disabling PMC
perf/x86/intel: Initialize TFA MSR
perf/x86/intel: Fix handling of wakeup_events for multi-entry PEBS

+190 -27
+135 -5
arch/x86/events/amd/core.c
··· 3 3 #include <linux/types.h> 4 4 #include <linux/init.h> 5 5 #include <linux/slab.h> 6 + #include <linux/delay.h> 6 7 #include <asm/apicdef.h> 8 + #include <asm/nmi.h> 7 9 8 10 #include "../perf_event.h" 11 + 12 + static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); 9 13 10 14 static __initconst const u64 amd_hw_cache_event_ids 11 15 [PERF_COUNT_HW_CACHE_MAX] ··· 433 429 } 434 430 } 435 431 432 + /* 433 + * When a PMC counter overflows, an NMI is used to process the event and 434 + * reset the counter. NMI latency can result in the counter being updated 435 + * before the NMI can run, which can result in what appear to be spurious 436 + * NMIs. This function is intended to wait for the NMI to run and reset 437 + * the counter to avoid possible unhandled NMI messages. 438 + */ 439 + #define OVERFLOW_WAIT_COUNT 50 440 + 441 + static void amd_pmu_wait_on_overflow(int idx) 442 + { 443 + unsigned int i; 444 + u64 counter; 445 + 446 + /* 447 + * Wait for the counter to be reset if it has overflowed. This loop 448 + * should exit very, very quickly, but just in case, don't wait 449 + * forever... 450 + */ 451 + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { 452 + rdmsrl(x86_pmu_event_addr(idx), counter); 453 + if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) 454 + break; 455 + 456 + /* Might be in IRQ context, so can't sleep */ 457 + udelay(1); 458 + } 459 + } 460 + 461 + static void amd_pmu_disable_all(void) 462 + { 463 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 464 + int idx; 465 + 466 + x86_pmu_disable_all(); 467 + 468 + /* 469 + * This shouldn't be called from NMI context, but add a safeguard here 470 + * to return, since if we're in NMI context we can't wait for an NMI 471 + * to reset an overflowed counter value. 472 + */ 473 + if (in_nmi()) 474 + return; 475 + 476 + /* 477 + * Check each counter for overflow and wait for it to be reset by the 478 + * NMI if it has overflowed. 
This relies on the fact that all active 479 + * counters are always enabled when this function is called and 480 + * ARCH_PERFMON_EVENTSEL_INT is always set. 481 + */ 482 + for (idx = 0; idx < x86_pmu.num_counters; idx++) { 483 + if (!test_bit(idx, cpuc->active_mask)) 484 + continue; 485 + 486 + amd_pmu_wait_on_overflow(idx); 487 + } 488 + } 489 + 490 + static void amd_pmu_disable_event(struct perf_event *event) 491 + { 492 + x86_pmu_disable_event(event); 493 + 494 + /* 495 + * This can be called from NMI context (via x86_pmu_stop). The counter 496 + * may have overflowed, but either way, we'll never see it get reset 497 + * by the NMI if we're already in the NMI. And the NMI latency support 498 + * below will take care of any pending NMI that might have been 499 + * generated by the overflow. 500 + */ 501 + if (in_nmi()) 502 + return; 503 + 504 + amd_pmu_wait_on_overflow(event->hw.idx); 505 + } 506 + 507 + /* 508 + * Because of NMI latency, if multiple PMC counters are active or other sources 509 + * of NMIs are received, the perf NMI handler can handle one or more overflowed 510 + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI 511 + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel 512 + * back-to-back NMI support won't be active. This PMC handler needs to take into 513 + * account that this can occur, otherwise this could result in unknown NMI 514 + * messages being issued. Examples of this are PMC overflow while in the NMI 515 + * handler when multiple PMCs are active or PMC overflow while handling some 516 + * other source of an NMI. 517 + * 518 + * Attempt to mitigate this by using the number of active PMCs to determine 519 + * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset 520 + * any PMCs. The per-CPU perf_nmi_counter variable is set to the minimum of the 521 + * number of active PMCs and 2. 
The value of 2 is used in case an NMI does not 522 + * arrive at the LAPIC in time to be collapsed into an already pending NMI. 523 + */ 524 + static int amd_pmu_handle_irq(struct pt_regs *regs) 525 + { 526 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 527 + int active, handled; 528 + 529 + /* 530 + * Obtain the active count before calling x86_pmu_handle_irq() since 531 + * it is possible that x86_pmu_handle_irq() may make a counter 532 + * inactive (through x86_pmu_stop). 533 + */ 534 + active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); 535 + 536 + /* Process any counter overflows */ 537 + handled = x86_pmu_handle_irq(regs); 538 + 539 + /* 540 + * If a counter was handled, record the number of possible remaining 541 + * NMIs that can occur. 542 + */ 543 + if (handled) { 544 + this_cpu_write(perf_nmi_counter, 545 + min_t(unsigned int, 2, active)); 546 + 547 + return handled; 548 + } 549 + 550 + if (!this_cpu_read(perf_nmi_counter)) 551 + return NMI_DONE; 552 + 553 + this_cpu_dec(perf_nmi_counter); 554 + 555 + return NMI_HANDLED; 556 + } 557 + 436 558 static struct event_constraint * 437 559 amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, 438 560 struct perf_event *event) ··· 751 621 752 622 static __initconst const struct x86_pmu amd_pmu = { 753 623 .name = "AMD", 754 - .handle_irq = x86_pmu_handle_irq, 755 - .disable_all = x86_pmu_disable_all, 624 + .handle_irq = amd_pmu_handle_irq, 625 + .disable_all = amd_pmu_disable_all, 756 626 .enable_all = x86_pmu_enable_all, 757 627 .enable = x86_pmu_enable_event, 758 - .disable = x86_pmu_disable_event, 628 + .disable = amd_pmu_disable_event, 759 629 .hw_config = amd_pmu_hw_config, 760 630 .schedule_events = x86_schedule_events, 761 631 .eventsel = MSR_K7_EVNTSEL0, ··· 862 732 cpuc->perf_ctr_virt_mask = 0; 863 733 864 734 /* Reload all events */ 865 - x86_pmu_disable_all(); 735 + amd_pmu_disable_all(); 866 736 x86_pmu_enable_all(0); 867 737 } 868 738 
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); ··· 880 750 cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; 881 751 882 752 /* Reload all events */ 883 - x86_pmu_disable_all(); 753 + amd_pmu_disable_all(); 884 754 x86_pmu_enable_all(0); 885 755 } 886 756 EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
+3 -10
arch/x86/events/core.c
··· 1349 1349 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1350 1350 struct hw_perf_event *hwc = &event->hw; 1351 1351 1352 - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { 1352 + if (test_bit(hwc->idx, cpuc->active_mask)) { 1353 1353 x86_pmu.disable(event); 1354 + __clear_bit(hwc->idx, cpuc->active_mask); 1354 1355 cpuc->events[hwc->idx] = NULL; 1355 1356 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 1356 1357 hwc->state |= PERF_HES_STOPPED; ··· 1448 1447 apic_write(APIC_LVTPC, APIC_DM_NMI); 1449 1448 1450 1449 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1451 - if (!test_bit(idx, cpuc->active_mask)) { 1452 - /* 1453 - * Though we deactivated the counter some cpus 1454 - * might still deliver spurious interrupts still 1455 - * in flight. Catch them: 1456 - */ 1457 - if (__test_and_clear_bit(idx, cpuc->running)) 1458 - handled++; 1450 + if (!test_bit(idx, cpuc->active_mask)) 1459 1451 continue; 1460 - } 1461 1452 1462 1453 event = cpuc->events[idx]; 1463 1454
+7 -1
arch/x86/events/intel/core.c
··· 3185 3185 return ret; 3186 3186 3187 3187 if (event->attr.precise_ip) { 3188 - if (!event->attr.freq) { 3188 + if (!(event->attr.freq || event->attr.wakeup_events)) { 3189 3189 event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; 3190 3190 if (!(event->attr.sample_type & 3191 3191 ~intel_pmu_large_pebs_flags(event))) ··· 3574 3574 intel_pmu_lbr_reset(); 3575 3575 3576 3576 cpuc->lbr_sel = NULL; 3577 + 3578 + if (x86_pmu.flags & PMU_FL_TFA) { 3579 + WARN_ON_ONCE(cpuc->tfa_shadow); 3580 + cpuc->tfa_shadow = ~0ULL; 3581 + intel_set_tfa(cpuc, false); 3582 + } 3577 3583 3578 3584 if (x86_pmu.version > 1) 3579 3585 flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+43 -9
kernel/events/core.c
··· 2009 2009 event->pmu->del(event, 0); 2010 2010 event->oncpu = -1; 2011 2011 2012 - if (event->pending_disable) { 2013 - event->pending_disable = 0; 2012 + if (READ_ONCE(event->pending_disable) >= 0) { 2013 + WRITE_ONCE(event->pending_disable, -1); 2014 2014 state = PERF_EVENT_STATE_OFF; 2015 2015 } 2016 2016 perf_event_set_state(event, state); ··· 2198 2198 2199 2199 void perf_event_disable_inatomic(struct perf_event *event) 2200 2200 { 2201 - event->pending_disable = 1; 2201 + WRITE_ONCE(event->pending_disable, smp_processor_id()); 2202 + /* can fail, see perf_pending_event_disable() */ 2202 2203 irq_work_queue(&event->pending); 2203 2204 } 2204 2205 ··· 5811 5810 } 5812 5811 } 5813 5812 5813 + static void perf_pending_event_disable(struct perf_event *event) 5814 + { 5815 + int cpu = READ_ONCE(event->pending_disable); 5816 + 5817 + if (cpu < 0) 5818 + return; 5819 + 5820 + if (cpu == smp_processor_id()) { 5821 + WRITE_ONCE(event->pending_disable, -1); 5822 + perf_event_disable_local(event); 5823 + return; 5824 + } 5825 + 5826 + /* 5827 + * CPU-A CPU-B 5828 + * 5829 + * perf_event_disable_inatomic() 5830 + * @pending_disable = CPU-A; 5831 + * irq_work_queue(); 5832 + * 5833 + * sched-out 5834 + * @pending_disable = -1; 5835 + * 5836 + * sched-in 5837 + * perf_event_disable_inatomic() 5838 + * @pending_disable = CPU-B; 5839 + * irq_work_queue(); // FAILS 5840 + * 5841 + * irq_work_run() 5842 + * perf_pending_event() 5843 + * 5844 + * But the event runs on CPU-B and wants disabling there. 5845 + */ 5846 + irq_work_queue_on(&event->pending, cpu); 5847 + } 5848 + 5814 5849 static void perf_pending_event(struct irq_work *entry) 5815 5850 { 5816 - struct perf_event *event = container_of(entry, 5817 - struct perf_event, pending); 5851 + struct perf_event *event = container_of(entry, struct perf_event, pending); 5818 5852 int rctx; 5819 5853 5820 5854 rctx = perf_swevent_get_recursion_context(); ··· 5858 5822 * and we won't recurse 'further'. 
5859 5823 */ 5860 5824 5861 - if (event->pending_disable) { 5862 - event->pending_disable = 0; 5863 - perf_event_disable_local(event); 5864 - } 5825 + perf_pending_event_disable(event); 5865 5826 5866 5827 if (event->pending_wakeup) { 5867 5828 event->pending_wakeup = 0; ··· 10269 10236 10270 10237 10271 10238 init_waitqueue_head(&event->waitq); 10239 + event->pending_disable = -1; 10272 10240 init_irq_work(&event->pending, perf_pending_event); 10273 10241 10274 10242 mutex_init(&event->mmap_mutex);
+2 -2
kernel/events/ring_buffer.c
··· 392 392 * store that will be enabled on successful return 393 393 */ 394 394 if (!handle->size) { /* A, matches D */ 395 - event->pending_disable = 1; 395 + event->pending_disable = smp_processor_id(); 396 396 perf_output_wakeup(handle); 397 397 local_set(&rb->aux_nest, 0); 398 398 goto err_put; ··· 480 480 481 481 if (wakeup) { 482 482 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 483 - handle->event->pending_disable = 1; 483 + handle->event->pending_disable = smp_processor_id(); 484 484 perf_output_wakeup(handle); 485 485 } 486 486