Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf_urgent_for_v5.12-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

- Make sure PMU internal buffers are flushed for per-CPU events too and
properly handle PID/TID for large PEBS.

- Properly handle the case where there is no PMU by returning an
empty list of perf MSRs for VMX to switch, instead of reading random
garbage from the stack.

* tag 'perf_urgent_for_v5.12-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/perf: Use RET0 as default for guest_get_msrs to handle "no PMU" case
perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
perf/core: Flush PMU internal buffers for per-CPU events

+51 -15
+6 -9
arch/x86/events/core.c
··· 81 81 DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); 82 82 DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); 83 83 84 - DEFINE_STATIC_CALL_NULL(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); 84 + /* 85 + * This one is magic, it will get called even when PMU init fails (because 86 + * there is no PMU), in which case it should simply return NULL. 87 + */ 88 + DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); 85 89 86 90 u64 __read_mostly hw_cache_event_ids 87 91 [PERF_COUNT_HW_CACHE_MAX] ··· 1948 1944 x86_perf_event_update(event); 1949 1945 } 1950 1946 1951 - static inline struct perf_guest_switch_msr * 1952 - perf_guest_get_msrs_nop(int *nr) 1953 - { 1954 - *nr = 0; 1955 - return NULL; 1956 - } 1957 - 1958 1947 static int __init init_hw_perf_events(void) 1959 1948 { 1960 1949 struct x86_pmu_quirk *quirk; ··· 2022 2025 x86_pmu.read = _x86_pmu_read; 2023 2026 2024 2027 if (!x86_pmu.guest_get_msrs) 2025 - x86_pmu.guest_get_msrs = perf_guest_get_msrs_nop; 2028 + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; 2026 2029 2027 2030 x86_pmu_static_call_update(); 2028 2031
+4 -1
arch/x86/events/intel/core.c
··· 3662 3662 if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { 3663 3663 event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; 3664 3664 if (!(event->attr.sample_type & 3665 - ~intel_pmu_large_pebs_flags(event))) 3665 + ~intel_pmu_large_pebs_flags(event))) { 3666 3666 event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; 3667 + event->attach_state |= PERF_ATTACH_SCHED_CB; 3668 + } 3667 3669 } 3668 3670 if (x86_pmu.pebs_aliases) 3669 3671 x86_pmu.pebs_aliases(event); ··· 3678 3676 ret = intel_pmu_setup_lbr_filter(event); 3679 3677 if (ret) 3680 3678 return ret; 3679 + event->attach_state |= PERF_ATTACH_SCHED_CB; 3681 3680 3682 3681 /* 3683 3682 * BTS is set up earlier in this path, so don't account twice
+1 -1
arch/x86/kvm/vmx/vmx.c
··· 6580 6580 int i, nr_msrs; 6581 6581 struct perf_guest_switch_msr *msrs; 6582 6582 6583 + /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ 6583 6584 msrs = perf_guest_get_msrs(&nr_msrs); 6584 - 6585 6585 if (!msrs) 6586 6586 return; 6587 6587
+2
include/linux/perf_event.h
··· 606 606 #define PERF_ATTACH_TASK 0x04 607 607 #define PERF_ATTACH_TASK_DATA 0x08 608 608 #define PERF_ATTACH_ITRACE 0x10 609 + #define PERF_ATTACH_SCHED_CB 0x20 609 610 610 611 struct perf_cgroup; 611 612 struct perf_buffer; ··· 873 872 struct list_head cgrp_cpuctx_entry; 874 873 #endif 875 874 875 + struct list_head sched_cb_entry; 876 876 int sched_cb_usage; 877 877 878 878 int online;
+38 -4
kernel/events/core.c
··· 386 386 static atomic_t perf_sched_count; 387 387 388 388 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 389 + static DEFINE_PER_CPU(int, perf_sched_cb_usages); 389 390 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); 390 391 391 392 static atomic_t nr_mmap_events __read_mostly; ··· 3462 3461 } 3463 3462 } 3464 3463 3464 + static DEFINE_PER_CPU(struct list_head, sched_cb_list); 3465 + 3465 3466 void perf_sched_cb_dec(struct pmu *pmu) 3466 3467 { 3467 3468 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 3468 3469 3469 - --cpuctx->sched_cb_usage; 3470 + this_cpu_dec(perf_sched_cb_usages); 3471 + 3472 + if (!--cpuctx->sched_cb_usage) 3473 + list_del(&cpuctx->sched_cb_entry); 3470 3474 } 3471 3475 3472 3476 ··· 3479 3473 { 3480 3474 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 3481 3475 3482 - cpuctx->sched_cb_usage++; 3476 + if (!cpuctx->sched_cb_usage++) 3477 + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); 3478 + 3479 + this_cpu_inc(perf_sched_cb_usages); 3483 3480 } 3484 3481 3485 3482 /* ··· 3511 3502 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3512 3503 } 3513 3504 3505 + static void perf_pmu_sched_task(struct task_struct *prev, 3506 + struct task_struct *next, 3507 + bool sched_in) 3508 + { 3509 + struct perf_cpu_context *cpuctx; 3510 + 3511 + if (prev == next) 3512 + return; 3513 + 3514 + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { 3515 + /* will be handled in perf_event_context_sched_in/out */ 3516 + if (cpuctx->task_ctx) 3517 + continue; 3518 + 3519 + __perf_pmu_sched_task(cpuctx, sched_in); 3520 + } 3521 + } 3522 + 3514 3523 static void perf_event_switch(struct task_struct *task, 3515 3524 struct task_struct *next_prev, bool sched_in); 3516 3525 ··· 3550 3523 struct task_struct *next) 3551 3524 { 3552 3525 int ctxn; 3526 + 3527 + if (__this_cpu_read(perf_sched_cb_usages)) 3528 + perf_pmu_sched_task(task, next, false); 3553 3529 3554 3530 if 
(atomic_read(&nr_switch_events)) 3555 3531 perf_event_switch(task, next, false); ··· 3862 3832 3863 3833 if (atomic_read(&nr_switch_events)) 3864 3834 perf_event_switch(task, prev, true); 3835 + 3836 + if (__this_cpu_read(perf_sched_cb_usages)) 3837 + perf_pmu_sched_task(prev, task, true); 3865 3838 } 3866 3839 3867 3840 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) ··· 4689 4656 if (event->parent) 4690 4657 return; 4691 4658 4692 - if (event->attach_state & PERF_ATTACH_TASK) 4659 + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) 4693 4660 dec = true; 4694 4661 if (event->attr.mmap || event->attr.mmap_data) 4695 4662 atomic_dec(&nr_mmap_events); ··· 11208 11175 if (event->parent) 11209 11176 return; 11210 11177 11211 - if (event->attach_state & PERF_ATTACH_TASK) 11178 + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) 11212 11179 inc = true; 11213 11180 if (event->attr.mmap || event->attr.mmap_data) 11214 11181 atomic_inc(&nr_mmap_events); ··· 13005 12972 #ifdef CONFIG_CGROUP_PERF 13006 12973 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); 13007 12974 #endif 12975 + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); 13008 12976 } 13009 12977 } 13010 12978