Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

- Implement per-PMU context rescheduling to significantly improve
single-PMU performance, and related cleanups/fixes (Peter Zijlstra
and Namhyung Kim)

- Fix ancient bug resulting in a lot of events being dropped
erroneously at higher sampling frequencies (Luo Gengkun)

- uprobes enhancements:

- Implement RCU-protected hot path optimizations for better
performance:

"For baseline vs SRCU, peak throughput increased from 3.7 M/s
(million uprobe triggerings per second) up to about 8 M/s. For
uretprobes it's a bit more modest with bump from 2.4 M/s to
5 M/s.

For SRCU vs RCU Tasks Trace, peak throughput for uprobes
increases further from 8 M/s to 10.3 M/s (+28%!), and for
uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more
work to do on uretprobes side.

Even single-thread (no contention) performance is slightly
better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055
M/s to 2.174 M/s (+5.8%) for uretprobes."

(Andrii Nakryiko et al)

- Document mmap_lock, don't abuse get_user_pages_remote() (Oleg
Nesterov)

- Cleanups & fixes to prepare for future work:
- Remove uprobe_register_refctr()
- Simplify error handling for alloc_uprobe()
- Make uprobe_register() return struct uprobe *
- Fold __uprobe_unregister() into uprobe_unregister()
- Shift put_uprobe() from delete_uprobe() to uprobe_unregister()
- BPF: Fix use-after-free in bpf_uprobe_multi_link_attach()
(Oleg Nesterov)

- New feature & ABI extension: allow events to use PERF_SAMPLE_READ
with inheritance, enabling sample-based profiling of a group of
counters over a hierarchy of processes or threads (Ben Gainey)

- Intel uncore & power events updates:

- Add Arrow Lake and Lunar Lake support
- Add PERF_EV_CAP_READ_SCOPE
- Clean up and enhance cpumask and hotplug support
(Kan Liang)

- Add LNL uncore iMC freerunning support
- Use D0:F0 as a default device
(Zhenyu Wang)

- Intel PT: fix AUX snapshot handling race (Adrian Hunter)

- Misc fixes and cleanups (James Clark, Jiri Olsa, Oleg Nesterov and
Peter Zijlstra)

* tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
dmaengine: idxd: Clean up cpumask and hotplug for perfmon
iommu/vt-d: Clean up cpumask and hotplug for perfmon
perf/x86/intel/cstate: Clean up cpumask and hotplug
perf: Add PERF_EV_CAP_READ_SCOPE
perf: Generic hotplug support for a PMU with a scope
uprobes: perform lockless SRCU-protected uprobes_tree lookup
rbtree: provide rb_find_rcu() / rb_find_add_rcu()
perf/uprobe: split uprobe_unregister()
uprobes: travers uprobe's consumer list locklessly under SRCU protection
uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks
uprobes: protected uprobe lifetime with SRCU
uprobes: revamp uprobe refcounting and lifetime management
bpf: Fix use-after-free in bpf_uprobe_multi_link_attach()
perf/core: Fix small negative period being ignored
perf: Really fix event_function_call() locking
perf: Optimize __pmu_ctx_sched_out()
perf: Add context time freeze
perf: Fix event_function_call() locking
perf: Extract a few helpers
perf: Optimize context reschedule for single PMU cases
...

+1146 -857
+63
arch/x86/events/core.c
··· 41 41 #include <asm/desc.h> 42 42 #include <asm/ldt.h> 43 43 #include <asm/unwind.h> 44 + #include <asm/uprobes.h> 45 + #include <asm/ibt.h> 44 46 45 47 #include "perf_event.h" 46 48 ··· 2818 2816 return get_desc_base(desc); 2819 2817 } 2820 2818 2819 + #ifdef CONFIG_UPROBES 2820 + /* 2821 + * Heuristic-based check if uprobe is installed at the function entry. 2822 + * 2823 + * Under assumption of user code being compiled with frame pointers, 2824 + * `push %rbp/%ebp` is a good indicator that we indeed are. 2825 + * 2826 + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. 2827 + * If we get this wrong, captured stack trace might have one extra bogus 2828 + * entry, but the rest of stack trace will still be meaningful. 2829 + */ 2830 + static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2831 + { 2832 + struct arch_uprobe *auprobe; 2833 + 2834 + if (!current->utask) 2835 + return false; 2836 + 2837 + auprobe = current->utask->auprobe; 2838 + if (!auprobe) 2839 + return false; 2840 + 2841 + /* push %rbp/%ebp */ 2842 + if (auprobe->insn[0] == 0x55) 2843 + return true; 2844 + 2845 + /* endbr64 (64-bit only) */ 2846 + if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) 2847 + return true; 2848 + 2849 + return false; 2850 + } 2851 + 2852 + #else 2853 + static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2854 + { 2855 + return false; 2856 + } 2857 + #endif /* CONFIG_UPROBES */ 2858 + 2821 2859 #ifdef CONFIG_IA32_EMULATION 2822 2860 2823 2861 #include <linux/compat.h> ··· 2869 2827 unsigned long ss_base, cs_base; 2870 2828 struct stack_frame_ia32 frame; 2871 2829 const struct stack_frame_ia32 __user *fp; 2830 + u32 ret_addr; 2872 2831 2873 2832 if (user_64bit_mode(regs)) 2874 2833 return 0; ··· 2879 2836 2880 2837 fp = compat_ptr(ss_base + regs->bp); 2881 2838 pagefault_disable(); 2839 + 2840 + /* see perf_callchain_user() below for why we do this */ 2841 + if (is_uprobe_at_func_entry(regs) && 2842 + !get_user(ret_addr, (const 
u32 __user *)regs->sp)) 2843 + perf_callchain_store(entry, ret_addr); 2844 + 2882 2845 while (entry->nr < entry->max_stack) { 2883 2846 if (!valid_user_frame(fp, sizeof(frame))) 2884 2847 break; ··· 2913 2864 { 2914 2865 struct stack_frame frame; 2915 2866 const struct stack_frame __user *fp; 2867 + unsigned long ret_addr; 2916 2868 2917 2869 if (perf_guest_state()) { 2918 2870 /* TODO: We don't support guest os callchain now */ ··· 2937 2887 return; 2938 2888 2939 2889 pagefault_disable(); 2890 + 2891 + /* 2892 + * If we are called from uprobe handler, and we are indeed at the very 2893 + * entry to user function (which is normally a `push %rbp` instruction, 2894 + * under assumption of application being compiled with frame pointers), 2895 + * we should read return address from *regs->sp before proceeding 2896 + * to follow frame pointers, otherwise we'll skip immediate caller 2897 + * as %rbp is not yet setup. 2898 + */ 2899 + if (is_uprobe_at_func_entry(regs) && 2900 + !get_user(ret_addr, (const unsigned long __user *)regs->sp)) 2901 + perf_callchain_store(entry, ret_addr); 2902 + 2940 2903 while (entry->nr < entry->max_stack) { 2941 2904 if (!valid_user_frame(fp, sizeof(frame))) 2942 2905 break;
-3
arch/x86/events/intel/bts.c
··· 557 557 * disabled, so disallow intel_bts driver for unprivileged 558 558 * users on paranoid systems since it provides trace data 559 559 * to the user in a zero-copy fashion. 560 - * 561 - * Note that the default paranoia setting permits unprivileged 562 - * users to profile the kernel. 563 560 */ 564 561 if (event->attr.exclude_kernel) { 565 562 ret = perf_allow_kernel(&event->attr);
+5 -137
arch/x86/events/intel/cstate.c
··· 128 128 static struct device_attribute format_attr_##_var = \ 129 129 __ATTR(_name, 0444, __cstate_##_var##_show, NULL) 130 130 131 - static ssize_t cstate_get_attr_cpumask(struct device *dev, 132 - struct device_attribute *attr, 133 - char *buf); 134 - 135 131 /* Model -> events mapping */ 136 132 struct cstate_model { 137 133 unsigned long core_events; ··· 202 206 .attrs = cstate_format_attrs, 203 207 }; 204 208 205 - static cpumask_t cstate_core_cpu_mask; 206 - static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); 207 - 208 - static struct attribute *cstate_cpumask_attrs[] = { 209 - &dev_attr_cpumask.attr, 210 - NULL, 211 - }; 212 - 213 - static struct attribute_group cpumask_attr_group = { 214 - .attrs = cstate_cpumask_attrs, 215 - }; 216 - 217 209 static const struct attribute_group *cstate_attr_groups[] = { 218 210 &cstate_events_attr_group, 219 211 &cstate_format_attr_group, 220 - &cpumask_attr_group, 221 212 NULL, 222 213 }; 223 214 ··· 252 269 [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, 253 270 }; 254 271 255 - static cpumask_t cstate_pkg_cpu_mask; 256 - 257 272 /* cstate_module PMU */ 258 273 static struct pmu cstate_module_pmu; 259 274 static bool has_cstate_module; ··· 272 291 [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr }, 273 292 }; 274 293 275 - static cpumask_t cstate_module_cpu_mask; 276 - 277 - static ssize_t cstate_get_attr_cpumask(struct device *dev, 278 - struct device_attribute *attr, 279 - char *buf) 280 - { 281 - struct pmu *pmu = dev_get_drvdata(dev); 282 - 283 - if (pmu == &cstate_core_pmu) 284 - return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); 285 - else if (pmu == &cstate_pkg_pmu) 286 - return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); 287 - else if (pmu == &cstate_module_pmu) 288 - return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask); 289 - else 290 - return 0; 291 - } 292 - 293 294 
static int cstate_pmu_event_init(struct perf_event *event) 294 295 { 295 296 u64 cfg = event->attr.config; 296 - int cpu; 297 297 298 298 if (event->attr.type != event->pmu->type) 299 299 return -ENOENT; ··· 293 331 if (!(core_msr_mask & (1 << cfg))) 294 332 return -EINVAL; 295 333 event->hw.event_base = core_msr[cfg].msr; 296 - cpu = cpumask_any_and(&cstate_core_cpu_mask, 297 - topology_sibling_cpumask(event->cpu)); 298 334 } else if (event->pmu == &cstate_pkg_pmu) { 299 335 if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) 300 336 return -EINVAL; 301 337 cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); 302 338 if (!(pkg_msr_mask & (1 << cfg))) 303 339 return -EINVAL; 304 - 305 - event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; 306 - 307 340 event->hw.event_base = pkg_msr[cfg].msr; 308 - cpu = cpumask_any_and(&cstate_pkg_cpu_mask, 309 - topology_die_cpumask(event->cpu)); 310 341 } else if (event->pmu == &cstate_module_pmu) { 311 342 if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX) 312 343 return -EINVAL; ··· 307 352 if (!(module_msr_mask & (1 << cfg))) 308 353 return -EINVAL; 309 354 event->hw.event_base = module_msr[cfg].msr; 310 - cpu = cpumask_any_and(&cstate_module_cpu_mask, 311 - topology_cluster_cpumask(event->cpu)); 312 355 } else { 313 356 return -ENOENT; 314 357 } 315 358 316 - if (cpu >= nr_cpu_ids) 317 - return -ENODEV; 318 - 319 - event->cpu = cpu; 320 359 event->hw.config = cfg; 321 360 event->hw.idx = -1; 322 361 return 0; ··· 361 412 return 0; 362 413 } 363 414 364 - /* 365 - * Check if exiting cpu is the designated reader. 
If so migrate the 366 - * events when there is a valid target available 367 - */ 368 - static int cstate_cpu_exit(unsigned int cpu) 369 - { 370 - unsigned int target; 371 - 372 - if (has_cstate_core && 373 - cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { 374 - 375 - target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); 376 - /* Migrate events if there is a valid target */ 377 - if (target < nr_cpu_ids) { 378 - cpumask_set_cpu(target, &cstate_core_cpu_mask); 379 - perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); 380 - } 381 - } 382 - 383 - if (has_cstate_pkg && 384 - cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { 385 - 386 - target = cpumask_any_but(topology_die_cpumask(cpu), cpu); 387 - /* Migrate events if there is a valid target */ 388 - if (target < nr_cpu_ids) { 389 - cpumask_set_cpu(target, &cstate_pkg_cpu_mask); 390 - perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); 391 - } 392 - } 393 - 394 - if (has_cstate_module && 395 - cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) { 396 - 397 - target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu); 398 - /* Migrate events if there is a valid target */ 399 - if (target < nr_cpu_ids) { 400 - cpumask_set_cpu(target, &cstate_module_cpu_mask); 401 - perf_pmu_migrate_context(&cstate_module_pmu, cpu, target); 402 - } 403 - } 404 - return 0; 405 - } 406 - 407 - static int cstate_cpu_init(unsigned int cpu) 408 - { 409 - unsigned int target; 410 - 411 - /* 412 - * If this is the first online thread of that core, set it in 413 - * the core cpu mask as the designated reader. 414 - */ 415 - target = cpumask_any_and(&cstate_core_cpu_mask, 416 - topology_sibling_cpumask(cpu)); 417 - 418 - if (has_cstate_core && target >= nr_cpu_ids) 419 - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); 420 - 421 - /* 422 - * If this is the first online thread of that package, set it 423 - * in the package cpu mask as the designated reader. 
424 - */ 425 - target = cpumask_any_and(&cstate_pkg_cpu_mask, 426 - topology_die_cpumask(cpu)); 427 - if (has_cstate_pkg && target >= nr_cpu_ids) 428 - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); 429 - 430 - /* 431 - * If this is the first online thread of that cluster, set it 432 - * in the cluster cpu mask as the designated reader. 433 - */ 434 - target = cpumask_any_and(&cstate_module_cpu_mask, 435 - topology_cluster_cpumask(cpu)); 436 - if (has_cstate_module && target >= nr_cpu_ids) 437 - cpumask_set_cpu(cpu, &cstate_module_cpu_mask); 438 - 439 - return 0; 440 - } 441 - 442 415 static const struct attribute_group *core_attr_update[] = { 443 416 &group_cstate_core_c1, 444 417 &group_cstate_core_c3, ··· 397 526 .stop = cstate_pmu_event_stop, 398 527 .read = cstate_pmu_event_update, 399 528 .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, 529 + .scope = PERF_PMU_SCOPE_CORE, 400 530 .module = THIS_MODULE, 401 531 }; 402 532 ··· 413 541 .stop = cstate_pmu_event_stop, 414 542 .read = cstate_pmu_event_update, 415 543 .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, 544 + .scope = PERF_PMU_SCOPE_PKG, 416 545 .module = THIS_MODULE, 417 546 }; 418 547 ··· 429 556 .stop = cstate_pmu_event_stop, 430 557 .read = cstate_pmu_event_update, 431 558 .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, 559 + .scope = PERF_PMU_SCOPE_CLUSTER, 432 560 .module = THIS_MODULE, 433 561 }; 434 562 ··· 684 810 685 811 static inline void cstate_cleanup(void) 686 812 { 687 - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); 688 - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); 689 - 690 813 if (has_cstate_core) 691 814 perf_pmu_unregister(&cstate_core_pmu); 692 815 ··· 698 827 { 699 828 int err; 700 829 701 - cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, 702 - "perf/x86/cstate:starting", cstate_cpu_init, NULL); 703 - cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, 704 - "perf/x86/cstate:online", 
NULL, cstate_cpu_exit); 705 - 706 830 if (has_cstate_core) { 707 831 err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); 708 832 if (err) { ··· 710 844 711 845 if (has_cstate_pkg) { 712 846 if (topology_max_dies_per_package() > 1) { 847 + /* CLX-AP is multi-die and the cstate is die-scope */ 848 + cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE; 713 849 err = perf_pmu_register(&cstate_pkg_pmu, 714 850 "cstate_die", -1); 715 851 } else {
+14 -15
arch/x86/events/intel/pt.c
··· 416 416 static void pt_config_start(struct perf_event *event) 417 417 { 418 418 struct pt *pt = this_cpu_ptr(&pt_ctx); 419 - u64 ctl = event->hw.config; 419 + u64 ctl = event->hw.aux_config; 420 420 421 421 ctl |= RTIT_CTL_TRACEEN; 422 422 if (READ_ONCE(pt->vmx_on)) ··· 424 424 else 425 425 wrmsrl(MSR_IA32_RTIT_CTL, ctl); 426 426 427 - WRITE_ONCE(event->hw.config, ctl); 427 + WRITE_ONCE(event->hw.aux_config, ctl); 428 428 } 429 429 430 430 /* Address ranges and their corresponding msr configuration registers */ ··· 503 503 u64 reg; 504 504 505 505 /* First round: clear STATUS, in particular the PSB byte counter. */ 506 - if (!event->hw.config) { 506 + if (!event->hw.aux_config) { 507 507 perf_event_itrace_started(event); 508 508 wrmsrl(MSR_IA32_RTIT_STATUS, 0); 509 509 } ··· 533 533 534 534 reg |= (event->attr.config & PT_CONFIG_MASK); 535 535 536 - event->hw.config = reg; 536 + event->hw.aux_config = reg; 537 537 pt_config_start(event); 538 538 } 539 539 540 540 static void pt_config_stop(struct perf_event *event) 541 541 { 542 542 struct pt *pt = this_cpu_ptr(&pt_ctx); 543 - u64 ctl = READ_ONCE(event->hw.config); 543 + u64 ctl = READ_ONCE(event->hw.aux_config); 544 544 545 545 /* may be already stopped by a PMI */ 546 546 if (!(ctl & RTIT_CTL_TRACEEN)) ··· 550 550 if (!READ_ONCE(pt->vmx_on)) 551 551 wrmsrl(MSR_IA32_RTIT_CTL, ctl); 552 552 553 - WRITE_ONCE(event->hw.config, ctl); 553 + WRITE_ONCE(event->hw.aux_config, ctl); 554 554 555 555 /* 556 556 * A wrmsr that disables trace generation serializes other PT ··· 1557 1557 1558 1558 /* Turn PTs back on */ 1559 1559 if (!on && event) 1560 - wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config); 1560 + wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config); 1561 1561 1562 1562 local_irq_restore(flags); 1563 1563 } ··· 1606 1606 * see comment in intel_pt_interrupt(). 
1607 1607 */ 1608 1608 WRITE_ONCE(pt->handle_nmi, 0); 1609 + barrier(); 1609 1610 1610 1611 pt_config_stop(event); 1611 1612 ··· 1658 1657 return 0; 1659 1658 1660 1659 /* 1661 - * Here, handle_nmi tells us if the tracing is on 1660 + * There is no PT interrupt in this mode, so stop the trace and it will 1661 + * remain stopped while the buffer is copied. 1662 1662 */ 1663 - if (READ_ONCE(pt->handle_nmi)) 1664 - pt_config_stop(event); 1665 - 1663 + pt_config_stop(event); 1666 1664 pt_read_offset(buf); 1667 1665 pt_update_head(pt); 1668 1666 ··· 1673 1673 ret = perf_output_copy_aux(&pt->handle, handle, from, to); 1674 1674 1675 1675 /* 1676 - * If the tracing was on when we turned up, restart it. 1677 - * Compiler barrier not needed as we couldn't have been 1678 - * preempted by anything that touches pt->handle_nmi. 1676 + * Here, handle_nmi tells us if the tracing was on. 1677 + * If the tracing was on, restart it. 1679 1678 */ 1680 - if (pt->handle_nmi) 1679 + if (READ_ONCE(pt->handle_nmi)) 1681 1680 pt_config_start(event); 1682 1681 1683 1682 return ret;
+9
arch/x86/events/intel/uncore.c
··· 1816 1816 .mmio_init = adl_uncore_mmio_init, 1817 1817 }; 1818 1818 1819 + static const struct intel_uncore_init_fun lnl_uncore_init __initconst = { 1820 + .cpu_init = lnl_uncore_cpu_init, 1821 + .mmio_init = lnl_uncore_mmio_init, 1822 + }; 1823 + 1819 1824 static const struct intel_uncore_init_fun icx_uncore_init __initconst = { 1820 1825 .cpu_init = icx_uncore_cpu_init, 1821 1826 .pci_init = icx_uncore_pci_init, ··· 1898 1893 X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init), 1899 1894 X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init), 1900 1895 X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init), 1896 + X86_MATCH_VFM(INTEL_ARROWLAKE, &mtl_uncore_init), 1897 + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init), 1898 + X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init), 1899 + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init), 1901 1900 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init), 1902 1901 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init), 1903 1902 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
+2
arch/x86/events/intel/uncore.h
··· 611 611 void icl_uncore_cpu_init(void); 612 612 void tgl_uncore_cpu_init(void); 613 613 void adl_uncore_cpu_init(void); 614 + void lnl_uncore_cpu_init(void); 614 615 void mtl_uncore_cpu_init(void); 615 616 void tgl_uncore_mmio_init(void); 616 617 void tgl_l_uncore_mmio_init(void); 617 618 void adl_uncore_mmio_init(void); 619 + void lnl_uncore_mmio_init(void); 618 620 int snb_pci2phy_map_init(int devid); 619 621 620 622 /* uncore_snbep.c */
+168 -17
arch/x86/events/intel/uncore_snb.c
··· 252 252 DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); 253 253 DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); 254 254 DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29"); 255 + DEFINE_UNCORE_FORMAT_ATTR(threshold2, threshold, "config:24-31"); 255 256 256 257 /* Sandy Bridge uncore support */ 257 258 static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) ··· 745 744 { 746 745 mtl_uncore_cbox.num_boxes = icl_get_cbox_num(); 747 746 uncore_msr_uncores = mtl_msr_uncores; 747 + } 748 + 749 + static struct intel_uncore_type *lnl_msr_uncores[] = { 750 + &mtl_uncore_cbox, 751 + &mtl_uncore_arb, 752 + NULL 753 + }; 754 + 755 + #define LNL_UNC_MSR_GLOBAL_CTL 0x240e 756 + 757 + static void lnl_uncore_msr_init_box(struct intel_uncore_box *box) 758 + { 759 + if (box->pmu->pmu_idx == 0) 760 + wrmsrl(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); 761 + } 762 + 763 + static struct intel_uncore_ops lnl_uncore_msr_ops = { 764 + .init_box = lnl_uncore_msr_init_box, 765 + .disable_event = snb_uncore_msr_disable_event, 766 + .enable_event = snb_uncore_msr_enable_event, 767 + .read_counter = uncore_msr_read_counter, 768 + }; 769 + 770 + void lnl_uncore_cpu_init(void) 771 + { 772 + mtl_uncore_cbox.num_boxes = 4; 773 + mtl_uncore_cbox.ops = &lnl_uncore_msr_ops; 774 + uncore_msr_uncores = lnl_msr_uncores; 748 775 } 749 776 750 777 enum { ··· 1504 1475 ids++; 1505 1476 } 1506 1477 1478 + /* Just try to grab 00:00.0 device */ 1479 + if (!mc_dev) 1480 + mc_dev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0)); 1481 + 1507 1482 return mc_dev; 1508 1483 } 1509 1484 1510 1485 #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 1511 1486 #define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 1512 1487 1513 - static void __uncore_imc_init_box(struct intel_uncore_box *box, 1514 - unsigned int base_offset) 1488 + static void 1489 + uncore_get_box_mmio_addr(struct intel_uncore_box *box, 1490 + unsigned int base_offset, 1491 + int 
bar_offset, int step) 1515 1492 { 1516 1493 struct pci_dev *pdev = tgl_uncore_get_mc_dev(); 1517 1494 struct intel_uncore_pmu *pmu = box->pmu; 1518 1495 struct intel_uncore_type *type = pmu->type; 1519 1496 resource_size_t addr; 1520 - u32 mch_bar; 1497 + u32 bar; 1521 1498 1522 1499 if (!pdev) { 1523 1500 pr_warn("perf uncore: Cannot find matched IMC device.\n"); 1524 1501 return; 1525 1502 } 1526 1503 1527 - pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar); 1528 - /* MCHBAR is disabled */ 1529 - if (!(mch_bar & BIT(0))) { 1530 - pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n"); 1504 + pci_read_config_dword(pdev, bar_offset, &bar); 1505 + if (!(bar & BIT(0))) { 1506 + pr_warn("perf uncore: BAR 0x%x is disabled. Failed to map %s counters.\n", 1507 + bar_offset, type->name); 1531 1508 pci_dev_put(pdev); 1532 1509 return; 1533 1510 } 1534 - mch_bar &= ~BIT(0); 1535 - addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx); 1511 + bar &= ~BIT(0); 1512 + addr = (resource_size_t)(bar + step * pmu->pmu_idx); 1536 1513 1537 1514 #ifdef CONFIG_PHYS_ADDR_T_64BIT 1538 - pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar); 1539 - addr |= ((resource_size_t)mch_bar << 32); 1515 + pci_read_config_dword(pdev, bar_offset + 4, &bar); 1516 + addr |= ((resource_size_t)bar << 32); 1540 1517 #endif 1541 1518 1542 1519 addr += base_offset; ··· 1551 1516 pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); 1552 1517 1553 1518 pci_dev_put(pdev); 1519 + } 1520 + 1521 + static void __uncore_imc_init_box(struct intel_uncore_box *box, 1522 + unsigned int base_offset) 1523 + { 1524 + uncore_get_box_mmio_addr(box, base_offset, 1525 + SNB_UNCORE_PCI_IMC_BAR_OFFSET, 1526 + TGL_UNCORE_MMIO_IMC_MEM_OFFSET); 1554 1527 } 1555 1528 1556 1529 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) ··· 1655 1612 writel(0, box->io_addr + uncore_mmio_box_ctl(box)); 
1656 1613 } 1657 1614 1615 + #define MMIO_UNCORE_COMMON_OPS() \ 1616 + .exit_box = uncore_mmio_exit_box, \ 1617 + .disable_box = adl_uncore_mmio_disable_box, \ 1618 + .enable_box = adl_uncore_mmio_enable_box, \ 1619 + .disable_event = intel_generic_uncore_mmio_disable_event, \ 1620 + .enable_event = intel_generic_uncore_mmio_enable_event, \ 1621 + .read_counter = uncore_mmio_read_counter, 1622 + 1658 1623 static struct intel_uncore_ops adl_uncore_mmio_ops = { 1659 1624 .init_box = adl_uncore_imc_init_box, 1660 - .exit_box = uncore_mmio_exit_box, 1661 - .disable_box = adl_uncore_mmio_disable_box, 1662 - .enable_box = adl_uncore_mmio_enable_box, 1663 - .disable_event = intel_generic_uncore_mmio_disable_event, 1664 - .enable_event = intel_generic_uncore_mmio_enable_event, 1665 - .read_counter = uncore_mmio_read_counter, 1625 + MMIO_UNCORE_COMMON_OPS() 1666 1626 }; 1667 1627 1668 1628 #define ADL_UNC_CTL_CHMASK_MASK 0x00000f00 ··· 1749 1703 } 1750 1704 1751 1705 /* end of Alder Lake MMIO uncore support */ 1706 + 1707 + /* Lunar Lake MMIO uncore support */ 1708 + #define LNL_UNCORE_PCI_SAFBAR_OFFSET 0x68 1709 + #define LNL_UNCORE_MAP_SIZE 0x1000 1710 + #define LNL_UNCORE_SNCU_BASE 0xE4B000 1711 + #define LNL_UNCORE_SNCU_CTR 0x390 1712 + #define LNL_UNCORE_SNCU_CTRL 0x398 1713 + #define LNL_UNCORE_SNCU_BOX_CTL 0x380 1714 + #define LNL_UNCORE_GLOBAL_CTL 0x700 1715 + #define LNL_UNCORE_HBO_BASE 0xE54000 1716 + #define LNL_UNCORE_HBO_OFFSET -4096 1717 + #define LNL_UNCORE_HBO_CTR 0x570 1718 + #define LNL_UNCORE_HBO_CTRL 0x550 1719 + #define LNL_UNCORE_HBO_BOX_CTL 0x548 1720 + 1721 + #define LNL_UNC_CTL_THRESHOLD 0xff000000 1722 + #define LNL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ 1723 + SNB_UNC_CTL_UMASK_MASK | \ 1724 + SNB_UNC_CTL_EDGE_DET | \ 1725 + SNB_UNC_CTL_INVERT | \ 1726 + LNL_UNC_CTL_THRESHOLD) 1727 + 1728 + static struct attribute *lnl_uncore_formats_attr[] = { 1729 + &format_attr_event.attr, 1730 + &format_attr_umask.attr, 1731 + &format_attr_edge.attr, 
1732 + &format_attr_inv.attr, 1733 + &format_attr_threshold2.attr, 1734 + NULL 1735 + }; 1736 + 1737 + static const struct attribute_group lnl_uncore_format_group = { 1738 + .name = "format", 1739 + .attrs = lnl_uncore_formats_attr, 1740 + }; 1741 + 1742 + static void lnl_uncore_hbo_init_box(struct intel_uncore_box *box) 1743 + { 1744 + uncore_get_box_mmio_addr(box, LNL_UNCORE_HBO_BASE, 1745 + LNL_UNCORE_PCI_SAFBAR_OFFSET, 1746 + LNL_UNCORE_HBO_OFFSET); 1747 + } 1748 + 1749 + static struct intel_uncore_ops lnl_uncore_hbo_ops = { 1750 + .init_box = lnl_uncore_hbo_init_box, 1751 + MMIO_UNCORE_COMMON_OPS() 1752 + }; 1753 + 1754 + static struct intel_uncore_type lnl_uncore_hbo = { 1755 + .name = "hbo", 1756 + .num_counters = 4, 1757 + .num_boxes = 2, 1758 + .perf_ctr_bits = 64, 1759 + .perf_ctr = LNL_UNCORE_HBO_CTR, 1760 + .event_ctl = LNL_UNCORE_HBO_CTRL, 1761 + .event_mask = LNL_UNC_RAW_EVENT_MASK, 1762 + .box_ctl = LNL_UNCORE_HBO_BOX_CTL, 1763 + .mmio_map_size = LNL_UNCORE_MAP_SIZE, 1764 + .ops = &lnl_uncore_hbo_ops, 1765 + .format_group = &lnl_uncore_format_group, 1766 + }; 1767 + 1768 + static void lnl_uncore_sncu_init_box(struct intel_uncore_box *box) 1769 + { 1770 + uncore_get_box_mmio_addr(box, LNL_UNCORE_SNCU_BASE, 1771 + LNL_UNCORE_PCI_SAFBAR_OFFSET, 1772 + 0); 1773 + 1774 + if (box->io_addr) 1775 + writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + LNL_UNCORE_GLOBAL_CTL); 1776 + } 1777 + 1778 + static struct intel_uncore_ops lnl_uncore_sncu_ops = { 1779 + .init_box = lnl_uncore_sncu_init_box, 1780 + MMIO_UNCORE_COMMON_OPS() 1781 + }; 1782 + 1783 + static struct intel_uncore_type lnl_uncore_sncu = { 1784 + .name = "sncu", 1785 + .num_counters = 2, 1786 + .num_boxes = 1, 1787 + .perf_ctr_bits = 64, 1788 + .perf_ctr = LNL_UNCORE_SNCU_CTR, 1789 + .event_ctl = LNL_UNCORE_SNCU_CTRL, 1790 + .event_mask = LNL_UNC_RAW_EVENT_MASK, 1791 + .box_ctl = LNL_UNCORE_SNCU_BOX_CTL, 1792 + .mmio_map_size = LNL_UNCORE_MAP_SIZE, 1793 + .ops = &lnl_uncore_sncu_ops, 1794 + .format_group 
= &lnl_uncore_format_group, 1795 + }; 1796 + 1797 + static struct intel_uncore_type *lnl_mmio_uncores[] = { 1798 + &adl_uncore_imc, 1799 + &lnl_uncore_hbo, 1800 + &lnl_uncore_sncu, 1801 + &adl_uncore_imc_free_running, 1802 + NULL 1803 + }; 1804 + 1805 + void lnl_uncore_mmio_init(void) 1806 + { 1807 + uncore_mmio_uncores = lnl_mmio_uncores; 1808 + } 1809 + 1810 + /* end of Lunar Lake MMIO uncore support */
-7
drivers/dma/idxd/idxd.h
··· 124 124 125 125 struct pmu pmu; 126 126 char name[IDXD_NAME_SIZE]; 127 - int cpu; 128 127 129 128 int n_counters; 130 129 int counter_width; ··· 134 135 135 136 unsigned long supported_filters; 136 137 int n_filters; 137 - 138 - struct hlist_node cpuhp_node; 139 138 }; 140 139 141 140 #define IDXD_MAX_PRIORITY 0xf ··· 800 803 int perfmon_pmu_init(struct idxd_device *idxd); 801 804 void perfmon_pmu_remove(struct idxd_device *idxd); 802 805 void perfmon_counter_overflow(struct idxd_device *idxd); 803 - void perfmon_init(void); 804 - void perfmon_exit(void); 805 806 #else 806 807 static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } 807 808 static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} 808 809 static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} 809 - static inline void perfmon_init(void) {} 810 - static inline void perfmon_exit(void) {} 811 810 #endif 812 811 813 812 /* debugfs */
-3
drivers/dma/idxd/init.c
··· 878 878 else 879 879 support_enqcmd = true; 880 880 881 - perfmon_init(); 882 - 883 881 err = idxd_driver_register(&idxd_drv); 884 882 if (err < 0) 885 883 goto err_idxd_driver_register; ··· 926 928 idxd_driver_unregister(&idxd_drv); 927 929 pci_unregister_driver(&idxd_pci_driver); 928 930 idxd_cdev_remove(); 929 - perfmon_exit(); 930 931 idxd_remove_debugfs(); 931 932 } 932 933 module_exit(idxd_exit_module);
+1 -97
drivers/dma/idxd/perfmon.c
··· 6 6 #include "idxd.h" 7 7 #include "perfmon.h" 8 8 9 - static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, 10 - char *buf); 11 - 12 - static cpumask_t perfmon_dsa_cpu_mask; 13 - static bool cpuhp_set_up; 14 - static enum cpuhp_state cpuhp_slot; 15 - 16 - /* 17 - * perf userspace reads this attribute to determine which cpus to open 18 - * counters on. It's connected to perfmon_dsa_cpu_mask, which is 19 - * maintained by the cpu hotplug handlers. 20 - */ 21 - static DEVICE_ATTR_RO(cpumask); 22 - 23 - static struct attribute *perfmon_cpumask_attrs[] = { 24 - &dev_attr_cpumask.attr, 25 - NULL, 26 - }; 27 - 28 - static struct attribute_group cpumask_attr_group = { 29 - .attrs = perfmon_cpumask_attrs, 30 - }; 31 - 32 9 /* 33 10 * These attributes specify the bits in the config word that the perf 34 11 * syscall uses to pass the event ids and categories to perfmon. ··· 44 67 45 68 static const struct attribute_group *perfmon_attr_groups[] = { 46 69 &perfmon_format_attr_group, 47 - &cpumask_attr_group, 48 70 NULL, 49 71 }; 50 - 51 - static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, 52 - char *buf) 53 - { 54 - return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); 55 - } 56 72 57 73 static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) 58 74 { ··· 187 217 return -EINVAL; 188 218 189 219 event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); 190 - event->cpu = idxd->idxd_pmu->cpu; 191 220 event->hw.config = event->attr.config; 192 221 193 222 if (event->group_leader != event) ··· 457 488 idxd_pmu->pmu.stop = perfmon_pmu_event_stop; 458 489 idxd_pmu->pmu.read = perfmon_pmu_event_update; 459 490 idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; 491 + idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; 460 492 idxd_pmu->pmu.module = THIS_MODULE; 461 493 } 462 494 ··· 466 496 if (!idxd->idxd_pmu) 467 497 return; 468 498 469 - cpuhp_state_remove_instance(cpuhp_slot, 
&idxd->idxd_pmu->cpuhp_node); 470 499 perf_pmu_unregister(&idxd->idxd_pmu->pmu); 471 500 kfree(idxd->idxd_pmu); 472 501 idxd->idxd_pmu = NULL; 473 - } 474 - 475 - static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) 476 - { 477 - struct idxd_pmu *idxd_pmu; 478 - 479 - idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); 480 - 481 - /* select the first online CPU as the designated reader */ 482 - if (cpumask_empty(&perfmon_dsa_cpu_mask)) { 483 - cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); 484 - idxd_pmu->cpu = cpu; 485 - } 486 - 487 - return 0; 488 - } 489 - 490 - static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) 491 - { 492 - struct idxd_pmu *idxd_pmu; 493 - unsigned int target; 494 - 495 - idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); 496 - 497 - if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) 498 - return 0; 499 - 500 - target = cpumask_any_but(cpu_online_mask, cpu); 501 - /* migrate events if there is a valid target */ 502 - if (target < nr_cpu_ids) { 503 - cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); 504 - perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); 505 - } 506 - 507 - return 0; 508 502 } 509 503 510 504 int perfmon_pmu_init(struct idxd_device *idxd) ··· 476 542 union idxd_perfcap perfcap; 477 543 struct idxd_pmu *idxd_pmu; 478 544 int rc = -ENODEV; 479 - 480 - /* 481 - * perfmon module initialization failed, nothing to do 482 - */ 483 - if (!cpuhp_set_up) 484 - return -ENODEV; 485 545 486 546 /* 487 547 * If perfmon_offset or num_counters is 0, it means perfmon is ··· 552 624 if (rc) 553 625 goto free; 554 626 555 - rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); 556 - if (rc) { 557 - perf_pmu_unregister(&idxd->idxd_pmu->pmu); 558 - goto free; 559 - } 560 627 out: 561 628 return rc; 562 629 free: ··· 559 636 idxd->idxd_pmu = NULL; 560 637 561 638 goto out; 562 - } 563 - 564 - void __init perfmon_init(void) 565 - { 566 - int rc = 
cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, 567 - "driver/dma/idxd/perf:online", 568 - perf_event_cpu_online, 569 - perf_event_cpu_offline); 570 - if (WARN_ON(rc < 0)) 571 - return; 572 - 573 - cpuhp_slot = rc; 574 - cpuhp_set_up = true; 575 - } 576 - 577 - void __exit perfmon_exit(void) 578 - { 579 - if (cpuhp_set_up) 580 - cpuhp_remove_multi_state(cpuhp_slot); 581 639 }
-2
drivers/iommu/intel/iommu.h
··· 700 700 DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); 701 701 struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; 702 702 unsigned char irq_name[16]; 703 - struct hlist_node cpuhp_node; 704 - int cpu; 705 703 }; 706 704 707 705 #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
+2 -109
drivers/iommu/intel/perfmon.c
··· 34 34 .attrs = attrs_empty, 35 35 }; 36 36 37 - static cpumask_t iommu_pmu_cpu_mask; 38 - 39 - static ssize_t 40 - cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) 41 - { 42 - return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); 43 - } 44 - static DEVICE_ATTR_RO(cpumask); 45 - 46 - static struct attribute *iommu_pmu_cpumask_attrs[] = { 47 - &dev_attr_cpumask.attr, 48 - NULL 49 - }; 50 - 51 - static struct attribute_group iommu_pmu_cpumask_attr_group = { 52 - .attrs = iommu_pmu_cpumask_attrs, 53 - }; 54 - 55 37 static const struct attribute_group *iommu_pmu_attr_groups[] = { 56 38 &iommu_pmu_format_attr_group, 57 39 &iommu_pmu_events_attr_group, 58 - &iommu_pmu_cpumask_attr_group, 59 40 NULL 60 41 }; 61 42 ··· 546 565 iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; 547 566 iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; 548 567 iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; 568 + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; 549 569 iommu_pmu->pmu.module = THIS_MODULE; 550 570 551 571 return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); ··· 755 773 iommu->perf_irq = 0; 756 774 } 757 775 758 - static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) 759 - { 760 - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); 761 - 762 - if (cpumask_empty(&iommu_pmu_cpu_mask)) 763 - cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); 764 - 765 - if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) 766 - iommu_pmu->cpu = cpu; 767 - 768 - return 0; 769 - } 770 - 771 - static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) 772 - { 773 - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); 774 - int target = cpumask_first(&iommu_pmu_cpu_mask); 775 - 776 - /* 777 - * The iommu_pmu_cpu_mask has been updated when offline the CPU 778 - * for the first iommu_pmu. Migrate the other iommu_pmu to the 779 - * new target. 
780 - */ 781 - if (target < nr_cpu_ids && target != iommu_pmu->cpu) { 782 - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); 783 - iommu_pmu->cpu = target; 784 - return 0; 785 - } 786 - 787 - if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) 788 - return 0; 789 - 790 - target = cpumask_any_but(cpu_online_mask, cpu); 791 - 792 - if (target < nr_cpu_ids) 793 - cpumask_set_cpu(target, &iommu_pmu_cpu_mask); 794 - else 795 - return 0; 796 - 797 - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); 798 - iommu_pmu->cpu = target; 799 - 800 - return 0; 801 - } 802 - 803 - static int nr_iommu_pmu; 804 - static enum cpuhp_state iommu_cpuhp_slot; 805 - 806 - static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) 807 - { 808 - int ret; 809 - 810 - if (!nr_iommu_pmu) { 811 - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, 812 - "driver/iommu/intel/perfmon:online", 813 - iommu_pmu_cpu_online, 814 - iommu_pmu_cpu_offline); 815 - if (ret < 0) 816 - return ret; 817 - iommu_cpuhp_slot = ret; 818 - } 819 - 820 - ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); 821 - if (ret) { 822 - if (!nr_iommu_pmu) 823 - cpuhp_remove_multi_state(iommu_cpuhp_slot); 824 - return ret; 825 - } 826 - nr_iommu_pmu++; 827 - 828 - return 0; 829 - } 830 - 831 - static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) 832 - { 833 - cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); 834 - 835 - if (--nr_iommu_pmu) 836 - return; 837 - 838 - cpuhp_remove_multi_state(iommu_cpuhp_slot); 839 - } 840 - 841 776 void iommu_pmu_register(struct intel_iommu *iommu) 842 777 { 843 778 struct iommu_pmu *iommu_pmu = iommu->pmu; ··· 765 866 if (__iommu_pmu_register(iommu)) 766 867 goto err; 767 868 768 - if (iommu_pmu_cpuhp_setup(iommu_pmu)) 769 - goto unregister; 770 - 771 869 /* Set interrupt for overflow */ 772 870 if (iommu_pmu_set_interrupt(iommu)) 773 - goto cpuhp_free; 871 + goto unregister; 774 872 775 873 return; 776 874 777 - 
cpuhp_free: 778 - iommu_pmu_cpuhp_free(iommu_pmu); 779 875 unregister: 780 876 perf_pmu_unregister(&iommu_pmu->pmu); 781 877 err: ··· 786 892 return; 787 893 788 894 iommu_pmu_unset_interrupt(iommu); 789 - iommu_pmu_cpuhp_free(iommu_pmu); 790 895 perf_pmu_unregister(&iommu_pmu->pmu); 791 896 }
-2
include/linux/cpuhotplug.h
··· 153 153 CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, 154 154 CPUHP_AP_PERF_X86_STARTING, 155 155 CPUHP_AP_PERF_X86_AMD_IBS_STARTING, 156 - CPUHP_AP_PERF_X86_CSTATE_STARTING, 157 156 CPUHP_AP_PERF_XTENSA_STARTING, 158 157 CPUHP_AP_ARM_VFP_STARTING, 159 158 CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, ··· 209 210 CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, 210 211 CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, 211 212 CPUHP_AP_PERF_X86_RAPL_ONLINE, 212 - CPUHP_AP_PERF_X86_CSTATE_ONLINE, 213 213 CPUHP_AP_PERF_S390_CF_ONLINE, 214 214 CPUHP_AP_PERF_S390_SF_ONLINE, 215 215 CPUHP_AP_PERF_ARM_CCI_ONLINE,
+30 -2
include/linux/perf_event.h
··· 168 168 struct hw_perf_event_extra extra_reg; 169 169 struct hw_perf_event_extra branch_reg; 170 170 }; 171 + struct { /* aux / Intel-PT */ 172 + u64 aux_config; 173 + }; 171 174 struct { /* software */ 172 175 struct hrtimer hrtimer; 173 176 }; ··· 295 292 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 296 293 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 297 294 295 + /** 296 + * pmu::scope 297 + */ 298 + enum perf_pmu_scope { 299 + PERF_PMU_SCOPE_NONE = 0, 300 + PERF_PMU_SCOPE_CORE, 301 + PERF_PMU_SCOPE_DIE, 302 + PERF_PMU_SCOPE_CLUSTER, 303 + PERF_PMU_SCOPE_PKG, 304 + PERF_PMU_SCOPE_SYS_WIDE, 305 + PERF_PMU_MAX_SCOPE, 306 + }; 307 + 298 308 struct perf_output_handle; 299 309 300 310 #define PMU_NULL_DEV ((void *)(~0UL)) ··· 330 314 * various common per-pmu feature flags 331 315 */ 332 316 int capabilities; 317 + 318 + /* 319 + * PMU scope 320 + */ 321 + unsigned int scope; 333 322 334 323 int __percpu *pmu_disable_count; 335 324 struct perf_cpu_pmu_context __percpu *cpu_pmu_context; ··· 636 615 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and 637 616 * cannot be a group leader. If an event with this flag is detached from the 638 617 * group it is scheduled out and moved into an unrecoverable ERROR state. 618 + * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the 619 + * PMU scope where it is active. 639 620 */ 640 621 #define PERF_EV_CAP_SOFTWARE BIT(0) 641 622 #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) 642 623 #define PERF_EV_CAP_SIBLING BIT(2) 624 + #define PERF_EV_CAP_READ_SCOPE BIT(3) 643 625 644 626 #define SWEVENT_HLIST_BITS 8 645 627 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) ··· 987 963 struct rcu_head rcu_head; 988 964 989 965 /* 990 - * Sum (event->pending_work + event->pending_work) 966 + * The count of events for which using the switch-out fast path 967 + * should be avoided. 
968 + * 969 + * Sum (event->pending_work + events with 970 + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) 991 971 * 992 972 * The SIGTRAP is targeted at ctx->task, as such it won't do changing 993 973 * that until the signal is delivered. 994 974 */ 995 - local_t nr_pending; 975 + local_t nr_no_switch_fast; 996 976 }; 997 977 998 978 struct perf_cpu_pmu_context {
+67
include/linux/rbtree.h
··· 245 245 } 246 246 247 247 /** 248 + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node 249 + * @node: node to look-for / insert 250 + * @tree: tree to search / modify 251 + * @cmp: operator defining the node order 252 + * 253 + * Adds a Store-Release for link_node. 254 + * 255 + * Returns the rb_node matching @node, or NULL when no match is found and @node 256 + * is inserted. 257 + */ 258 + static __always_inline struct rb_node * 259 + rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, 260 + int (*cmp)(struct rb_node *, const struct rb_node *)) 261 + { 262 + struct rb_node **link = &tree->rb_node; 263 + struct rb_node *parent = NULL; 264 + int c; 265 + 266 + while (*link) { 267 + parent = *link; 268 + c = cmp(node, parent); 269 + 270 + if (c < 0) 271 + link = &parent->rb_left; 272 + else if (c > 0) 273 + link = &parent->rb_right; 274 + else 275 + return parent; 276 + } 277 + 278 + rb_link_node_rcu(node, parent, link); 279 + rb_insert_color(node, tree); 280 + return NULL; 281 + } 282 + 283 + /** 248 284 * rb_find() - find @key in tree @tree 249 285 * @key: key to match 250 286 * @tree: tree to search ··· 301 265 node = node->rb_left; 302 266 else if (c > 0) 303 267 node = node->rb_right; 268 + else 269 + return node; 270 + } 271 + 272 + return NULL; 273 + } 274 + 275 + /** 276 + * rb_find_rcu() - find @key in tree @tree 277 + * @key: key to match 278 + * @tree: tree to search 279 + * @cmp: operator defining the node order 280 + * 281 + * Notably, tree descent vs concurrent tree rotations is unsound and can result 282 + * in false-negatives. 283 + * 284 + * Returns the rb_node matching @key or NULL. 
285 + */ 286 + static __always_inline struct rb_node * 287 + rb_find_rcu(const void *key, const struct rb_root *tree, 288 + int (*cmp)(const void *key, const struct rb_node *)) 289 + { 290 + struct rb_node *node = tree->rb_node; 291 + 292 + while (node) { 293 + int c = cmp(key, node); 294 + 295 + if (c < 0) 296 + node = rcu_dereference_raw(node->rb_left); 297 + else if (c > 0) 298 + node = rcu_dereference_raw(node->rb_right); 304 299 else 305 300 return node; 306 301 }
+25 -23
include/linux/uprobes.h
··· 16 16 #include <linux/types.h> 17 17 #include <linux/wait.h> 18 18 19 + struct uprobe; 19 20 struct vm_area_struct; 20 21 struct mm_struct; 21 22 struct inode; ··· 28 27 29 28 #define MAX_URETPROBE_DEPTH 64 30 29 31 - enum uprobe_filter_ctx { 32 - UPROBE_FILTER_REGISTER, 33 - UPROBE_FILTER_UNREGISTER, 34 - UPROBE_FILTER_MMAP, 35 - }; 36 - 37 30 struct uprobe_consumer { 31 + /* 32 + * handler() can return UPROBE_HANDLER_REMOVE to signal the need to 33 + * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is 34 + * returned, filter() callback has to be implemented as well and it 35 + * should return false to "confirm" the decision to uninstall uprobe 36 + * for the current process. If filter() is omitted or returns true, 37 + * UPROBE_HANDLER_REMOVE is effectively ignored. 38 + */ 38 39 int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); 39 40 int (*ret_handler)(struct uprobe_consumer *self, 40 41 unsigned long func, 41 42 struct pt_regs *regs); 42 - bool (*filter)(struct uprobe_consumer *self, 43 - enum uprobe_filter_ctx ctx, 44 - struct mm_struct *mm); 43 + bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); 45 44 46 - struct uprobe_consumer *next; 45 + struct list_head cons_node; 47 46 }; 48 47 49 48 #ifdef CONFIG_UPROBES ··· 76 75 77 76 struct uprobe *active_uprobe; 78 77 unsigned long xol_vaddr; 78 + 79 + struct arch_uprobe *auprobe; 79 80 80 81 struct return_instance *return_instances; 81 82 unsigned int depth; ··· 113 110 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); 114 111 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); 115 112 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); 116 - extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); 117 - extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); 118 - 
extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); 119 - extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); 113 + extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); 114 + extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); 115 + extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); 116 + extern void uprobe_unregister_sync(void); 120 117 extern int uprobe_mmap(struct vm_area_struct *vma); 121 118 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); 122 119 extern void uprobe_start_dup_mmap(void); ··· 154 151 155 152 #define uprobe_get_trap_addr(regs) instruction_pointer(regs) 156 153 157 - static inline int 158 - uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) 154 + static inline struct uprobe * 155 + uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) 159 156 { 160 - return -ENOSYS; 161 - } 162 - static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) 163 - { 164 - return -ENOSYS; 157 + return ERR_PTR(-ENOSYS); 165 158 } 166 159 static inline int 167 - uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) 160 + uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) 168 161 { 169 162 return -ENOSYS; 170 163 } 171 164 static inline void 172 - uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) 165 + uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) 166 + { 167 + } 168 + static inline void uprobe_unregister_sync(void) 173 169 { 174 170 } 175 171 static inline int uprobe_mmap(struct vm_area_struct *vma)
+425 -161
kernel/events/core.c
··· 155 155 return data.ret; 156 156 } 157 157 158 + enum event_type_t { 159 + EVENT_FLEXIBLE = 0x01, 160 + EVENT_PINNED = 0x02, 161 + EVENT_TIME = 0x04, 162 + EVENT_FROZEN = 0x08, 163 + /* see ctx_resched() for details */ 164 + EVENT_CPU = 0x10, 165 + EVENT_CGROUP = 0x20, 166 + 167 + /* compound helpers */ 168 + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 169 + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, 170 + }; 171 + 172 + static inline void __perf_ctx_lock(struct perf_event_context *ctx) 173 + { 174 + raw_spin_lock(&ctx->lock); 175 + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); 176 + } 177 + 158 178 static void perf_ctx_lock(struct perf_cpu_context *cpuctx, 159 179 struct perf_event_context *ctx) 160 180 { 161 - raw_spin_lock(&cpuctx->ctx.lock); 181 + __perf_ctx_lock(&cpuctx->ctx); 162 182 if (ctx) 163 - raw_spin_lock(&ctx->lock); 183 + __perf_ctx_lock(ctx); 184 + } 185 + 186 + static inline void __perf_ctx_unlock(struct perf_event_context *ctx) 187 + { 188 + /* 189 + * If ctx_sched_in() didn't again set any ALL flags, clean up 190 + * after ctx_sched_out() by clearing is_active. 
191 + */ 192 + if (ctx->is_active & EVENT_FROZEN) { 193 + if (!(ctx->is_active & EVENT_ALL)) 194 + ctx->is_active = 0; 195 + else 196 + ctx->is_active &= ~EVENT_FROZEN; 197 + } 198 + raw_spin_unlock(&ctx->lock); 164 199 } 165 200 166 201 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, 167 202 struct perf_event_context *ctx) 168 203 { 169 204 if (ctx) 170 - raw_spin_unlock(&ctx->lock); 171 - raw_spin_unlock(&cpuctx->ctx.lock); 205 + __perf_ctx_unlock(ctx); 206 + __perf_ctx_unlock(&cpuctx->ctx); 172 207 } 173 208 174 209 #define TASK_TOMBSTONE ((void *)-1L) ··· 299 264 { 300 265 struct perf_event_context *ctx = event->ctx; 301 266 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ 267 + struct perf_cpu_context *cpuctx; 302 268 struct event_function_struct efs = { 303 269 .event = event, 304 270 .func = func, ··· 327 291 if (!task_function_call(task, event_function, &efs)) 328 292 return; 329 293 330 - raw_spin_lock_irq(&ctx->lock); 294 + local_irq_disable(); 295 + cpuctx = this_cpu_ptr(&perf_cpu_context); 296 + perf_ctx_lock(cpuctx, ctx); 331 297 /* 332 298 * Reload the task pointer, it might have been changed by 333 299 * a concurrent perf_event_context_sched_out(). 
334 300 */ 335 301 task = ctx->task; 336 - if (task == TASK_TOMBSTONE) { 337 - raw_spin_unlock_irq(&ctx->lock); 338 - return; 339 - } 302 + if (task == TASK_TOMBSTONE) 303 + goto unlock; 340 304 if (ctx->is_active) { 341 - raw_spin_unlock_irq(&ctx->lock); 305 + perf_ctx_unlock(cpuctx, ctx); 306 + local_irq_enable(); 342 307 goto again; 343 308 } 344 309 func(event, NULL, ctx, data); 345 - raw_spin_unlock_irq(&ctx->lock); 310 + unlock: 311 + perf_ctx_unlock(cpuctx, ctx); 312 + local_irq_enable(); 346 313 } 347 314 348 315 /* ··· 408 369 (PERF_SAMPLE_BRANCH_KERNEL |\ 409 370 PERF_SAMPLE_BRANCH_HV) 410 371 411 - enum event_type_t { 412 - EVENT_FLEXIBLE = 0x1, 413 - EVENT_PINNED = 0x2, 414 - EVENT_TIME = 0x4, 415 - /* see ctx_resched() for details */ 416 - EVENT_CPU = 0x8, 417 - EVENT_CGROUP = 0x10, 418 - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 419 - }; 420 - 421 372 /* 422 373 * perf_sched_events : >0 events exist 423 374 */ ··· 436 407 static DEFINE_MUTEX(pmus_lock); 437 408 static struct srcu_struct pmus_srcu; 438 409 static cpumask_var_t perf_online_mask; 410 + static cpumask_var_t perf_online_core_mask; 411 + static cpumask_var_t perf_online_die_mask; 412 + static cpumask_var_t perf_online_cluster_mask; 413 + static cpumask_var_t perf_online_pkg_mask; 414 + static cpumask_var_t perf_online_sys_mask; 439 415 static struct kmem_cache *perf_event_cache; 440 416 441 417 /* ··· 719 685 ___p; \ 720 686 }) 721 687 688 + #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ 689 + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ 690 + if (_cgroup && !_epc->nr_cgroups) \ 691 + continue; \ 692 + else if (_pmu && _epc->pmu != _pmu) \ 693 + continue; \ 694 + else 695 + 722 696 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) 723 697 { 724 698 struct perf_event_pmu_context *pmu_ctx; 725 699 726 - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 727 - if (cgroup && !pmu_ctx->nr_cgroups) 728 - continue; 700 + 
for_each_epc(pmu_ctx, ctx, NULL, cgroup) 729 701 perf_pmu_disable(pmu_ctx->pmu); 730 - } 731 702 } 732 703 733 704 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) 734 705 { 735 706 struct perf_event_pmu_context *pmu_ctx; 736 707 737 - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 738 - if (cgroup && !pmu_ctx->nr_cgroups) 739 - continue; 708 + for_each_epc(pmu_ctx, ctx, NULL, cgroup) 740 709 perf_pmu_enable(pmu_ctx->pmu); 741 - } 742 710 } 743 711 744 - static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); 745 - static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); 712 + static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 713 + static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 746 714 747 715 #ifdef CONFIG_CGROUP_PERF 748 716 ··· 901 865 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 902 866 perf_ctx_disable(&cpuctx->ctx, true); 903 867 904 - ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); 868 + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 905 869 /* 906 870 * must not be done before ctxswout due 907 871 * to update_cgrp_time_from_cpuctx() in ··· 913 877 * perf_cgroup_set_timestamp() in ctx_sched_in() 914 878 * to not have to pass task around 915 879 */ 916 - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); 880 + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 917 881 918 882 perf_ctx_enable(&cpuctx->ctx, true); 919 883 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); ··· 1805 1769 typeof(*event), group_node)) 1806 1770 1807 1771 /* 1772 + * Does the event attribute request inherit with PERF_SAMPLE_READ 1773 + */ 1774 + static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) 1775 + { 1776 + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); 1777 + } 1778 + 1779 + /* 1808 1780 * Add an event 
from the lists for its context. 1809 1781 * Must be called with ctx->mutex and ctx->lock held. 1810 1782 */ ··· 1842 1798 ctx->nr_user++; 1843 1799 if (event->attr.inherit_stat) 1844 1800 ctx->nr_stat++; 1801 + if (has_inherit_and_sample_read(&event->attr)) 1802 + local_inc(&ctx->nr_no_switch_fast); 1845 1803 1846 1804 if (event->state > PERF_EVENT_STATE_OFF) 1847 1805 perf_cgroup_event_enable(event, ctx); ··· 2068 2022 ctx->nr_user--; 2069 2023 if (event->attr.inherit_stat) 2070 2024 ctx->nr_stat--; 2025 + if (has_inherit_and_sample_read(&event->attr)) 2026 + local_dec(&ctx->nr_no_switch_fast); 2071 2027 2072 2028 list_del_rcu(&event->event_entry); 2073 2029 ··· 2365 2317 event_sched_out(event, ctx); 2366 2318 } 2367 2319 2320 + static inline void 2321 + __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) 2322 + { 2323 + if (ctx->is_active & EVENT_TIME) { 2324 + if (ctx->is_active & EVENT_FROZEN) 2325 + return; 2326 + update_context_time(ctx); 2327 + update_cgrp_time_from_cpuctx(cpuctx, final); 2328 + } 2329 + } 2330 + 2331 + static inline void 2332 + ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2333 + { 2334 + __ctx_time_update(cpuctx, ctx, false); 2335 + } 2336 + 2337 + /* 2338 + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). 
2339 + */ 2340 + static inline void 2341 + ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2342 + { 2343 + ctx_time_update(cpuctx, ctx); 2344 + if (ctx->is_active & EVENT_TIME) 2345 + ctx->is_active |= EVENT_FROZEN; 2346 + } 2347 + 2348 + static inline void 2349 + ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) 2350 + { 2351 + if (ctx->is_active & EVENT_TIME) { 2352 + if (ctx->is_active & EVENT_FROZEN) 2353 + return; 2354 + update_context_time(ctx); 2355 + update_cgrp_time_from_event(event); 2356 + } 2357 + } 2358 + 2368 2359 #define DETACH_GROUP 0x01UL 2369 2360 #define DETACH_CHILD 0x02UL 2370 2361 #define DETACH_DEAD 0x04UL ··· 2423 2336 struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; 2424 2337 unsigned long flags = (unsigned long)info; 2425 2338 2426 - if (ctx->is_active & EVENT_TIME) { 2427 - update_context_time(ctx); 2428 - update_cgrp_time_from_cpuctx(cpuctx, false); 2429 - } 2339 + ctx_time_update(cpuctx, ctx); 2430 2340 2431 2341 /* 2432 2342 * Ensure event_sched_out() switches to OFF, at the very least ··· 2508 2424 if (event->state < PERF_EVENT_STATE_INACTIVE) 2509 2425 return; 2510 2426 2511 - if (ctx->is_active & EVENT_TIME) { 2512 - update_context_time(ctx); 2513 - update_cgrp_time_from_event(event); 2514 - } 2515 - 2516 2427 perf_pmu_disable(event->pmu_ctx->pmu); 2428 + ctx_time_update_event(ctx, event); 2517 2429 2518 2430 if (event == event->group_leader) 2519 2431 group_sched_out(event, ctx); ··· 2725 2645 } 2726 2646 2727 2647 static void task_ctx_sched_out(struct perf_event_context *ctx, 2728 - enum event_type_t event_type) 2648 + struct pmu *pmu, 2649 + enum event_type_t event_type) 2729 2650 { 2730 2651 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 2731 2652 ··· 2736 2655 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2737 2656 return; 2738 2657 2739 - ctx_sched_out(ctx, event_type); 2658 + ctx_sched_out(ctx, pmu, event_type); 2740 2659 } 2741 2660 2742 
2661 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2743 - struct perf_event_context *ctx) 2662 + struct perf_event_context *ctx, 2663 + struct pmu *pmu) 2744 2664 { 2745 - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); 2665 + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); 2746 2666 if (ctx) 2747 - ctx_sched_in(ctx, EVENT_PINNED); 2748 - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); 2667 + ctx_sched_in(ctx, pmu, EVENT_PINNED); 2668 + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 2749 2669 if (ctx) 2750 - ctx_sched_in(ctx, EVENT_FLEXIBLE); 2670 + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); 2751 2671 } 2752 2672 2753 2673 /* ··· 2766 2684 * event_type is a bit mask of the types of events involved. For CPU events, 2767 2685 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. 2768 2686 */ 2769 - /* 2770 - * XXX: ctx_resched() reschedule entire perf_event_context while adding new 2771 - * event to the context or enabling existing event in the context. We can 2772 - * probably optimize it by rescheduling only affected pmu_ctx. 
2773 - */ 2774 2687 static void ctx_resched(struct perf_cpu_context *cpuctx, 2775 2688 struct perf_event_context *task_ctx, 2776 - enum event_type_t event_type) 2689 + struct pmu *pmu, enum event_type_t event_type) 2777 2690 { 2778 2691 bool cpu_event = !!(event_type & EVENT_CPU); 2692 + struct perf_event_pmu_context *epc; 2779 2693 2780 2694 /* 2781 2695 * If pinned groups are involved, flexible groups also need to be ··· 2782 2704 2783 2705 event_type &= EVENT_ALL; 2784 2706 2785 - perf_ctx_disable(&cpuctx->ctx, false); 2707 + for_each_epc(epc, &cpuctx->ctx, pmu, false) 2708 + perf_pmu_disable(epc->pmu); 2709 + 2786 2710 if (task_ctx) { 2787 - perf_ctx_disable(task_ctx, false); 2788 - task_ctx_sched_out(task_ctx, event_type); 2711 + for_each_epc(epc, task_ctx, pmu, false) 2712 + perf_pmu_disable(epc->pmu); 2713 + 2714 + task_ctx_sched_out(task_ctx, pmu, event_type); 2789 2715 } 2790 2716 2791 2717 /* ··· 2800 2718 * - otherwise, do nothing more. 2801 2719 */ 2802 2720 if (cpu_event) 2803 - ctx_sched_out(&cpuctx->ctx, event_type); 2721 + ctx_sched_out(&cpuctx->ctx, pmu, event_type); 2804 2722 else if (event_type & EVENT_PINNED) 2805 - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); 2723 + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 2806 2724 2807 - perf_event_sched_in(cpuctx, task_ctx); 2725 + perf_event_sched_in(cpuctx, task_ctx, pmu); 2808 2726 2809 - perf_ctx_enable(&cpuctx->ctx, false); 2810 - if (task_ctx) 2811 - perf_ctx_enable(task_ctx, false); 2727 + for_each_epc(epc, &cpuctx->ctx, pmu, false) 2728 + perf_pmu_enable(epc->pmu); 2729 + 2730 + if (task_ctx) { 2731 + for_each_epc(epc, task_ctx, pmu, false) 2732 + perf_pmu_enable(epc->pmu); 2733 + } 2812 2734 } 2813 2735 2814 2736 void perf_pmu_resched(struct pmu *pmu) ··· 2821 2735 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2822 2736 2823 2737 perf_ctx_lock(cpuctx, task_ctx); 2824 - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); 2738 + ctx_resched(cpuctx, task_ctx, pmu, 
EVENT_ALL|EVENT_CPU); 2825 2739 perf_ctx_unlock(cpuctx, task_ctx); 2826 2740 } 2827 2741 ··· 2877 2791 #endif 2878 2792 2879 2793 if (reprogram) { 2880 - ctx_sched_out(ctx, EVENT_TIME); 2794 + ctx_time_freeze(cpuctx, ctx); 2881 2795 add_event_to_ctx(event, ctx); 2882 - ctx_resched(cpuctx, task_ctx, get_event_type(event)); 2796 + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, 2797 + get_event_type(event)); 2883 2798 } else { 2884 2799 add_event_to_ctx(event, ctx); 2885 2800 } ··· 3023 2936 event->state <= PERF_EVENT_STATE_ERROR) 3024 2937 return; 3025 2938 3026 - if (ctx->is_active) 3027 - ctx_sched_out(ctx, EVENT_TIME); 2939 + ctx_time_freeze(cpuctx, ctx); 3028 2940 3029 2941 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); 3030 2942 perf_cgroup_event_enable(event, ctx); ··· 3031 2945 if (!ctx->is_active) 3032 2946 return; 3033 2947 3034 - if (!event_filter_match(event)) { 3035 - ctx_sched_in(ctx, EVENT_TIME); 2948 + if (!event_filter_match(event)) 3036 2949 return; 3037 - } 3038 2950 3039 2951 /* 3040 2952 * If the event is in a group and isn't the group leader, 3041 2953 * then don't put it on unless the group is on. 
3042 2954 */ 3043 - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { 3044 - ctx_sched_in(ctx, EVENT_TIME); 2955 + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) 3045 2956 return; 3046 - } 3047 2957 3048 2958 task_ctx = cpuctx->task_ctx; 3049 2959 if (ctx->task) 3050 2960 WARN_ON_ONCE(task_ctx != ctx); 3051 2961 3052 - ctx_resched(cpuctx, task_ctx, get_event_type(event)); 2962 + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); 3053 2963 } 3054 2964 3055 2965 /* ··· 3313 3231 struct perf_event *event, *tmp; 3314 3232 struct pmu *pmu = pmu_ctx->pmu; 3315 3233 3316 - if (ctx->task && !ctx->is_active) { 3234 + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { 3317 3235 struct perf_cpu_pmu_context *cpc; 3318 3236 3319 3237 cpc = this_cpu_ptr(pmu->cpu_pmu_context); ··· 3321 3239 cpc->task_epc = NULL; 3322 3240 } 3323 3241 3324 - if (!event_type) 3242 + if (!(event_type & EVENT_ALL)) 3325 3243 return; 3326 3244 3327 3245 perf_pmu_disable(pmu); ··· 3347 3265 perf_pmu_enable(pmu); 3348 3266 } 3349 3267 3268 + /* 3269 + * Be very careful with the @pmu argument since this will change ctx state. 3270 + * The @pmu argument works for ctx_resched(), because that is symmetric in 3271 + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. 3272 + * 3273 + * However, if you were to be asymmetrical, you could end up with messed up 3274 + * state, eg. ctx->is_active cleared even though most EPCs would still actually 3275 + * be active. 3276 + */ 3350 3277 static void 3351 - ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) 3278 + ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) 3352 3279 { 3353 3280 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 3354 3281 struct perf_event_pmu_context *pmu_ctx; ··· 3388 3297 * 3389 3298 * would only update time for the pinned events. 
3390 3299 */ 3391 - if (is_active & EVENT_TIME) { 3392 - /* update (and stop) ctx time */ 3393 - update_context_time(ctx); 3394 - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); 3395 - /* 3396 - * CPU-release for the below ->is_active store, 3397 - * see __load_acquire() in perf_event_time_now() 3398 - */ 3399 - barrier(); 3400 - } 3300 + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); 3401 3301 3302 + /* 3303 + * CPU-release for the below ->is_active store, 3304 + * see __load_acquire() in perf_event_time_now() 3305 + */ 3306 + barrier(); 3402 3307 ctx->is_active &= ~event_type; 3403 - if (!(ctx->is_active & EVENT_ALL)) 3404 - ctx->is_active = 0; 3308 + 3309 + if (!(ctx->is_active & EVENT_ALL)) { 3310 + /* 3311 + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() 3312 + * does not observe a hole. perf_ctx_unlock() will clean up. 3313 + */ 3314 + if (ctx->is_active & EVENT_FROZEN) 3315 + ctx->is_active &= EVENT_TIME_FROZEN; 3316 + else 3317 + ctx->is_active = 0; 3318 + } 3405 3319 3406 3320 if (ctx->task) { 3407 3321 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 3408 - if (!ctx->is_active) 3322 + if (!(ctx->is_active & EVENT_ALL)) 3409 3323 cpuctx->task_ctx = NULL; 3410 3324 } 3411 3325 3412 3326 is_active ^= ctx->is_active; /* changed bits */ 3413 3327 3414 - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 3415 - if (cgroup && !pmu_ctx->nr_cgroups) 3416 - continue; 3328 + for_each_epc(pmu_ctx, ctx, pmu, cgroup) 3417 3329 __pmu_ctx_sched_out(pmu_ctx, is_active); 3418 - } 3419 3330 } 3420 3331 3421 3332 /* ··· 3610 3517 3611 3518 perf_ctx_disable(ctx, false); 3612 3519 3613 - /* PMIs are disabled; ctx->nr_pending is stable. */ 3614 - if (local_read(&ctx->nr_pending) || 3615 - local_read(&next_ctx->nr_pending)) { 3520 + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. 
*/ 3521 + if (local_read(&ctx->nr_no_switch_fast) || 3522 + local_read(&next_ctx->nr_no_switch_fast)) { 3616 3523 /* 3617 3524 * Must not swap out ctx when there's pending 3618 3525 * events that rely on the ctx->task relation. 3526 + * 3527 + * Likewise, when a context contains inherit + 3528 + * SAMPLE_READ events they should be switched 3529 + * out using the slow path so that they are 3530 + * treated as if they were distinct contexts. 3619 3531 */ 3620 3532 raw_spin_unlock(&next_ctx->lock); 3621 3533 rcu_read_unlock(); ··· 3661 3563 3662 3564 inside_switch: 3663 3565 perf_ctx_sched_task_cb(ctx, false); 3664 - task_ctx_sched_out(ctx, EVENT_ALL); 3566 + task_ctx_sched_out(ctx, NULL, EVENT_ALL); 3665 3567 3666 3568 perf_ctx_enable(ctx, false); 3667 3569 raw_spin_unlock(&ctx->lock); ··· 3959 3861 merge_sched_in, &can_add_hw); 3960 3862 } 3961 3863 3962 - static void ctx_groups_sched_in(struct perf_event_context *ctx, 3963 - struct perf_event_groups *groups, 3964 - bool cgroup) 3864 + static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, 3865 + enum event_type_t event_type) 3965 3866 { 3966 - struct perf_event_pmu_context *pmu_ctx; 3867 + struct perf_event_context *ctx = pmu_ctx->ctx; 3967 3868 3968 - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 3969 - if (cgroup && !pmu_ctx->nr_cgroups) 3970 - continue; 3971 - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); 3972 - } 3973 - } 3974 - 3975 - static void __pmu_ctx_sched_in(struct perf_event_context *ctx, 3976 - struct pmu *pmu) 3977 - { 3978 - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); 3869 + if (event_type & EVENT_PINNED) 3870 + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); 3871 + if (event_type & EVENT_FLEXIBLE) 3872 + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); 3979 3873 } 3980 3874 3981 3875 static void 3982 - ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) 3876 + ctx_sched_in(struct perf_event_context 
*ctx, struct pmu *pmu, enum event_type_t event_type) 3983 3877 { 3984 3878 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 3879 + struct perf_event_pmu_context *pmu_ctx; 3985 3880 int is_active = ctx->is_active; 3986 3881 bool cgroup = event_type & EVENT_CGROUP; 3987 3882 ··· 3998 3907 3999 3908 ctx->is_active |= (event_type | EVENT_TIME); 4000 3909 if (ctx->task) { 4001 - if (!is_active) 3910 + if (!(is_active & EVENT_ALL)) 4002 3911 cpuctx->task_ctx = ctx; 4003 3912 else 4004 3913 WARN_ON_ONCE(cpuctx->task_ctx != ctx); ··· 4010 3919 * First go through the list and put on any pinned groups 4011 3920 * in order to give them the best chance of going on. 4012 3921 */ 4013 - if (is_active & EVENT_PINNED) 4014 - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); 3922 + if (is_active & EVENT_PINNED) { 3923 + for_each_epc(pmu_ctx, ctx, pmu, cgroup) 3924 + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); 3925 + } 4015 3926 4016 3927 /* Then walk through the lower prio flexible groups */ 4017 - if (is_active & EVENT_FLEXIBLE) 4018 - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); 3928 + if (is_active & EVENT_FLEXIBLE) { 3929 + for_each_epc(pmu_ctx, ctx, pmu, cgroup) 3930 + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); 3931 + } 4019 3932 } 4020 3933 4021 3934 static void perf_event_context_sched_in(struct task_struct *task) ··· 4062 3967 */ 4063 3968 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { 4064 3969 perf_ctx_disable(&cpuctx->ctx, false); 4065 - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); 3970 + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); 4066 3971 } 4067 3972 4068 - perf_event_sched_in(cpuctx, ctx); 3973 + perf_event_sched_in(cpuctx, ctx, NULL); 4069 3974 4070 3975 perf_ctx_sched_task_cb(cpuctx->task_ctx, true); 4071 3976 ··· 4188 4093 period = perf_calculate_period(event, nsec, count); 4189 4094 4190 4095 delta = (s64)(period - hwc->sample_period); 4191 - delta = (delta + 7) / 8; /* low pass filter */ 4096 + if (delta >= 0) 4097 + 
delta += 7; 4098 + else 4099 + delta -= 7; 4100 + delta /= 8; /* low pass filter */ 4192 4101 4193 4102 sample_period = hwc->sample_period + delta; 4194 4103 ··· 4410 4311 update_context_time(&cpuctx->ctx); 4411 4312 __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); 4412 4313 rotate_ctx(&cpuctx->ctx, cpu_event); 4413 - __pmu_ctx_sched_in(&cpuctx->ctx, pmu); 4314 + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); 4414 4315 } 4415 4316 4416 4317 if (task_event) 4417 4318 rotate_ctx(task_epc->ctx, task_event); 4418 4319 4419 4320 if (task_event || (task_epc && cpu_event)) 4420 - __pmu_ctx_sched_in(task_epc->ctx, pmu); 4321 + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); 4421 4322 4422 4323 perf_pmu_enable(pmu); 4423 4324 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); ··· 4483 4384 4484 4385 cpuctx = this_cpu_ptr(&perf_cpu_context); 4485 4386 perf_ctx_lock(cpuctx, ctx); 4486 - ctx_sched_out(ctx, EVENT_TIME); 4387 + ctx_time_freeze(cpuctx, ctx); 4487 4388 4488 4389 list_for_each_entry(event, &ctx->event_list, event_entry) { 4489 4390 enabled |= event_enable_on_exec(event, ctx); ··· 4495 4396 */ 4496 4397 if (enabled) { 4497 4398 clone_ctx = unclone_ctx(ctx); 4498 - ctx_resched(cpuctx, ctx, event_type); 4499 - } else { 4500 - ctx_sched_in(ctx, EVENT_TIME); 4399 + ctx_resched(cpuctx, ctx, NULL, event_type); 4501 4400 } 4502 4401 perf_ctx_unlock(cpuctx, ctx); 4503 4402 ··· 4556 4459 int ret; 4557 4460 }; 4558 4461 4462 + static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); 4463 + 4559 4464 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) 4560 4465 { 4466 + int local_cpu = smp_processor_id(); 4561 4467 u16 local_pkg, event_pkg; 4562 4468 4563 4469 if ((unsigned)event_cpu >= nr_cpu_ids) 4564 4470 return event_cpu; 4565 4471 4566 - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { 4567 - int local_cpu = smp_processor_id(); 4472 + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { 4473 + const struct cpumask 
*cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); 4568 4474 4475 + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) 4476 + return local_cpu; 4477 + } 4478 + 4479 + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { 4569 4480 event_pkg = topology_physical_package_id(event_cpu); 4570 4481 local_pkg = topology_physical_package_id(local_cpu); 4571 4482 ··· 4606 4501 return; 4607 4502 4608 4503 raw_spin_lock(&ctx->lock); 4609 - if (ctx->is_active & EVENT_TIME) { 4610 - update_context_time(ctx); 4611 - update_cgrp_time_from_event(event); 4612 - } 4504 + ctx_time_update_event(ctx, event); 4613 4505 4614 4506 perf_event_update_time(event); 4615 4507 if (data->group) ··· 4641 4539 raw_spin_unlock(&ctx->lock); 4642 4540 } 4643 4541 4644 - static inline u64 perf_event_count(struct perf_event *event) 4542 + static inline u64 perf_event_count(struct perf_event *event, bool self) 4645 4543 { 4544 + if (self) 4545 + return local64_read(&event->count); 4546 + 4646 4547 return local64_read(&event->count) + atomic64_read(&event->child_count); 4647 4548 } 4648 4549 ··· 4806 4701 * May read while context is not active (e.g., thread is 4807 4702 * blocked), in that case we cannot update context time 4808 4703 */ 4809 - if (ctx->is_active & EVENT_TIME) { 4810 - update_context_time(ctx); 4811 - update_cgrp_time_from_event(event); 4812 - } 4704 + ctx_time_update_event(ctx, event); 4813 4705 4814 4706 perf_event_update_time(event); 4815 4707 if (group) ··· 5307 5205 */ 5308 5206 if (task_work_cancel(current, head)) { 5309 5207 event->pending_work = 0; 5310 - local_dec(&event->ctx->nr_pending); 5208 + local_dec(&event->ctx->nr_no_switch_fast); 5311 5209 return; 5312 5210 } 5313 5211 ··· 5601 5499 mutex_lock(&event->child_mutex); 5602 5500 5603 5501 (void)perf_event_read(event, false); 5604 - total += perf_event_count(event); 5502 + total += perf_event_count(event, false); 5605 5503 5606 5504 *enabled += event->total_time_enabled + 5607 5505 
atomic64_read(&event->child_total_time_enabled); ··· 5610 5508 5611 5509 list_for_each_entry(child, &event->child_list, child_list) { 5612 5510 (void)perf_event_read(child, false); 5613 - total += perf_event_count(child); 5511 + total += perf_event_count(child, false); 5614 5512 *enabled += child->total_time_enabled; 5615 5513 *running += child->total_time_running; 5616 5514 } ··· 5692 5590 /* 5693 5591 * Write {count,id} tuples for every sibling. 5694 5592 */ 5695 - values[n++] += perf_event_count(leader); 5593 + values[n++] += perf_event_count(leader, false); 5696 5594 if (read_format & PERF_FORMAT_ID) 5697 5595 values[n++] = primary_event_id(leader); 5698 5596 if (read_format & PERF_FORMAT_LOST) 5699 5597 values[n++] = atomic64_read(&leader->lost_samples); 5700 5598 5701 5599 for_each_sibling_event(sub, leader) { 5702 - values[n++] += perf_event_count(sub); 5600 + values[n++] += perf_event_count(sub, false); 5703 5601 if (read_format & PERF_FORMAT_ID) 5704 5602 values[n++] = primary_event_id(sub); 5705 5603 if (read_format & PERF_FORMAT_LOST) ··· 6279 6177 ++userpg->lock; 6280 6178 barrier(); 6281 6179 userpg->index = perf_event_index(event); 6282 - userpg->offset = perf_event_count(event); 6180 + userpg->offset = perf_event_count(event, false); 6283 6181 if (userpg->index) 6284 6182 userpg->offset -= local64_read(&event->hw.prev_count); 6285 6183 ··· 6976 6874 if (event->pending_work) { 6977 6875 event->pending_work = 0; 6978 6876 perf_sigtrap(event); 6979 - local_dec(&event->ctx->nr_pending); 6877 + local_dec(&event->ctx->nr_no_switch_fast); 6980 6878 rcuwait_wake_up(&event->pending_work_wait); 6981 6879 } 6982 6880 rcu_read_unlock(); ··· 7358 7256 u64 values[5]; 7359 7257 int n = 0; 7360 7258 7361 - values[n++] = perf_event_count(event); 7259 + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); 7362 7260 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 7363 7261 values[n++] = enabled + 7364 7262 
atomic64_read(&event->child_total_time_enabled); ··· 7376 7274 } 7377 7275 7378 7276 static void perf_output_read_group(struct perf_output_handle *handle, 7379 - struct perf_event *event, 7380 - u64 enabled, u64 running) 7277 + struct perf_event *event, 7278 + u64 enabled, u64 running) 7381 7279 { 7382 7280 struct perf_event *leader = event->group_leader, *sub; 7383 7281 u64 read_format = event->attr.read_format; 7384 7282 unsigned long flags; 7385 7283 u64 values[6]; 7386 7284 int n = 0; 7285 + bool self = has_inherit_and_sample_read(&event->attr); 7387 7286 7388 7287 /* 7389 7288 * Disabling interrupts avoids all counter scheduling ··· 7404 7301 (leader->state == PERF_EVENT_STATE_ACTIVE)) 7405 7302 leader->pmu->read(leader); 7406 7303 7407 - values[n++] = perf_event_count(leader); 7304 + values[n++] = perf_event_count(leader, self); 7408 7305 if (read_format & PERF_FORMAT_ID) 7409 7306 values[n++] = primary_event_id(leader); 7410 7307 if (read_format & PERF_FORMAT_LOST) ··· 7419 7316 (sub->state == PERF_EVENT_STATE_ACTIVE)) 7420 7317 sub->pmu->read(sub); 7421 7318 7422 - values[n++] = perf_event_count(sub); 7319 + values[n++] = perf_event_count(sub, self); 7423 7320 if (read_format & PERF_FORMAT_ID) 7424 7321 values[n++] = primary_event_id(sub); 7425 7322 if (read_format & PERF_FORMAT_LOST) ··· 7440 7337 * The problem is that its both hard and excessively expensive to iterate the 7441 7338 * child list, not to mention that its impossible to IPI the children running 7442 7339 * on another CPU, from interrupt/NMI context. 7340 + * 7341 + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread 7342 + * counts rather than attempting to accumulate some value across all children on 7343 + * all cores. 
7443 7344 */ 7444 7345 static void perf_output_read(struct perf_output_handle *handle, 7445 7346 struct perf_event *event) ··· 9854 9747 if (!event->pending_work && 9855 9748 !task_work_add(current, &event->pending_task, notify_mode)) { 9856 9749 event->pending_work = pending_id; 9857 - local_inc(&event->ctx->nr_pending); 9750 + local_inc(&event->ctx->nr_no_switch_fast); 9858 9751 9859 9752 event->pending_addr = 0; 9860 9753 if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) ··· 11591 11484 } 11592 11485 static DEVICE_ATTR_RW(perf_event_mux_interval_ms); 11593 11486 11487 + static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) 11488 + { 11489 + switch (scope) { 11490 + case PERF_PMU_SCOPE_CORE: 11491 + return topology_sibling_cpumask(cpu); 11492 + case PERF_PMU_SCOPE_DIE: 11493 + return topology_die_cpumask(cpu); 11494 + case PERF_PMU_SCOPE_CLUSTER: 11495 + return topology_cluster_cpumask(cpu); 11496 + case PERF_PMU_SCOPE_PKG: 11497 + return topology_core_cpumask(cpu); 11498 + case PERF_PMU_SCOPE_SYS_WIDE: 11499 + return cpu_online_mask; 11500 + } 11501 + 11502 + return NULL; 11503 + } 11504 + 11505 + static inline struct cpumask *perf_scope_cpumask(unsigned int scope) 11506 + { 11507 + switch (scope) { 11508 + case PERF_PMU_SCOPE_CORE: 11509 + return perf_online_core_mask; 11510 + case PERF_PMU_SCOPE_DIE: 11511 + return perf_online_die_mask; 11512 + case PERF_PMU_SCOPE_CLUSTER: 11513 + return perf_online_cluster_mask; 11514 + case PERF_PMU_SCOPE_PKG: 11515 + return perf_online_pkg_mask; 11516 + case PERF_PMU_SCOPE_SYS_WIDE: 11517 + return perf_online_sys_mask; 11518 + } 11519 + 11520 + return NULL; 11521 + } 11522 + 11523 + static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, 11524 + char *buf) 11525 + { 11526 + struct pmu *pmu = dev_get_drvdata(dev); 11527 + struct cpumask *mask = perf_scope_cpumask(pmu->scope); 11528 + 11529 + if (mask) 11530 + return cpumap_print_to_pagebuf(true, 
buf, mask); 11531 + return 0; 11532 + } 11533 + 11534 + static DEVICE_ATTR_RO(cpumask); 11535 + 11594 11536 static struct attribute *pmu_dev_attrs[] = { 11595 11537 &dev_attr_type.attr, 11596 11538 &dev_attr_perf_event_mux_interval_ms.attr, 11597 11539 &dev_attr_nr_addr_filters.attr, 11540 + &dev_attr_cpumask.attr, 11598 11541 NULL, 11599 11542 }; 11600 11543 ··· 11654 11497 struct pmu *pmu = dev_get_drvdata(dev); 11655 11498 11656 11499 if (n == 2 && !pmu->nr_addr_filters) 11500 + return 0; 11501 + 11502 + /* cpumask */ 11503 + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) 11657 11504 return 0; 11658 11505 11659 11506 return a->mode; ··· 11740 11579 11741 11580 pmu->type = -1; 11742 11581 if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { 11582 + ret = -EINVAL; 11583 + goto free_pdc; 11584 + } 11585 + 11586 + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { 11743 11587 ret = -EINVAL; 11744 11588 goto free_pdc; 11745 11589 } ··· 11902 11736 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && 11903 11737 event_has_any_exclude_flag(event)) 11904 11738 ret = -EINVAL; 11739 + 11740 + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { 11741 + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); 11742 + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); 11743 + int cpu; 11744 + 11745 + if (pmu_cpumask && cpumask) { 11746 + cpu = cpumask_any_and(pmu_cpumask, cpumask); 11747 + if (cpu >= nr_cpu_ids) 11748 + ret = -ENODEV; 11749 + else 11750 + event->event_caps |= PERF_EV_CAP_READ_SCOPE; 11751 + } else { 11752 + ret = -ENODEV; 11753 + } 11754 + } 11905 11755 11906 11756 if (ret && event->destroy) 11907 11757 event->destroy(event); ··· 12246 12064 local64_set(&hwc->period_left, hwc->sample_period); 12247 12065 12248 12066 /* 12249 - * We currently do not support PERF_SAMPLE_READ on inherited events. 
12067 + * We do not support PERF_SAMPLE_READ on inherited events unless 12068 + * PERF_SAMPLE_TID is also selected, which allows inherited events to 12069 + * collect per-thread samples. 12250 12070 * See perf_output_read(). 12251 12071 */ 12252 - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) 12072 + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) 12253 12073 goto err_ns; 12254 12074 12255 12075 if (!has_branch_stack(event)) ··· 13275 13091 perf_event_read_event(child_event, task); 13276 13092 } 13277 13093 13278 - child_val = perf_event_count(child_event); 13094 + child_val = perf_event_count(child_event, false); 13279 13095 13280 13096 /* 13281 13097 * Add back the child's count to the parent's count: ··· 13366 13182 * in. 13367 13183 */ 13368 13184 raw_spin_lock_irq(&child_ctx->lock); 13369 - task_ctx_sched_out(child_ctx, EVENT_ALL); 13185 + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); 13370 13186 13371 13187 /* 13372 13188 * Now that the context is inactive, destroy the task <-> ctx relation ··· 13881 13697 int cpu; 13882 13698 13883 13699 zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); 13700 + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); 13701 + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); 13702 + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); 13703 + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); 13704 + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); 13705 + 13884 13706 13885 13707 for_each_possible_cpu(cpu) { 13886 13708 swhash = &per_cpu(swevent_htable, cpu); ··· 13930 13740 struct perf_event *event; 13931 13741 13932 13742 raw_spin_lock(&ctx->lock); 13933 - ctx_sched_out(ctx, EVENT_TIME); 13743 + ctx_sched_out(ctx, NULL, EVENT_TIME); 13934 13744 list_for_each_entry(event, &ctx->event_list, event_entry) 13935 13745 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); 13936 13746 raw_spin_unlock(&ctx->lock); 13747 + } 13748 + 13749 + static 
void perf_event_clear_cpumask(unsigned int cpu) 13750 + { 13751 + int target[PERF_PMU_MAX_SCOPE]; 13752 + unsigned int scope; 13753 + struct pmu *pmu; 13754 + 13755 + cpumask_clear_cpu(cpu, perf_online_mask); 13756 + 13757 + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { 13758 + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); 13759 + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); 13760 + 13761 + target[scope] = -1; 13762 + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) 13763 + continue; 13764 + 13765 + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) 13766 + continue; 13767 + target[scope] = cpumask_any_but(cpumask, cpu); 13768 + if (target[scope] < nr_cpu_ids) 13769 + cpumask_set_cpu(target[scope], pmu_cpumask); 13770 + } 13771 + 13772 + /* migrate */ 13773 + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { 13774 + if (pmu->scope == PERF_PMU_SCOPE_NONE || 13775 + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) 13776 + continue; 13777 + 13778 + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) 13779 + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); 13780 + } 13937 13781 } 13938 13782 13939 13783 static void perf_event_exit_cpu_context(int cpu) ··· 13977 13753 13978 13754 // XXX simplify cpuctx->online 13979 13755 mutex_lock(&pmus_lock); 13756 + /* 13757 + * Clear the cpumasks, and migrate to other CPUs if possible. 13758 + * Must be invoked before the __perf_event_exit_context. 
13759 + */ 13760 + perf_event_clear_cpumask(cpu); 13980 13761 cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); 13981 13762 ctx = &cpuctx->ctx; 13982 13763 ··· 13989 13760 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); 13990 13761 cpuctx->online = 0; 13991 13762 mutex_unlock(&ctx->mutex); 13992 - cpumask_clear_cpu(cpu, perf_online_mask); 13993 13763 mutex_unlock(&pmus_lock); 13994 13764 } 13995 13765 #else ··· 13996 13768 static void perf_event_exit_cpu_context(int cpu) { } 13997 13769 13998 13770 #endif 13771 + 13772 + static void perf_event_setup_cpumask(unsigned int cpu) 13773 + { 13774 + struct cpumask *pmu_cpumask; 13775 + unsigned int scope; 13776 + 13777 + cpumask_set_cpu(cpu, perf_online_mask); 13778 + 13779 + /* 13780 + * Early boot stage, the cpumask hasn't been set yet. 13781 + * The perf_online_<domain>_masks includes the first CPU of each domain. 13782 + * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. 13783 + */ 13784 + if (!topology_sibling_cpumask(cpu)) { 13785 + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { 13786 + pmu_cpumask = perf_scope_cpumask(scope); 13787 + if (WARN_ON_ONCE(!pmu_cpumask)) 13788 + continue; 13789 + cpumask_set_cpu(cpu, pmu_cpumask); 13790 + } 13791 + return; 13792 + } 13793 + 13794 + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { 13795 + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); 13796 + 13797 + pmu_cpumask = perf_scope_cpumask(scope); 13798 + 13799 + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) 13800 + continue; 13801 + 13802 + if (!cpumask_empty(cpumask) && 13803 + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) 13804 + cpumask_set_cpu(cpu, pmu_cpumask); 13805 + } 13806 + } 13999 13807 14000 13808 int perf_event_init_cpu(unsigned int cpu) 14001 13809 { ··· 14041 13777 perf_swevent_init_cpu(cpu); 14042 13778 14043 13779 mutex_lock(&pmus_lock); 14044 - cpumask_set_cpu(cpu, 
perf_online_mask); 13780 + perf_event_setup_cpumask(cpu); 14045 13781 cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); 14046 13782 ctx = &cpuctx->ctx; 14047 13783
+280 -225
kernel/events/uprobes.c
··· 40 40 #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) 41 41 42 42 static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ 43 + static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); 44 + 45 + DEFINE_STATIC_SRCU(uprobes_srcu); 43 46 44 47 #define UPROBES_HASH_SZ 13 45 48 /* serialize uprobe->pending_list */ ··· 60 57 struct rw_semaphore register_rwsem; 61 58 struct rw_semaphore consumer_rwsem; 62 59 struct list_head pending_list; 63 - struct uprobe_consumer *consumers; 60 + struct list_head consumers; 64 61 struct inode *inode; /* Also hold a ref to inode */ 62 + struct rcu_head rcu; 65 63 loff_t offset; 66 64 loff_t ref_ctr_offset; 67 65 unsigned long flags; ··· 112 108 */ 113 109 unsigned long vaddr; /* Page(s) of instruction slots */ 114 110 }; 111 + 112 + static void uprobe_warn(struct task_struct *t, const char *msg) 113 + { 114 + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); 115 + } 115 116 116 117 /* 117 118 * valid_vma: Verify if the specified vma is an executable vma ··· 462 453 * @vaddr: the virtual address to store the opcode. 463 454 * @opcode: opcode to be written at @vaddr. 464 455 * 465 - * Called with mm->mmap_lock held for write. 456 + * Called with mm->mmap_lock held for read or write. 466 457 * Return 0 (success) or a negative errno. 
467 458 */ 468 459 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, ··· 596 587 *(uprobe_opcode_t *)&auprobe->insn); 597 588 } 598 589 590 + /* uprobe should have guaranteed positive refcount */ 599 591 static struct uprobe *get_uprobe(struct uprobe *uprobe) 600 592 { 601 593 refcount_inc(&uprobe->ref); 602 594 return uprobe; 603 595 } 604 596 597 + /* 598 + * uprobe should have guaranteed lifetime, which can be either of: 599 + * - caller already has refcount taken (and wants an extra one); 600 + * - uprobe is RCU protected and won't be freed until after grace period; 601 + * - we are holding uprobes_treelock (for read or write, doesn't matter). 602 + */ 603 + static struct uprobe *try_get_uprobe(struct uprobe *uprobe) 604 + { 605 + if (refcount_inc_not_zero(&uprobe->ref)) 606 + return uprobe; 607 + return NULL; 608 + } 609 + 610 + static inline bool uprobe_is_active(struct uprobe *uprobe) 611 + { 612 + return !RB_EMPTY_NODE(&uprobe->rb_node); 613 + } 614 + 615 + static void uprobe_free_rcu(struct rcu_head *rcu) 616 + { 617 + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); 618 + 619 + kfree(uprobe); 620 + } 621 + 605 622 static void put_uprobe(struct uprobe *uprobe) 606 623 { 607 - if (refcount_dec_and_test(&uprobe->ref)) { 608 - /* 609 - * If application munmap(exec_vma) before uprobe_unregister() 610 - * gets called, we don't get a chance to remove uprobe from 611 - * delayed_uprobe_list from remove_breakpoint(). Do it here. 
612 - */ 613 - mutex_lock(&delayed_uprobe_lock); 614 - delayed_uprobe_remove(uprobe, NULL); 615 - mutex_unlock(&delayed_uprobe_lock); 616 - kfree(uprobe); 624 + if (!refcount_dec_and_test(&uprobe->ref)) 625 + return; 626 + 627 + write_lock(&uprobes_treelock); 628 + 629 + if (uprobe_is_active(uprobe)) { 630 + write_seqcount_begin(&uprobes_seqcount); 631 + rb_erase(&uprobe->rb_node, &uprobes_tree); 632 + write_seqcount_end(&uprobes_seqcount); 617 633 } 634 + 635 + write_unlock(&uprobes_treelock); 636 + 637 + /* 638 + * If application munmap(exec_vma) before uprobe_unregister() 639 + * gets called, we don't get a chance to remove uprobe from 640 + * delayed_uprobe_list from remove_breakpoint(). Do it here. 641 + */ 642 + mutex_lock(&delayed_uprobe_lock); 643 + delayed_uprobe_remove(uprobe, NULL); 644 + mutex_unlock(&delayed_uprobe_lock); 645 + 646 + call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); 618 647 } 619 648 620 649 static __always_inline ··· 694 647 return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); 695 648 } 696 649 697 - static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) 650 + /* 651 + * Assumes being inside RCU protected region. 652 + * No refcount is taken on returned uprobe. 653 + */ 654 + static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) 698 655 { 699 656 struct __uprobe_key key = { 700 657 .inode = inode, 701 658 .offset = offset, 702 659 }; 703 - struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); 660 + struct rb_node *node; 661 + unsigned int seq; 704 662 705 - if (node) 706 - return get_uprobe(__node_2_uprobe(node)); 663 + lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); 664 + 665 + do { 666 + seq = read_seqcount_begin(&uprobes_seqcount); 667 + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); 668 + /* 669 + * Lockless RB-tree lookups can result only in false negatives. 
670 + * If the element is found, it is correct and can be returned 671 + * under RCU protection. If we find nothing, we need to 672 + * validate that seqcount didn't change. If it did, we have to 673 + * try again as we might have missed the element (false 674 + * negative). If seqcount is unchanged, search truly failed. 675 + */ 676 + if (node) 677 + return __node_2_uprobe(node); 678 + } while (read_seqcount_retry(&uprobes_seqcount, seq)); 707 679 708 680 return NULL; 709 681 } 710 682 711 683 /* 712 - * Find a uprobe corresponding to a given inode:offset 713 - * Acquires uprobes_treelock 684 + * Attempt to insert a new uprobe into uprobes_tree. 685 + * 686 + * If uprobe already exists (for given inode+offset), we just increment 687 + * refcount of previously existing uprobe. 688 + * 689 + * If not, a provided new instance of uprobe is inserted into the tree (with 690 + * assumed initial refcount == 1). 691 + * 692 + * In any case, we return a uprobe instance that ends up being in uprobes_tree. 693 + * Caller has to clean up new uprobe instance, if it ended up not being 694 + * inserted into the tree. 695 + * 696 + * We assume that uprobes_treelock is held for writing. 
714 697 */ 715 - static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) 698 + static struct uprobe *__insert_uprobe(struct uprobe *uprobe) 716 699 { 717 - struct uprobe *uprobe; 700 + struct rb_node *node; 701 + again: 702 + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); 703 + if (node) { 704 + struct uprobe *u = __node_2_uprobe(node); 718 705 719 - read_lock(&uprobes_treelock); 720 - uprobe = __find_uprobe(inode, offset); 721 - read_unlock(&uprobes_treelock); 706 + if (!try_get_uprobe(u)) { 707 + rb_erase(node, &uprobes_tree); 708 + RB_CLEAR_NODE(&u->rb_node); 709 + goto again; 710 + } 711 + 712 + return u; 713 + } 722 714 723 715 return uprobe; 724 716 } 725 717 726 - static struct uprobe *__insert_uprobe(struct uprobe *uprobe) 727 - { 728 - struct rb_node *node; 729 - 730 - node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); 731 - if (node) 732 - return get_uprobe(__node_2_uprobe(node)); 733 - 734 - /* get access + creation ref */ 735 - refcount_set(&uprobe->ref, 2); 736 - return NULL; 737 - } 738 - 739 718 /* 740 - * Acquire uprobes_treelock. 741 - * Matching uprobe already exists in rbtree; 742 - * increment (access refcount) and return the matching uprobe. 743 - * 744 - * No matching uprobe; insert the uprobe in rb_tree; 745 - * get a double refcount (access + creation) and return NULL. 719 + * Acquire uprobes_treelock and insert uprobe into uprobes_tree 720 + * (or reuse existing one, see __insert_uprobe() comments above). 
746 721 */ 747 722 static struct uprobe *insert_uprobe(struct uprobe *uprobe) 748 723 { 749 724 struct uprobe *u; 750 725 751 726 write_lock(&uprobes_treelock); 727 + write_seqcount_begin(&uprobes_seqcount); 752 728 u = __insert_uprobe(uprobe); 729 + write_seqcount_end(&uprobes_seqcount); 753 730 write_unlock(&uprobes_treelock); 754 731 755 732 return u; ··· 796 725 797 726 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); 798 727 if (!uprobe) 799 - return NULL; 728 + return ERR_PTR(-ENOMEM); 800 729 801 730 uprobe->inode = inode; 802 731 uprobe->offset = offset; 803 732 uprobe->ref_ctr_offset = ref_ctr_offset; 733 + INIT_LIST_HEAD(&uprobe->consumers); 804 734 init_rwsem(&uprobe->register_rwsem); 805 735 init_rwsem(&uprobe->consumer_rwsem); 736 + RB_CLEAR_NODE(&uprobe->rb_node); 737 + refcount_set(&uprobe->ref, 1); 806 738 807 739 /* add to uprobes_tree, sorted on inode:offset */ 808 740 cur_uprobe = insert_uprobe(uprobe); 809 741 /* a uprobe exists for this inode:offset combination */ 810 - if (cur_uprobe) { 742 + if (cur_uprobe != uprobe) { 811 743 if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { 812 744 ref_ctr_mismatch_warn(cur_uprobe, uprobe); 813 745 put_uprobe(cur_uprobe); ··· 827 753 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) 828 754 { 829 755 down_write(&uprobe->consumer_rwsem); 830 - uc->next = uprobe->consumers; 831 - uprobe->consumers = uc; 756 + list_add_rcu(&uc->cons_node, &uprobe->consumers); 832 757 up_write(&uprobe->consumer_rwsem); 833 758 } 834 759 835 760 /* 836 761 * For uprobe @uprobe, delete the consumer @uc. 837 - * Return true if the @uc is deleted successfully 838 - * or return false. 762 + * Should never be called with consumer that's not part of @uprobe->consumers. 
839 763 */ 840 - static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) 764 + static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) 841 765 { 842 - struct uprobe_consumer **con; 843 - bool ret = false; 844 - 845 766 down_write(&uprobe->consumer_rwsem); 846 - for (con = &uprobe->consumers; *con; con = &(*con)->next) { 847 - if (*con == uc) { 848 - *con = uc->next; 849 - ret = true; 850 - break; 851 - } 852 - } 767 + list_del_rcu(&uc->cons_node); 853 768 up_write(&uprobe->consumer_rwsem); 854 - 855 - return ret; 856 769 } 857 770 858 771 static int __copy_insn(struct address_space *mapping, struct file *filp, ··· 924 863 return ret; 925 864 } 926 865 927 - static inline bool consumer_filter(struct uprobe_consumer *uc, 928 - enum uprobe_filter_ctx ctx, struct mm_struct *mm) 866 + static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) 929 867 { 930 - return !uc->filter || uc->filter(uc, ctx, mm); 868 + return !uc->filter || uc->filter(uc, mm); 931 869 } 932 870 933 - static bool filter_chain(struct uprobe *uprobe, 934 - enum uprobe_filter_ctx ctx, struct mm_struct *mm) 871 + static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) 935 872 { 936 873 struct uprobe_consumer *uc; 937 874 bool ret = false; 938 875 939 876 down_read(&uprobe->consumer_rwsem); 940 - for (uc = uprobe->consumers; uc; uc = uc->next) { 941 - ret = consumer_filter(uc, ctx, mm); 877 + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 878 + srcu_read_lock_held(&uprobes_srcu)) { 879 + ret = consumer_filter(uc, mm); 942 880 if (ret) 943 881 break; 944 882 } ··· 979 919 { 980 920 set_bit(MMF_RECALC_UPROBES, &mm->flags); 981 921 return set_orig_insn(&uprobe->arch, mm, vaddr); 982 - } 983 - 984 - static inline bool uprobe_is_active(struct uprobe *uprobe) 985 - { 986 - return !RB_EMPTY_NODE(&uprobe->rb_node); 987 - } 988 - /* 989 - * There could be threads that have already hit the breakpoint. 
They 990 - * will recheck the current insn and restart if find_uprobe() fails. 991 - * See find_active_uprobe(). 992 - */ 993 - static void delete_uprobe(struct uprobe *uprobe) 994 - { 995 - if (WARN_ON(!uprobe_is_active(uprobe))) 996 - return; 997 - 998 - write_lock(&uprobes_treelock); 999 - rb_erase(&uprobe->rb_node, &uprobes_tree); 1000 - write_unlock(&uprobes_treelock); 1001 - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ 1002 - put_uprobe(uprobe); 1003 922 } 1004 923 1005 924 struct map_info { ··· 1085 1046 1086 1047 if (err && is_register) 1087 1048 goto free; 1088 - 1049 + /* 1050 + * We take mmap_lock for writing to avoid the race with 1051 + * find_active_uprobe_rcu() which takes mmap_lock for reading. 1052 + * Thus this install_breakpoint() can not make 1053 + * is_trap_at_addr() true right after find_uprobe_rcu() 1054 + * returns NULL in find_active_uprobe_rcu(). 1055 + */ 1089 1056 mmap_write_lock(mm); 1090 1057 vma = find_vma(mm, info->vaddr); 1091 1058 if (!vma || !valid_vma(vma, is_register) || ··· 1104 1059 1105 1060 if (is_register) { 1106 1061 /* consult only the "caller", new consumer. */ 1107 - if (consumer_filter(new, 1108 - UPROBE_FILTER_REGISTER, mm)) 1062 + if (consumer_filter(new, mm)) 1109 1063 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 1110 1064 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { 1111 - if (!filter_chain(uprobe, 1112 - UPROBE_FILTER_UNREGISTER, mm)) 1065 + if (!filter_chain(uprobe, mm)) 1113 1066 err |= remove_breakpoint(uprobe, mm, info->vaddr); 1114 1067 } 1115 1068 ··· 1122 1079 return err; 1123 1080 } 1124 1081 1125 - static void 1126 - __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) 1082 + /** 1083 + * uprobe_unregister_nosync - unregister an already registered probe. 1084 + * @uprobe: uprobe to remove 1085 + * @uc: identify which probe if multiple probes are colocated. 
1086 + */ 1087 + void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) 1127 1088 { 1128 1089 int err; 1129 1090 1130 - if (WARN_ON(!consumer_del(uprobe, uc))) 1131 - return; 1132 - 1133 - err = register_for_each_vma(uprobe, NULL); 1134 - /* TODO : cant unregister? schedule a worker thread */ 1135 - if (!uprobe->consumers && !err) 1136 - delete_uprobe(uprobe); 1137 - } 1138 - 1139 - /* 1140 - * uprobe_unregister - unregister an already registered probe. 1141 - * @inode: the file in which the probe has to be removed. 1142 - * @offset: offset from the start of the file. 1143 - * @uc: identify which probe if multiple probes are colocated. 1144 - */ 1145 - void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) 1146 - { 1147 - struct uprobe *uprobe; 1148 - 1149 - uprobe = find_uprobe(inode, offset); 1150 - if (WARN_ON(!uprobe)) 1151 - return; 1152 - 1153 1091 down_write(&uprobe->register_rwsem); 1154 - __uprobe_unregister(uprobe, uc); 1092 + consumer_del(uprobe, uc); 1093 + err = register_for_each_vma(uprobe, NULL); 1155 1094 up_write(&uprobe->register_rwsem); 1095 + 1096 + /* TODO : cant unregister? schedule a worker thread */ 1097 + if (unlikely(err)) { 1098 + uprobe_warn(current, "unregister, leaking uprobe"); 1099 + return; 1100 + } 1101 + 1156 1102 put_uprobe(uprobe); 1157 1103 } 1158 - EXPORT_SYMBOL_GPL(uprobe_unregister); 1104 + EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); 1159 1105 1160 - /* 1161 - * __uprobe_register - register a probe 1106 + void uprobe_unregister_sync(void) 1107 + { 1108 + /* 1109 + * Now that handler_chain() and handle_uretprobe_chain() iterate over 1110 + * uprobe->consumers list under RCU protection without holding 1111 + * uprobe->register_rwsem, we need to wait for RCU grace period to 1112 + * make sure that we can't call into just unregistered 1113 + * uprobe_consumer's callbacks anymore. 
If we don't do that, fast and 1114 + * unlucky enough caller can free consumer's memory and cause 1115 + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. 1116 + */ 1117 + synchronize_srcu(&uprobes_srcu); 1118 + } 1119 + EXPORT_SYMBOL_GPL(uprobe_unregister_sync); 1120 + 1121 + /** 1122 + * uprobe_register - register a probe 1162 1123 * @inode: the file in which the probe has to be placed. 1163 1124 * @offset: offset from the start of the file. 1125 + * @ref_ctr_offset: offset of SDT marker / reference counter 1164 1126 * @uc: information on howto handle the probe.. 1165 1127 * 1166 - * Apart from the access refcount, __uprobe_register() takes a creation 1128 + * Apart from the access refcount, uprobe_register() takes a creation 1167 1129 * refcount (thro alloc_uprobe) if and only if this @uprobe is getting 1168 1130 * inserted into the rbtree (i.e first consumer for a @inode:@offset 1169 1131 * tuple). Creation refcount stops uprobe_unregister from freeing the 1170 1132 * @uprobe even before the register operation is complete. Creation 1171 1133 * refcount is released when the last @uc for the @uprobe 1172 - * unregisters. Caller of __uprobe_register() is required to keep @inode 1134 + * unregisters. Caller of uprobe_register() is required to keep @inode 1173 1135 * (and the containing mount) referenced. 1174 1136 * 1175 - * Return errno if it cannot successully install probes 1176 - * else return 0 (success) 1137 + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. 
1177 1138 */ 1178 - static int __uprobe_register(struct inode *inode, loff_t offset, 1179 - loff_t ref_ctr_offset, struct uprobe_consumer *uc) 1139 + struct uprobe *uprobe_register(struct inode *inode, 1140 + loff_t offset, loff_t ref_ctr_offset, 1141 + struct uprobe_consumer *uc) 1180 1142 { 1181 1143 struct uprobe *uprobe; 1182 1144 int ret; 1183 1145 1184 1146 /* Uprobe must have at least one set consumer */ 1185 1147 if (!uc->handler && !uc->ret_handler) 1186 - return -EINVAL; 1148 + return ERR_PTR(-EINVAL); 1187 1149 1188 1150 /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ 1189 1151 if (!inode->i_mapping->a_ops->read_folio && 1190 1152 !shmem_mapping(inode->i_mapping)) 1191 - return -EIO; 1153 + return ERR_PTR(-EIO); 1192 1154 /* Racy, just to catch the obvious mistakes */ 1193 1155 if (offset > i_size_read(inode)) 1194 - return -EINVAL; 1156 + return ERR_PTR(-EINVAL); 1195 1157 1196 1158 /* 1197 1159 * This ensures that copy_from_page(), copy_to_page() and 1198 1160 * __update_ref_ctr() can't cross page boundary. 1199 1161 */ 1200 1162 if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) 1201 - return -EINVAL; 1163 + return ERR_PTR(-EINVAL); 1202 1164 if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) 1203 - return -EINVAL; 1165 + return ERR_PTR(-EINVAL); 1204 1166 1205 - retry: 1206 1167 uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); 1207 - if (!uprobe) 1208 - return -ENOMEM; 1209 1168 if (IS_ERR(uprobe)) 1210 - return PTR_ERR(uprobe); 1169 + return uprobe; 1211 1170 1212 - /* 1213 - * We can race with uprobe_unregister()->delete_uprobe(). 1214 - * Check uprobe_is_active() and retry if it is false. 
1215 - */ 1216 1171 down_write(&uprobe->register_rwsem); 1217 - ret = -EAGAIN; 1218 - if (likely(uprobe_is_active(uprobe))) { 1219 - consumer_add(uprobe, uc); 1220 - ret = register_for_each_vma(uprobe, uc); 1221 - if (ret) 1222 - __uprobe_unregister(uprobe, uc); 1223 - } 1172 + consumer_add(uprobe, uc); 1173 + ret = register_for_each_vma(uprobe, uc); 1224 1174 up_write(&uprobe->register_rwsem); 1225 - put_uprobe(uprobe); 1226 1175 1227 - if (unlikely(ret == -EAGAIN)) 1228 - goto retry; 1229 - return ret; 1230 - } 1176 + if (ret) { 1177 + uprobe_unregister_nosync(uprobe, uc); 1178 + /* 1179 + * Registration might have partially succeeded, so we can have 1180 + * this consumer being called right at this time. We need to 1181 + * sync here. It's ok, it's unlikely slow path. 1182 + */ 1183 + uprobe_unregister_sync(); 1184 + return ERR_PTR(ret); 1185 + } 1231 1186 1232 - int uprobe_register(struct inode *inode, loff_t offset, 1233 - struct uprobe_consumer *uc) 1234 - { 1235 - return __uprobe_register(inode, offset, 0, uc); 1187 + return uprobe; 1236 1188 } 1237 1189 EXPORT_SYMBOL_GPL(uprobe_register); 1238 1190 1239 - int uprobe_register_refctr(struct inode *inode, loff_t offset, 1240 - loff_t ref_ctr_offset, struct uprobe_consumer *uc) 1241 - { 1242 - return __uprobe_register(inode, offset, ref_ctr_offset, uc); 1243 - } 1244 - EXPORT_SYMBOL_GPL(uprobe_register_refctr); 1245 - 1246 - /* 1247 - * uprobe_apply - unregister an already registered probe. 1248 - * @inode: the file in which the probe has to be removed. 1249 - * @offset: offset from the start of the file. 1191 + /** 1192 + * uprobe_apply - add or remove the breakpoints according to @uc->filter 1193 + * @uprobe: uprobe which "owns" the breakpoint 1250 1194 * @uc: consumer which wants to add more or remove some breakpoints 1251 1195 * @add: add or remove the breakpoints 1196 + * Return: 0 on success or negative error code. 
1252 1197 */ 1253 - int uprobe_apply(struct inode *inode, loff_t offset, 1254 - struct uprobe_consumer *uc, bool add) 1198 + int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) 1255 1199 { 1256 - struct uprobe *uprobe; 1257 1200 struct uprobe_consumer *con; 1258 - int ret = -ENOENT; 1259 - 1260 - uprobe = find_uprobe(inode, offset); 1261 - if (WARN_ON(!uprobe)) 1262 - return ret; 1201 + int ret = -ENOENT, srcu_idx; 1263 1202 1264 1203 down_write(&uprobe->register_rwsem); 1265 - for (con = uprobe->consumers; con && con != uc ; con = con->next) 1266 - ; 1267 - if (con) 1268 - ret = register_for_each_vma(uprobe, add ? uc : NULL); 1204 + 1205 + srcu_idx = srcu_read_lock(&uprobes_srcu); 1206 + list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, 1207 + srcu_read_lock_held(&uprobes_srcu)) { 1208 + if (con == uc) { 1209 + ret = register_for_each_vma(uprobe, add ? uc : NULL); 1210 + break; 1211 + } 1212 + } 1213 + srcu_read_unlock(&uprobes_srcu, srcu_idx); 1214 + 1269 1215 up_write(&uprobe->register_rwsem); 1270 - put_uprobe(uprobe); 1271 1216 1272 1217 return ret; 1273 1218 } ··· 1336 1305 u = rb_entry(t, struct uprobe, rb_node); 1337 1306 if (u->inode != inode || u->offset < min) 1338 1307 break; 1339 - list_add(&u->pending_list, head); 1340 - get_uprobe(u); 1308 + /* if uprobe went away, it's safe to ignore it */ 1309 + if (try_get_uprobe(u)) 1310 + list_add(&u->pending_list, head); 1341 1311 } 1342 1312 for (t = n; (t = rb_next(t)); ) { 1343 1313 u = rb_entry(t, struct uprobe, rb_node); 1344 1314 if (u->inode != inode || u->offset > max) 1345 1315 break; 1346 - list_add(&u->pending_list, head); 1347 - get_uprobe(u); 1316 + /* if uprobe went away, it's safe to ignore it */ 1317 + if (try_get_uprobe(u)) 1318 + list_add(&u->pending_list, head); 1348 1319 } 1349 1320 } 1350 1321 read_unlock(&uprobes_treelock); ··· 1417 1384 */ 1418 1385 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1419 1386 if 
(!fatal_signal_pending(current) && 1420 - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { 1387 + filter_chain(uprobe, vma->vm_mm)) { 1421 1388 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1422 1389 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1423 1390 } ··· 1803 1770 return -ENOMEM; 1804 1771 1805 1772 *n = *o; 1773 + /* 1774 + * uprobe's refcnt has to be positive at this point, kept by 1775 + * utask->return_instances items; return_instances can't be 1776 + * removed right now, as task is blocked due to duping; so 1777 + * get_uprobe() is safe to use here. 1778 + */ 1806 1779 get_uprobe(n->uprobe); 1807 1780 n->next = NULL; 1808 1781 ··· 1818 1779 } 1819 1780 1820 1781 return 0; 1821 - } 1822 - 1823 - static void uprobe_warn(struct task_struct *t, const char *msg) 1824 - { 1825 - pr_warn("uprobe: %s:%d failed to %s\n", 1826 - current->comm, current->pid, msg); 1827 1782 } 1828 1783 1829 1784 static void dup_xol_work(struct callback_head *work) ··· 1916 1883 return; 1917 1884 } 1918 1885 1886 + /* we need to bump refcount to store uprobe in utask */ 1887 + if (!try_get_uprobe(uprobe)) 1888 + return; 1889 + 1919 1890 ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); 1920 1891 if (!ri) 1921 - return; 1892 + goto fail; 1922 1893 1923 1894 trampoline_vaddr = uprobe_get_trampoline_vaddr(); 1924 1895 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); ··· 1949 1912 } 1950 1913 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; 1951 1914 } 1952 - 1953 - ri->uprobe = get_uprobe(uprobe); 1915 + ri->uprobe = uprobe; 1954 1916 ri->func = instruction_pointer(regs); 1955 1917 ri->stack = user_stack_pointer(regs); 1956 1918 ri->orig_ret_vaddr = orig_ret_vaddr; ··· 1960 1924 utask->return_instances = ri; 1961 1925 1962 1926 return; 1963 - fail: 1927 + fail: 1964 1928 kfree(ri); 1929 + put_uprobe(uprobe); 1965 1930 } 1966 1931 1967 1932 /* Prepare to single-step probed instruction out of line. 
*/ ··· 1977 1940 if (!utask) 1978 1941 return -ENOMEM; 1979 1942 1943 + if (!try_get_uprobe(uprobe)) 1944 + return -EINVAL; 1945 + 1980 1946 xol_vaddr = xol_get_insn_slot(uprobe); 1981 - if (!xol_vaddr) 1982 - return -ENOMEM; 1947 + if (!xol_vaddr) { 1948 + err = -ENOMEM; 1949 + goto err_out; 1950 + } 1983 1951 1984 1952 utask->xol_vaddr = xol_vaddr; 1985 1953 utask->vaddr = bp_vaddr; ··· 1992 1950 err = arch_uprobe_pre_xol(&uprobe->arch, regs); 1993 1951 if (unlikely(err)) { 1994 1952 xol_free_insn_slot(current); 1995 - return err; 1953 + goto err_out; 1996 1954 } 1997 1955 1998 1956 utask->active_uprobe = uprobe; 1999 1957 utask->state = UTASK_SSTEP; 2000 1958 return 0; 1959 + err_out: 1960 + put_uprobe(uprobe); 1961 + return err; 2001 1962 } 2002 1963 2003 1964 /* ··· 2073 2028 if (likely(result == 0)) 2074 2029 goto out; 2075 2030 2076 - /* 2077 - * The NULL 'tsk' here ensures that any faults that occur here 2078 - * will not be accounted to the task. 'mm' *is* current->mm, 2079 - * but we treat this as a 'remote' access since it is 2080 - * essentially a kernel access to the memory. 
2081 - */ 2082 - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); 2031 + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); 2083 2032 if (result < 0) 2084 2033 return result; 2085 2034 ··· 2084 2045 return is_trap_insn(&opcode); 2085 2046 } 2086 2047 2087 - static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 2048 + /* assumes being inside RCU protected region */ 2049 + static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) 2088 2050 { 2089 2051 struct mm_struct *mm = current->mm; 2090 2052 struct uprobe *uprobe = NULL; ··· 2098 2058 struct inode *inode = file_inode(vma->vm_file); 2099 2059 loff_t offset = vaddr_to_offset(vma, bp_vaddr); 2100 2060 2101 - uprobe = find_uprobe(inode, offset); 2061 + uprobe = find_uprobe_rcu(inode, offset); 2102 2062 } 2103 2063 2104 2064 if (!uprobe) ··· 2119 2079 struct uprobe_consumer *uc; 2120 2080 int remove = UPROBE_HANDLER_REMOVE; 2121 2081 bool need_prep = false; /* prepare return uprobe, when needed */ 2082 + bool has_consumers = false; 2122 2083 2123 - down_read(&uprobe->register_rwsem); 2124 - for (uc = uprobe->consumers; uc; uc = uc->next) { 2084 + current->utask->auprobe = &uprobe->arch; 2085 + 2086 + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 2087 + srcu_read_lock_held(&uprobes_srcu)) { 2125 2088 int rc = 0; 2126 2089 2127 2090 if (uc->handler) { ··· 2137 2094 need_prep = true; 2138 2095 2139 2096 remove &= rc; 2097 + has_consumers = true; 2140 2098 } 2099 + current->utask->auprobe = NULL; 2141 2100 2142 2101 if (need_prep && !remove) 2143 2102 prepare_uretprobe(uprobe, regs); /* put bp at return */ 2144 2103 2145 - if (remove && uprobe->consumers) { 2146 - WARN_ON(!uprobe_is_active(uprobe)); 2147 - unapply_uprobe(uprobe, current->mm); 2104 + if (remove && has_consumers) { 2105 + down_read(&uprobe->register_rwsem); 2106 + 2107 + /* re-check that removal is still required, this time under lock */ 2108 + if 
(!filter_chain(uprobe, current->mm)) { 2109 + WARN_ON(!uprobe_is_active(uprobe)); 2110 + unapply_uprobe(uprobe, current->mm); 2111 + } 2112 + 2113 + up_read(&uprobe->register_rwsem); 2148 2114 } 2149 - up_read(&uprobe->register_rwsem); 2150 2115 } 2151 2116 2152 2117 static void ··· 2162 2111 { 2163 2112 struct uprobe *uprobe = ri->uprobe; 2164 2113 struct uprobe_consumer *uc; 2114 + int srcu_idx; 2165 2115 2166 - down_read(&uprobe->register_rwsem); 2167 - for (uc = uprobe->consumers; uc; uc = uc->next) { 2116 + srcu_idx = srcu_read_lock(&uprobes_srcu); 2117 + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 2118 + srcu_read_lock_held(&uprobes_srcu)) { 2168 2119 if (uc->ret_handler) 2169 2120 uc->ret_handler(uc, ri->func, regs); 2170 2121 } 2171 - up_read(&uprobe->register_rwsem); 2122 + srcu_read_unlock(&uprobes_srcu, srcu_idx); 2172 2123 } 2173 2124 2174 2125 static struct return_instance *find_next_ret_chain(struct return_instance *ri) ··· 2255 2202 { 2256 2203 struct uprobe *uprobe; 2257 2204 unsigned long bp_vaddr; 2258 - int is_swbp; 2205 + int is_swbp, srcu_idx; 2259 2206 2260 2207 bp_vaddr = uprobe_get_swbp_addr(regs); 2261 2208 if (bp_vaddr == uprobe_get_trampoline_vaddr()) 2262 2209 return uprobe_handle_trampoline(regs); 2263 2210 2264 - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 2211 + srcu_idx = srcu_read_lock(&uprobes_srcu); 2212 + 2213 + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); 2265 2214 if (!uprobe) { 2266 2215 if (is_swbp > 0) { 2267 2216 /* No matching uprobe; signal SIGTRAP. 
*/ ··· 2279 2224 */ 2280 2225 instruction_pointer_set(regs, bp_vaddr); 2281 2226 } 2282 - return; 2227 + goto out; 2283 2228 } 2284 2229 2285 2230 /* change it in advance for ->handler() and restart */ ··· 2314 2259 if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) 2315 2260 goto out; 2316 2261 2317 - if (!pre_ssout(uprobe, regs, bp_vaddr)) 2318 - return; 2262 + if (pre_ssout(uprobe, regs, bp_vaddr)) 2263 + goto out; 2319 2264 2320 - /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ 2321 2265 out: 2322 - put_uprobe(uprobe); 2266 + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ 2267 + srcu_read_unlock(&uprobes_srcu, srcu_idx); 2323 2268 } 2324 2269 2325 2270 /*
+21 -17
kernel/trace/bpf_trace.c
··· 3160 3160 loff_t offset; 3161 3161 unsigned long ref_ctr_offset; 3162 3162 u64 cookie; 3163 + struct uprobe *uprobe; 3163 3164 struct uprobe_consumer consumer; 3164 3165 }; 3165 3166 ··· 3179 3178 struct bpf_uprobe *uprobe; 3180 3179 }; 3181 3180 3182 - static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, 3183 - u32 cnt) 3181 + static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) 3184 3182 { 3185 3183 u32 i; 3186 3184 3187 - for (i = 0; i < cnt; i++) { 3188 - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, 3189 - &uprobes[i].consumer); 3190 - } 3185 + for (i = 0; i < cnt; i++) 3186 + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); 3187 + 3188 + if (cnt) 3189 + uprobe_unregister_sync(); 3191 3190 } 3192 3191 3193 3192 static void bpf_uprobe_multi_link_release(struct bpf_link *link) ··· 3195 3194 struct bpf_uprobe_multi_link *umulti_link; 3196 3195 3197 3196 umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); 3198 - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); 3197 + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); 3199 3198 if (umulti_link->task) 3200 3199 put_task_struct(umulti_link->task); 3201 3200 path_put(&umulti_link->path); ··· 3323 3322 } 3324 3323 3325 3324 static bool 3326 - uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, 3327 - struct mm_struct *mm) 3325 + uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) 3328 3326 { 3329 3327 struct bpf_uprobe *uprobe; 3330 3328 ··· 3480 3480 &bpf_uprobe_multi_link_lops, prog); 3481 3481 3482 3482 for (i = 0; i < cnt; i++) { 3483 - err = uprobe_register_refctr(d_real_inode(link->path.dentry), 3484 - uprobes[i].offset, 3485 - uprobes[i].ref_ctr_offset, 3486 - &uprobes[i].consumer); 3487 - if (err) { 3488 - bpf_uprobe_unregister(&path, uprobes, i); 3489 - goto error_free; 3483 + uprobes[i].uprobe = 
uprobe_register(d_real_inode(link->path.dentry), 3484 + uprobes[i].offset, 3485 + uprobes[i].ref_ctr_offset, 3486 + &uprobes[i].consumer); 3487 + if (IS_ERR(uprobes[i].uprobe)) { 3488 + err = PTR_ERR(uprobes[i].uprobe); 3489 + link->cnt = i; 3490 + goto error_unregister; 3490 3491 } 3491 3492 } 3492 3493 3493 3494 err = bpf_link_prime(&link->link, &link_primer); 3494 3495 if (err) 3495 - goto error_free; 3496 + goto error_unregister; 3496 3497 3497 3498 return bpf_link_settle(&link_primer); 3499 + 3500 + error_unregister: 3501 + bpf_uprobe_unregister(uprobes, link->cnt); 3498 3502 3499 3503 error_free: 3500 3504 kvfree(uprobes);
+20 -24
kernel/trace/trace_uprobe.c
··· 58 58 struct dyn_event devent; 59 59 struct uprobe_consumer consumer; 60 60 struct path path; 61 - struct inode *inode; 62 61 char *filename; 62 + struct uprobe *uprobe; 63 63 unsigned long offset; 64 64 unsigned long ref_ctr_offset; 65 65 unsigned long nhit; ··· 1078 1078 return trace_handle_return(s); 1079 1079 } 1080 1080 1081 - typedef bool (*filter_func_t)(struct uprobe_consumer *self, 1082 - enum uprobe_filter_ctx ctx, 1083 - struct mm_struct *mm); 1081 + typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); 1084 1082 1085 1083 static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) 1086 1084 { 1087 - int ret; 1085 + struct inode *inode = d_real_inode(tu->path.dentry); 1086 + struct uprobe *uprobe; 1088 1087 1089 1088 tu->consumer.filter = filter; 1090 - tu->inode = d_real_inode(tu->path.dentry); 1089 + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); 1090 + if (IS_ERR(uprobe)) 1091 + return PTR_ERR(uprobe); 1091 1092 1092 - if (tu->ref_ctr_offset) 1093 - ret = uprobe_register_refctr(tu->inode, tu->offset, 1094 - tu->ref_ctr_offset, &tu->consumer); 1095 - else 1096 - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 1097 - 1098 - if (ret) 1099 - tu->inode = NULL; 1100 - 1101 - return ret; 1093 + tu->uprobe = uprobe; 1094 + return 0; 1102 1095 } 1103 1096 1104 1097 static void __probe_event_disable(struct trace_probe *tp) 1105 1098 { 1106 1099 struct trace_uprobe *tu; 1100 + bool sync = false; 1107 1101 1108 1102 tu = container_of(tp, struct trace_uprobe, tp); 1109 1103 WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); 1110 1104 1111 1105 list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { 1112 - if (!tu->inode) 1106 + if (!tu->uprobe) 1113 1107 continue; 1114 1108 1115 - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 1116 - tu->inode = NULL; 1109 + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); 1110 + sync = true; 1111 + tu->uprobe 
= NULL; 1117 1112 } 1113 + if (sync) 1114 + uprobe_unregister_sync(); 1118 1115 } 1119 1116 1120 1117 static int probe_event_enable(struct trace_event_call *call, ··· 1307 1310 return 0; 1308 1311 1309 1312 list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { 1310 - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); 1313 + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); 1311 1314 if (ret) 1312 1315 break; 1313 1316 } ··· 1331 1334 return 0; 1332 1335 1333 1336 list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { 1334 - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); 1337 + err = uprobe_apply(tu->uprobe, &tu->consumer, true); 1335 1338 if (err) { 1336 1339 uprobe_perf_close(call, event); 1337 1340 break; ··· 1341 1344 return err; 1342 1345 } 1343 1346 1344 - static bool uprobe_perf_filter(struct uprobe_consumer *uc, 1345 - enum uprobe_filter_ctx ctx, struct mm_struct *mm) 1347 + static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) 1346 1348 { 1347 1349 struct trace_uprobe_filter *filter; 1348 1350 struct trace_uprobe *tu; ··· 1427 1431 static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, 1428 1432 struct uprobe_cpu_buffer **ucbp) 1429 1433 { 1430 - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 1434 + if (!uprobe_perf_filter(&tu->consumer, current->mm)) 1431 1435 return UPROBE_HANDLER_REMOVE; 1432 1436 1433 1437 if (!is_ret_probe(tu))
+14 -13
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
··· 434 434 435 435 struct testmod_uprobe { 436 436 struct path path; 437 - loff_t offset; 437 + struct uprobe *uprobe; 438 438 struct uprobe_consumer consumer; 439 439 }; 440 440 ··· 448 448 { 449 449 int err = -EBUSY; 450 450 451 - if (uprobe.offset) 451 + if (uprobe.uprobe) 452 452 return -EBUSY; 453 453 454 454 mutex_lock(&testmod_uprobe_mutex); 455 455 456 - if (uprobe.offset) 456 + if (uprobe.uprobe) 457 457 goto out; 458 458 459 459 err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path); 460 460 if (err) 461 461 goto out; 462 462 463 - err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry), 464 - offset, 0, &uprobe.consumer); 465 - if (err) 463 + uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry), 464 + offset, 0, &uprobe.consumer); 465 + if (IS_ERR(uprobe.uprobe)) { 466 + err = PTR_ERR(uprobe.uprobe); 466 467 path_put(&uprobe.path); 467 - else 468 - uprobe.offset = offset; 469 - 468 + uprobe.uprobe = NULL; 469 + } 470 470 out: 471 471 mutex_unlock(&testmod_uprobe_mutex); 472 472 return err; ··· 476 476 { 477 477 mutex_lock(&testmod_uprobe_mutex); 478 478 479 - if (uprobe.offset) { 480 - uprobe_unregister(d_real_inode(uprobe.path.dentry), 481 - uprobe.offset, &uprobe.consumer); 482 - uprobe.offset = 0; 479 + if (uprobe.uprobe) { 480 + uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer); 481 + uprobe_unregister_sync(); 482 + path_put(&uprobe.path); 483 + uprobe.uprobe = NULL; 483 484 } 484 485 485 486 mutex_unlock(&testmod_uprobe_mutex);