Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
"Seqlock optimizations that arose in a perf context and were merged
into the perf tree:

- seqlock: Add raw_seqcount_try_begin() (Suren Baghdasaryan)
- mm: Convert mm_lock_seq to a proper seqcount (Suren Baghdasaryan)
- mm: Introduce mmap_lock_speculate_{try_begin|retry}
(Suren Baghdasaryan)
- mm/gup: Use raw_seqcount_try_begin() (Peter Zijlstra)

Core perf enhancements:

- Reduce 'struct page' footprint of perf by mapping pages in advance
(Lorenzo Stoakes)
- Save raw sample data conditionally based on sample type (Yabin Cui)
- Reduce sampling overhead by checking sample_type in
perf_sample_save_callchain() and perf_sample_save_brstack()
(Yabin Cui)
- Export perf_exclude_event() (Namhyung Kim)

Uprobes scalability enhancements: (Andrii Nakryiko)

- Simplify find_active_uprobe_rcu() VMA checks
- Add speculative lockless VMA-to-inode-to-uprobe resolution
- Simplify session consumer tracking
- Decouple return_instance list traversal and freeing
- Ensure return_instance is detached from the list before freeing
- Reuse return_instances between multiple uretprobes within task
- Guard against kmemdup() failing in dup_return_instance()

AMD core PMU driver enhancements:

- Relax privilege filter restriction on AMD IBS (Namhyung Kim)

AMD RAPL energy counters support: (Dhananjay Ugwekar)

- Introduce topology_logical_core_id() (K Prateek Nayak)
- Remove the unused get_rapl_pmu_cpumask() function
- Remove the cpu_to_rapl_pmu() function
- Rename rapl_pmu variables
- Make rapl_model struct global
- Add arguments to the init and cleanup functions
- Modify the generic variable names to *_pkg*
- Remove the global variable rapl_msrs
- Move the cntr_mask to rapl_pmus struct
- Add core energy counter support for AMD CPUs

Intel core PMU driver enhancements:

- Support RDPMC 'metrics clear mode' feature (Kan Liang)
- Clarify adaptive PEBS processing (Kan Liang)
- Factor out functions for PEBS records processing (Kan Liang)
- Simplify the PEBS records processing for adaptive PEBS (Kan Liang)

Intel uncore driver enhancements: (Kan Liang)

- Convert buggy pmu->func_id use to pmu->registered
- Support more units on Granite Rapids"

* tag 'perf-core-2025-01-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
perf: map pages in advance
perf/x86/intel/uncore: Support more units on Granite Rapids
perf/x86/intel/uncore: Clean up func_id
perf/x86/intel: Support RDPMC metrics clear mode
uprobes: Guard against kmemdup() failing in dup_return_instance()
perf/x86: Relax privilege filter restriction on AMD IBS
perf/core: Export perf_exclude_event()
uprobes: Reuse return_instances between multiple uretprobes within task
uprobes: Ensure return_instance is detached from the list before freeing
uprobes: Decouple return_instance list traversal and freeing
uprobes: Simplify session consumer tracking
uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution
uprobes: simplify find_active_uprobe_rcu() VMA checks
mm: introduce mmap_lock_speculate_{try_begin|retry}
mm: convert mm_lock_seq to a proper seqcount
mm/gup: Use raw_seqcount_try_begin()
seqlock: add raw_seqcount_try_begin
perf/x86/rapl: Add core energy counter support for AMD CPUs
perf/x86/rapl: Move the cntr_mask to rapl_pmus struct
perf/x86/rapl: Remove the global variable rapl_msrs
...

+958 -499
+4
Documentation/arch/x86/topology.rst
··· 135 135 The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo 136 136 "core_id." 137 137 138 + - topology_logical_core_id(); 139 + 140 + The logical core ID to which a thread belongs. 141 + 138 142 139 143 140 144 System topology examples
+1 -1
arch/s390/kernel/perf_cpum_cf.c
··· 981 981 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 982 982 raw.frag.size = cpuhw->usedss; 983 983 raw.frag.data = cpuhw->stop; 984 - perf_sample_save_raw_data(&data, &raw); 984 + perf_sample_save_raw_data(&data, event, &raw); 985 985 } 986 986 987 987 overflow = perf_event_overflow(event, &data, &regs);
+3 -3
arch/s390/kernel/perf_cpum_sf.c
··· 981 981 cpuhw->flags &= ~PMU_F_ENABLED; 982 982 } 983 983 984 - /* perf_exclude_event() - Filter event 984 + /* perf_event_exclude() - Filter event 985 985 * @event: The perf event 986 986 * @regs: pt_regs structure 987 987 * @sde_regs: Sample-data-entry (sde) regs structure ··· 990 990 * 991 991 * Return non-zero if the event shall be excluded. 992 992 */ 993 - static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, 993 + static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, 994 994 struct perf_sf_sde_regs *sde_regs) 995 995 { 996 996 if (event->attr.exclude_user && user_mode(regs)) ··· 1073 1073 data.tid_entry.pid = basic->hpp & LPP_PID_MASK; 1074 1074 1075 1075 overflow = 0; 1076 - if (perf_exclude_event(event, &regs, sde_regs)) 1076 + if (perf_event_exclude(event, &regs, sde_regs)) 1077 1077 goto out; 1078 1078 if (perf_event_overflow(event, &data, &regs)) { 1079 1079 overflow = 1;
+1 -1
arch/s390/kernel/perf_pai_crypto.c
··· 478 478 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 479 479 raw.frag.size = rawsize; 480 480 raw.frag.data = cpump->save; 481 - perf_sample_save_raw_data(&data, &raw); 481 + perf_sample_save_raw_data(&data, event, &raw); 482 482 } 483 483 484 484 overflow = perf_event_overflow(event, &data, &regs);
+1 -1
arch/s390/kernel/perf_pai_ext.c
··· 503 503 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 504 504 raw.frag.size = rawsize; 505 505 raw.frag.data = cpump->save; 506 - perf_sample_save_raw_data(&data, &raw); 506 + perf_sample_save_raw_data(&data, event, &raw); 507 507 } 508 508 509 509 overflow = perf_event_overflow(event, &data, &regs);
+1 -2
arch/x86/events/amd/core.c
··· 1001 1001 if (!x86_perf_event_set_period(event)) 1002 1002 continue; 1003 1003 1004 - if (has_branch_stack(event)) 1005 - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1004 + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1006 1005 1007 1006 if (perf_event_overflow(event, &data, regs)) 1008 1007 x86_pmu_stop(event, 0);
+43 -21
arch/x86/events/amd/ibs.c
··· 31 31 #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) 32 32 #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT 33 33 34 + /* attr.config2 */ 35 + #define IBS_SW_FILTER_MASK 1 34 36 35 37 /* 36 38 * IBS states: ··· 291 289 292 290 if (has_branch_stack(event)) 293 291 return -EOPNOTSUPP; 292 + 293 + /* handle exclude_{user,kernel} in the IRQ handler */ 294 + if (event->attr.exclude_host || event->attr.exclude_guest || 295 + event->attr.exclude_idle) 296 + return -EINVAL; 297 + 298 + if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && 299 + (event->attr.exclude_kernel || event->attr.exclude_user || 300 + event->attr.exclude_hv)) 301 + return -EINVAL; 294 302 295 303 ret = validate_group(event); 296 304 if (ret) ··· 562 550 NULL, 563 551 }; 564 552 565 - static struct attribute_group empty_format_group = { 566 - .name = "format", 567 - .attrs = attrs_empty, 568 - }; 569 - 570 553 static struct attribute_group empty_caps_group = { 571 554 .name = "caps", 572 555 .attrs = attrs_empty, 573 556 }; 574 557 575 - static const struct attribute_group *empty_attr_groups[] = { 576 - &empty_format_group, 577 - &empty_caps_group, 578 - NULL, 579 - }; 580 - 581 558 PMU_FORMAT_ATTR(rand_en, "config:57"); 582 559 PMU_FORMAT_ATTR(cnt_ctl, "config:19"); 560 + PMU_FORMAT_ATTR(swfilt, "config2:0"); 583 561 PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); 584 562 PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); 585 563 PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); ··· 580 578 return ibs_caps & IBS_CAPS_ZEN4 ? 
attr->mode : 0; 581 579 } 582 580 583 - static struct attribute *rand_en_attrs[] = { 581 + static struct attribute *fetch_attrs[] = { 584 582 &format_attr_rand_en.attr, 583 + &format_attr_swfilt.attr, 585 584 NULL, 586 585 }; 587 586 ··· 596 593 NULL, 597 594 }; 598 595 599 - static struct attribute_group group_rand_en = { 596 + static struct attribute_group group_fetch_formats = { 600 597 .name = "format", 601 - .attrs = rand_en_attrs, 598 + .attrs = fetch_attrs, 602 599 }; 603 600 604 601 static struct attribute_group group_fetch_l3missonly = { ··· 614 611 }; 615 612 616 613 static const struct attribute_group *fetch_attr_groups[] = { 617 - &group_rand_en, 614 + &group_fetch_formats, 618 615 &empty_caps_group, 619 616 NULL, 620 617 }; ··· 631 628 return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; 632 629 } 633 630 631 + static struct attribute *op_attrs[] = { 632 + &format_attr_swfilt.attr, 633 + NULL, 634 + }; 635 + 634 636 static struct attribute *cnt_ctl_attrs[] = { 635 637 &format_attr_cnt_ctl.attr, 636 638 NULL, ··· 644 636 static struct attribute *op_l3missonly_attrs[] = { 645 637 &op_l3missonly.attr.attr, 646 638 NULL, 639 + }; 640 + 641 + static struct attribute_group group_op_formats = { 642 + .name = "format", 643 + .attrs = op_attrs, 647 644 }; 648 645 649 646 static struct attribute_group group_cnt_ctl = { ··· 661 648 .name = "format", 662 649 .attrs = op_l3missonly_attrs, 663 650 .is_visible = zen4_ibs_extensions_is_visible, 651 + }; 652 + 653 + static const struct attribute_group *op_attr_groups[] = { 654 + &group_op_formats, 655 + &empty_caps_group, 656 + NULL, 664 657 }; 665 658 666 659 static const struct attribute_group *op_attr_update[] = { ··· 686 667 .start = perf_ibs_start, 687 668 .stop = perf_ibs_stop, 688 669 .read = perf_ibs_read, 689 - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 690 670 }, 691 671 .msr = MSR_AMD64_IBSFETCHCTL, 692 672 .config_mask = IBS_FETCH_CONFIG_MASK, ··· 709 691 .start = perf_ibs_start, 710 692 .stop = perf_ibs_stop, 
711 693 .read = perf_ibs_read, 712 - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 713 694 }, 714 695 .msr = MSR_AMD64_IBSOPCTL, 715 696 .config_mask = IBS_OP_CONFIG_MASK, ··· 1128 1111 regs.flags |= PERF_EFLAGS_EXACT; 1129 1112 } 1130 1113 1114 + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && 1115 + perf_exclude_event(event, &regs)) { 1116 + throttle = perf_event_account_interrupt(event); 1117 + goto out; 1118 + } 1119 + 1131 1120 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 1132 1121 raw = (struct perf_raw_record){ 1133 1122 .frag = { ··· 1141 1118 .data = ibs_data.data, 1142 1119 }, 1143 1120 }; 1144 - perf_sample_save_raw_data(&data, &raw); 1121 + perf_sample_save_raw_data(&data, event, &raw); 1145 1122 } 1146 1123 1147 1124 if (perf_ibs == &perf_ibs_op) ··· 1152 1129 * recorded as part of interrupt regs. Thus we need to use rip from 1153 1130 * interrupt regs while unwinding call stack. 1154 1131 */ 1155 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 1156 - perf_sample_save_callchain(&data, event, iregs); 1132 + perf_sample_save_callchain(&data, event, iregs); 1157 1133 1158 1134 throttle = perf_event_overflow(event, &data, &regs); 1159 1135 out: ··· 1250 1228 if (ibs_caps & IBS_CAPS_ZEN4) 1251 1229 perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; 1252 1230 1253 - perf_ibs_op.pmu.attr_groups = empty_attr_groups; 1231 + perf_ibs_op.pmu.attr_groups = op_attr_groups; 1254 1232 perf_ibs_op.pmu.attr_update = op_attr_update; 1255 1233 1256 1234 return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+1 -2
arch/x86/events/core.c
··· 1707 1707 1708 1708 perf_sample_data_init(&data, 0, event->hw.last_period); 1709 1709 1710 - if (has_branch_stack(event)) 1711 - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1710 + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1712 1711 1713 1712 if (perf_event_overflow(event, &data, regs)) 1714 1713 x86_pmu_stop(event, 0);
+19 -1
arch/x86/events/intel/core.c
··· 2826 2826 return; 2827 2827 2828 2828 idx = INTEL_PMC_IDX_FIXED_SLOTS; 2829 + 2830 + if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR) 2831 + bits |= INTEL_FIXED_3_METRICS_CLEAR; 2829 2832 } 2830 2833 2831 2834 intel_set_masks(event, idx); ··· 4084 4081 * is used in a metrics group, it too cannot support sampling. 4085 4082 */ 4086 4083 if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { 4087 - if (event->attr.config1 || event->attr.config2) 4084 + /* The metrics_clear can only be set for the slots event */ 4085 + if (event->attr.config1 && 4086 + (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) 4087 + return -EINVAL; 4088 + 4089 + if (event->attr.config2) 4088 4090 return -EINVAL; 4089 4091 4090 4092 /* ··· 4698 4690 PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); 4699 4691 PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ 4700 4692 4693 + PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ 4694 + 4701 4695 static ssize_t umask2_show(struct device *dev, 4702 4696 struct device_attribute *attr, 4703 4697 char *page) ··· 4719 4709 static struct attribute *format_evtsel_ext_attrs[] = { 4720 4710 &format_attr_umask2.attr, 4721 4711 &format_attr_eq.attr, 4712 + &format_attr_metrics_clear.attr, 4722 4713 NULL 4723 4714 }; 4724 4715 ··· 4743 4732 mask = hybrid(dev_get_drvdata(dev), config_mask); 4744 4733 if (i == 1) 4745 4734 return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; 4735 + 4736 + /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ 4737 + if (i == 2) { 4738 + union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap); 4739 + 4740 + return intel_cap.rdpmc_metrics_clear ? attr->mode : 0; 4741 + } 4746 4742 4747 4743 return 0; 4748 4744 }
+116 -82
arch/x86/events/intel/ds.c
··· 1789 1789 * previous PMI context or an (I)RET happened between the record and 1790 1790 * PMI. 1791 1791 */ 1792 - if (sample_type & PERF_SAMPLE_CALLCHAIN) 1793 - perf_sample_save_callchain(data, event, iregs); 1792 + perf_sample_save_callchain(data, event, iregs); 1794 1793 1795 1794 /* 1796 1795 * We use the interrupt regs as a base because the PEBS record does not ··· 1888 1889 if (x86_pmu.intel_cap.pebs_format >= 3) 1889 1890 setup_pebs_time(event, data, pebs->tsc); 1890 1891 1891 - if (has_branch_stack(event)) 1892 - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); 1892 + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); 1893 1893 } 1894 1894 1895 1895 static void adaptive_pebs_save_regs(struct pt_regs *regs, ··· 1915 1917 } 1916 1918 1917 1919 #define PEBS_LATENCY_MASK 0xffff 1918 - #define PEBS_CACHE_LATENCY_OFFSET 32 1919 - #define PEBS_RETIRE_LATENCY_OFFSET 32 1920 1920 1921 1921 /* 1922 1922 * With adaptive PEBS the layout depends on what fields are configured. ··· 1928 1932 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1929 1933 struct pebs_basic *basic = __pebs; 1930 1934 void *next_record = basic + 1; 1931 - u64 sample_type; 1932 - u64 format_size; 1935 + u64 sample_type, format_group; 1933 1936 struct pebs_meminfo *meminfo = NULL; 1934 1937 struct pebs_gprs *gprs = NULL; 1935 1938 struct x86_perf_regs *perf_regs; ··· 1940 1945 perf_regs->xmm_regs = NULL; 1941 1946 1942 1947 sample_type = event->attr.sample_type; 1943 - format_size = basic->format_size; 1948 + format_group = basic->format_group; 1944 1949 perf_sample_data_init(data, 0, event->hw.last_period); 1945 1950 data->period = event->hw.last_period; 1946 1951 ··· 1952 1957 * previous PMI context or an (I)RET happened between the record and 1953 1958 * PMI. 
1954 1959 */ 1955 - if (sample_type & PERF_SAMPLE_CALLCHAIN) 1956 - perf_sample_save_callchain(data, event, iregs); 1960 + perf_sample_save_callchain(data, event, iregs); 1957 1961 1958 1962 *regs = *iregs; 1959 1963 /* The ip in basic is EventingIP */ ··· 1961 1967 1962 1968 if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 1963 1969 if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) 1964 - data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; 1970 + data->weight.var3_w = basic->retire_latency; 1965 1971 else 1966 1972 data->weight.var3_w = 0; 1967 1973 } ··· 1971 1977 * But PERF_SAMPLE_TRANSACTION needs gprs->ax. 1972 1978 * Save the pointer here but process later. 1973 1979 */ 1974 - if (format_size & PEBS_DATACFG_MEMINFO) { 1980 + if (format_group & PEBS_DATACFG_MEMINFO) { 1975 1981 meminfo = next_record; 1976 1982 next_record = meminfo + 1; 1977 1983 } 1978 1984 1979 - if (format_size & PEBS_DATACFG_GP) { 1985 + if (format_group & PEBS_DATACFG_GP) { 1980 1986 gprs = next_record; 1981 1987 next_record = gprs + 1; 1982 1988 ··· 1989 1995 adaptive_pebs_save_regs(regs, gprs); 1990 1996 } 1991 1997 1992 - if (format_size & PEBS_DATACFG_MEMINFO) { 1998 + if (format_group & PEBS_DATACFG_MEMINFO) { 1993 1999 if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 1994 - u64 weight = meminfo->latency; 2000 + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? 2001 + meminfo->cache_latency : meminfo->mem_latency; 1995 2002 1996 - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { 1997 - data->weight.var2_w = weight & PEBS_LATENCY_MASK; 1998 - weight >>= PEBS_CACHE_LATENCY_OFFSET; 1999 - } 2003 + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) 2004 + data->weight.var2_w = meminfo->instr_latency; 2000 2005 2001 2006 /* 2002 2007 * Although meminfo::latency is defined as a u64, ··· 2003 2010 * in practice on Ice Lake and earlier platforms. 
2004 2011 */ 2005 2012 if (sample_type & PERF_SAMPLE_WEIGHT) { 2006 - data->weight.full = weight ?: 2013 + data->weight.full = latency ?: 2007 2014 intel_get_tsx_weight(meminfo->tsx_tuning); 2008 2015 } else { 2009 - data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: 2016 + data->weight.var1_dw = (u32)latency ?: 2010 2017 intel_get_tsx_weight(meminfo->tsx_tuning); 2011 2018 } 2019 + 2012 2020 data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 2013 2021 } 2014 2022 ··· 2030 2036 } 2031 2037 } 2032 2038 2033 - if (format_size & PEBS_DATACFG_XMMS) { 2039 + if (format_group & PEBS_DATACFG_XMMS) { 2034 2040 struct pebs_xmm *xmm = next_record; 2035 2041 2036 2042 next_record = xmm + 1; 2037 2043 perf_regs->xmm_regs = xmm->xmm; 2038 2044 } 2039 2045 2040 - if (format_size & PEBS_DATACFG_LBRS) { 2046 + if (format_group & PEBS_DATACFG_LBRS) { 2041 2047 struct lbr_entry *lbr = next_record; 2042 - int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) 2048 + int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT) 2043 2049 & 0xff) + 1; 2044 2050 next_record = next_record + num_lbr * sizeof(struct lbr_entry); 2045 2051 ··· 2049 2055 } 2050 2056 } 2051 2057 2052 - WARN_ONCE(next_record != __pebs + (format_size >> 48), 2053 - "PEBS record size %llu, expected %llu, config %llx\n", 2054 - format_size >> 48, 2058 + WARN_ONCE(next_record != __pebs + basic->format_size, 2059 + "PEBS record size %u, expected %llu, config %llx\n", 2060 + basic->format_size, 2055 2061 (u64)(next_record - __pebs), 2056 - basic->format_size); 2062 + format_group); 2057 2063 } 2058 2064 2059 2065 static inline void * ··· 2164 2170 return 0; 2165 2171 } 2166 2172 2173 + typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *, 2174 + struct perf_sample_data *, struct pt_regs *); 2175 + 2176 + static struct pt_regs dummy_iregs; 2177 + 2167 2178 static __always_inline void 2168 2179 __intel_pmu_pebs_event(struct perf_event *event, 2169 2180 struct pt_regs *iregs, 2181 + struct pt_regs 
*regs, 2170 2182 struct perf_sample_data *data, 2171 - void *base, void *top, 2172 - int bit, int count, 2173 - void (*setup_sample)(struct perf_event *, 2174 - struct pt_regs *, 2175 - void *, 2176 - struct perf_sample_data *, 2177 - struct pt_regs *)) 2183 + void *at, 2184 + setup_fn setup_sample) 2178 2185 { 2179 - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2186 + setup_sample(event, iregs, at, data, regs); 2187 + perf_event_output(event, data, regs); 2188 + } 2189 + 2190 + static __always_inline void 2191 + __intel_pmu_pebs_last_event(struct perf_event *event, 2192 + struct pt_regs *iregs, 2193 + struct pt_regs *regs, 2194 + struct perf_sample_data *data, 2195 + void *at, 2196 + int count, 2197 + setup_fn setup_sample) 2198 + { 2180 2199 struct hw_perf_event *hwc = &event->hw; 2181 - struct x86_perf_regs perf_regs; 2182 - struct pt_regs *regs = &perf_regs.regs; 2183 - void *at = get_next_pebs_record_by_bit(base, top, bit); 2184 - static struct pt_regs dummy_iregs; 2185 - 2186 - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { 2187 - /* 2188 - * Now, auto-reload is only enabled in fixed period mode. 2189 - * The reload value is always hwc->sample_period. 2190 - * May need to change it, if auto-reload is enabled in 2191 - * freq mode later. 
2192 - */ 2193 - intel_pmu_save_and_restart_reload(event, count); 2194 - } else if (!intel_pmu_save_and_restart(event)) 2195 - return; 2196 - 2197 - if (!iregs) 2198 - iregs = &dummy_iregs; 2199 - 2200 - while (count > 1) { 2201 - setup_sample(event, iregs, at, data, regs); 2202 - perf_event_output(event, data, regs); 2203 - at += cpuc->pebs_record_size; 2204 - at = get_next_pebs_record_by_bit(at, top, bit); 2205 - count--; 2206 - } 2207 2200 2208 2201 setup_sample(event, iregs, at, data, regs); 2209 2202 if (iregs == &dummy_iregs) { ··· 2209 2228 if (perf_event_overflow(event, data, regs)) 2210 2229 x86_pmu_stop(event, 0); 2211 2230 } 2231 + 2232 + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { 2233 + /* 2234 + * Now, auto-reload is only enabled in fixed period mode. 2235 + * The reload value is always hwc->sample_period. 2236 + * May need to change it, if auto-reload is enabled in 2237 + * freq mode later. 2238 + */ 2239 + intel_pmu_save_and_restart_reload(event, count); 2240 + } else 2241 + intel_pmu_save_and_restart(event); 2242 + } 2243 + 2244 + static __always_inline void 2245 + __intel_pmu_pebs_events(struct perf_event *event, 2246 + struct pt_regs *iregs, 2247 + struct perf_sample_data *data, 2248 + void *base, void *top, 2249 + int bit, int count, 2250 + setup_fn setup_sample) 2251 + { 2252 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2253 + struct x86_perf_regs perf_regs; 2254 + struct pt_regs *regs = &perf_regs.regs; 2255 + void *at = get_next_pebs_record_by_bit(base, top, bit); 2256 + int cnt = count; 2257 + 2258 + if (!iregs) 2259 + iregs = &dummy_iregs; 2260 + 2261 + while (cnt > 1) { 2262 + __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample); 2263 + at += cpuc->pebs_record_size; 2264 + at = get_next_pebs_record_by_bit(at, top, bit); 2265 + cnt--; 2266 + } 2267 + 2268 + __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample); 2212 2269 } 2213 2270 2214 2271 static void 
intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) ··· 2283 2264 return; 2284 2265 } 2285 2266 2286 - __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, 2287 - setup_pebs_fixed_sample_data); 2267 + __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n, 2268 + setup_pebs_fixed_sample_data); 2288 2269 } 2289 2270 2290 2271 static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) ··· 2415 2396 } 2416 2397 2417 2398 if (counts[bit]) { 2418 - __intel_pmu_pebs_event(event, iregs, data, base, 2419 - top, bit, counts[bit], 2420 - setup_pebs_fixed_sample_data); 2399 + __intel_pmu_pebs_events(event, iregs, data, base, 2400 + top, bit, counts[bit], 2401 + setup_pebs_fixed_sample_data); 2421 2402 } 2422 2403 } 2423 2404 } ··· 2425 2406 static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) 2426 2407 { 2427 2408 short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; 2409 + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; 2428 2410 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2429 2411 struct debug_store *ds = cpuc->ds; 2412 + struct x86_perf_regs perf_regs; 2413 + struct pt_regs *regs = &perf_regs.regs; 2414 + struct pebs_basic *basic; 2430 2415 struct perf_event *event; 2431 2416 void *base, *at, *top; 2432 2417 int bit; ··· 2452 2429 return; 2453 2430 } 2454 2431 2455 - for (at = base; at < top; at += cpuc->pebs_record_size) { 2432 + if (!iregs) 2433 + iregs = &dummy_iregs; 2434 + 2435 + /* Process all but the last event for each counter. 
*/ 2436 + for (at = base; at < top; at += basic->format_size) { 2456 2437 u64 pebs_status; 2457 2438 2458 - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; 2459 - pebs_status &= mask; 2439 + basic = at; 2440 + if (basic->format_size != cpuc->pebs_record_size) 2441 + continue; 2460 2442 2461 - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) 2462 - counts[bit]++; 2443 + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; 2444 + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { 2445 + event = cpuc->events[bit]; 2446 + 2447 + if (WARN_ON_ONCE(!event) || 2448 + WARN_ON_ONCE(!event->attr.precise_ip)) 2449 + continue; 2450 + 2451 + if (counts[bit]++) { 2452 + __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], 2453 + setup_pebs_adaptive_sample_data); 2454 + } 2455 + last[bit] = at; 2456 + } 2463 2457 } 2464 2458 2465 2459 for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { 2466 - if (counts[bit] == 0) 2460 + if (!counts[bit]) 2467 2461 continue; 2468 2462 2469 2463 event = cpuc->events[bit]; 2470 - if (WARN_ON_ONCE(!event)) 2471 - continue; 2472 2464 2473 - if (WARN_ON_ONCE(!event->attr.precise_ip)) 2474 - continue; 2475 - 2476 - __intel_pmu_pebs_event(event, iregs, data, base, 2477 - top, bit, counts[bit], 2478 - setup_pebs_adaptive_sample_data); 2465 + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], 2466 + counts[bit], setup_pebs_adaptive_sample_data); 2479 2467 } 2480 2468 } 2481 2469
+7 -13
arch/x86/events/intel/uncore.c
··· 745 745 746 746 pmu = uncore_event_to_pmu(event); 747 747 /* no device found for this pmu */ 748 - if (pmu->func_id < 0) 748 + if (!pmu->registered) 749 749 return -ENOENT; 750 750 751 751 /* Sampling not supported yet */ ··· 992 992 uncore_type_exit(*types); 993 993 } 994 994 995 - static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) 995 + static int __init uncore_type_init(struct intel_uncore_type *type) 996 996 { 997 997 struct intel_uncore_pmu *pmus; 998 998 size_t size; ··· 1005 1005 size = uncore_max_dies() * sizeof(struct intel_uncore_box *); 1006 1006 1007 1007 for (i = 0; i < type->num_boxes; i++) { 1008 - pmus[i].func_id = setid ? i : -1; 1009 1008 pmus[i].pmu_idx = i; 1010 1009 pmus[i].type = type; 1011 1010 pmus[i].boxes = kzalloc(size, GFP_KERNEL); ··· 1054 1055 } 1055 1056 1056 1057 static int __init 1057 - uncore_types_init(struct intel_uncore_type **types, bool setid) 1058 + uncore_types_init(struct intel_uncore_type **types) 1058 1059 { 1059 1060 int ret; 1060 1061 1061 1062 for (; *types; types++) { 1062 - ret = uncore_type_init(*types, setid); 1063 + ret = uncore_type_init(*types); 1063 1064 if (ret) 1064 1065 return ret; 1065 1066 } ··· 1158 1159 box = uncore_alloc_box(type, NUMA_NO_NODE); 1159 1160 if (!box) 1160 1161 return -ENOMEM; 1161 - 1162 - if (pmu->func_id < 0) 1163 - pmu->func_id = pdev->devfn; 1164 - else 1165 - WARN_ON_ONCE(pmu->func_id != pdev->devfn); 1166 1162 1167 1163 atomic_inc(&box->refcnt); 1168 1164 box->dieid = die; ··· 1404 1410 goto err; 1405 1411 } 1406 1412 1407 - ret = uncore_types_init(uncore_pci_uncores, false); 1413 + ret = uncore_types_init(uncore_pci_uncores); 1408 1414 if (ret) 1409 1415 goto errtype; 1410 1416 ··· 1672 1678 { 1673 1679 int ret; 1674 1680 1675 - ret = uncore_types_init(uncore_msr_uncores, true); 1681 + ret = uncore_types_init(uncore_msr_uncores); 1676 1682 if (ret) 1677 1683 goto err; 1678 1684 ··· 1691 1697 struct intel_uncore_type **types = uncore_mmio_uncores; 
1692 1698 int ret; 1693 1699 1694 - ret = uncore_types_init(types, true); 1700 + ret = uncore_types_init(types); 1695 1701 if (ret) 1696 1702 goto err; 1697 1703
-1
arch/x86/events/intel/uncore.h
··· 125 125 struct pmu pmu; 126 126 char name[UNCORE_PMU_NAME_LEN]; 127 127 int pmu_idx; 128 - int func_id; 129 128 bool registered; 130 129 atomic_t activeboxes; 131 130 cpumask_t cpu_mask;
+1 -1
arch/x86/events/intel/uncore_snb.c
··· 910 910 911 911 pmu = uncore_event_to_pmu(event); 912 912 /* no device found for this pmu */ 913 - if (pmu->func_id < 0) 913 + if (!pmu->registered) 914 914 return -ENOENT; 915 915 916 916 /* Sampling not supported yet */
+32 -16
arch/x86/events/intel/uncore_snbep.c
··· 6684 6684 /* GNR uncore support */ 6685 6685 6686 6686 #define UNCORE_GNR_NUM_UNCORE_TYPES 23 6687 - #define UNCORE_GNR_TYPE_15 15 6688 - #define UNCORE_GNR_B2UPI 18 6689 - #define UNCORE_GNR_TYPE_21 21 6690 - #define UNCORE_GNR_TYPE_22 22 6691 6687 6692 6688 int gnr_uncore_units_ignore[] = { 6693 - UNCORE_SPR_UPI, 6694 - UNCORE_GNR_TYPE_15, 6695 - UNCORE_GNR_B2UPI, 6696 - UNCORE_GNR_TYPE_21, 6697 - UNCORE_GNR_TYPE_22, 6698 6689 UNCORE_IGNORE_END 6699 6690 }; 6700 6691 6701 6692 static struct intel_uncore_type gnr_uncore_ubox = { 6702 6693 .name = "ubox", 6694 + .attr_update = uncore_alias_groups, 6695 + }; 6696 + 6697 + static struct intel_uncore_type gnr_uncore_pciex8 = { 6698 + SPR_UNCORE_PCI_COMMON_FORMAT(), 6699 + .name = "pciex8", 6700 + }; 6701 + 6702 + static struct intel_uncore_type gnr_uncore_pciex16 = { 6703 + SPR_UNCORE_PCI_COMMON_FORMAT(), 6704 + .name = "pciex16", 6705 + }; 6706 + 6707 + static struct intel_uncore_type gnr_uncore_upi = { 6708 + SPR_UNCORE_PCI_COMMON_FORMAT(), 6709 + .name = "upi", 6710 + }; 6711 + 6712 + static struct intel_uncore_type gnr_uncore_b2upi = { 6713 + SPR_UNCORE_PCI_COMMON_FORMAT(), 6714 + .name = "b2upi", 6715 + }; 6716 + 6717 + static struct intel_uncore_type gnr_uncore_b2hot = { 6718 + .name = "b2hot", 6703 6719 .attr_update = uncore_alias_groups, 6704 6720 }; 6705 6721 ··· 6743 6727 &gnr_uncore_ubox, 6744 6728 &spr_uncore_imc, 6745 6729 NULL, 6730 + &gnr_uncore_upi, 6746 6731 NULL, 6747 6732 NULL, 6748 6733 NULL, 6734 + &spr_uncore_cxlcm, 6735 + &spr_uncore_cxldp, 6749 6736 NULL, 6750 - NULL, 6751 - NULL, 6752 - NULL, 6753 - NULL, 6737 + &gnr_uncore_b2hot, 6754 6738 &gnr_uncore_b2cmi, 6755 6739 &gnr_uncore_b2cxl, 6756 - NULL, 6740 + &gnr_uncore_b2upi, 6757 6741 NULL, 6758 6742 &gnr_uncore_mdf_sbo, 6759 - NULL, 6760 - NULL, 6743 + &gnr_uncore_pciex16, 6744 + &gnr_uncore_pciex8, 6761 6745 }; 6762 6746 6763 6747 static struct freerunning_counters gnr_iio_freerunning[] = {
+1
arch/x86/events/perf_event.h
··· 624 624 u64 pebs_output_pt_available:1; 625 625 u64 pebs_timing_info:1; 626 626 u64 anythread_deprecated:1; 627 + u64 rdpmc_metrics_clear:1; 627 628 }; 628 629 u64 capabilities; 629 630 };
+264 -149
arch/x86/events/rapl.c
··· 39 39 * event: rapl_energy_psys 40 40 * perf code: 0x5 41 41 * 42 + * core counter: consumption of a single physical core 43 + * event: rapl_energy_core (power_core PMU) 44 + * perf code: 0x1 45 + * 42 46 * We manage those counters as free running (read-only). They may be 43 47 * use simultaneously by other tools, such as turbostat. 44 48 * ··· 74 70 /* 75 71 * RAPL energy status counters 76 72 */ 77 - enum perf_rapl_events { 73 + enum perf_rapl_pkg_events { 78 74 PERF_RAPL_PP0 = 0, /* all cores */ 79 75 PERF_RAPL_PKG, /* entire package */ 80 76 PERF_RAPL_RAM, /* DRAM */ 81 77 PERF_RAPL_PP1, /* gpu */ 82 78 PERF_RAPL_PSYS, /* psys */ 83 79 84 - PERF_RAPL_MAX, 85 - NR_RAPL_DOMAINS = PERF_RAPL_MAX, 80 + PERF_RAPL_PKG_EVENTS_MAX, 81 + NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, 86 82 }; 87 83 88 - static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 84 + #define PERF_RAPL_CORE 0 /* single core */ 85 + #define PERF_RAPL_CORE_EVENTS_MAX 1 86 + #define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX 87 + 88 + static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { 89 89 "pp0-core", 90 90 "package", 91 91 "dram", 92 92 "pp1-gpu", 93 93 "psys", 94 94 }; 95 + 96 + static const char *const rapl_core_domain_name __initconst = "core"; 95 97 96 98 /* 97 99 * event code: LSB 8 bits, passed in attr->config ··· 122 112 * considered as either pkg-scope or die-scope, and we are considering 123 113 * them as die-scope. 
124 114 */ 125 - #define rapl_pmu_is_pkg_scope() \ 115 + #define rapl_pkg_pmu_is_pkg_scope() \ 126 116 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ 127 117 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) 128 118 ··· 139 129 struct rapl_pmus { 140 130 struct pmu pmu; 141 131 unsigned int nr_rapl_pmu; 142 - struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); 132 + unsigned int cntr_mask; 133 + struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); 143 134 }; 144 135 145 136 enum rapl_unit_quirk { ··· 150 139 }; 151 140 152 141 struct rapl_model { 153 - struct perf_msr *rapl_msrs; 154 - unsigned long events; 142 + struct perf_msr *rapl_pkg_msrs; 143 + struct perf_msr *rapl_core_msrs; 144 + unsigned long pkg_events; 145 + unsigned long core_events; 155 146 unsigned int msr_power_unit; 156 147 enum rapl_unit_quirk unit_quirk; 157 148 }; 158 149 159 150 /* 1/2^hw_unit Joule */ 160 - static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; 161 - static struct rapl_pmus *rapl_pmus; 162 - static unsigned int rapl_cntr_mask; 151 + static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; 152 + static int rapl_core_hw_unit __read_mostly; 153 + static struct rapl_pmus *rapl_pmus_pkg; 154 + static struct rapl_pmus *rapl_pmus_core; 163 155 static u64 rapl_timer_ms; 164 - static struct perf_msr *rapl_msrs; 156 + static struct rapl_model *rapl_model; 165 157 166 158 /* 167 - * Helper functions to get the correct topology macros according to the 159 + * Helper function to get the correct topology id according to the 168 160 * RAPL PMU scope. 169 161 */ 170 - static inline unsigned int get_rapl_pmu_idx(int cpu) 162 + static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) 171 163 { 172 - return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : 173 - topology_logical_die_id(cpu); 174 - } 175 - 176 - static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) 177 - { 178 - return rapl_pmu_is_pkg_scope() ? 
topology_core_cpumask(cpu) : 179 - topology_die_cpumask(cpu); 180 - } 181 - 182 - static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) 183 - { 184 - unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); 185 - 186 164 /* 187 - * The unsigned check also catches the '-1' return value for non 188 - * existent mappings in the topology map. 165 + * Returns unsigned int, which converts the '-1' return value 166 + * (for non-existent mappings in topology map) to UINT_MAX, so 167 + * the error check in the caller is simplified. 189 168 */ 190 - return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; 169 + switch (scope) { 170 + case PERF_PMU_SCOPE_PKG: 171 + return topology_logical_package_id(cpu); 172 + case PERF_PMU_SCOPE_DIE: 173 + return topology_logical_die_id(cpu); 174 + case PERF_PMU_SCOPE_CORE: 175 + return topology_logical_core_id(cpu); 176 + default: 177 + return -EINVAL; 178 + } 191 179 } 192 180 193 181 static inline u64 rapl_read_counter(struct perf_event *event) ··· 196 186 return raw; 197 187 } 198 188 199 - static inline u64 rapl_scale(u64 v, int cfg) 189 + static inline u64 rapl_scale(u64 v, struct perf_event *event) 200 190 { 201 - if (cfg > NR_RAPL_DOMAINS) { 202 - pr_warn("Invalid domain %d, failed to scale data\n", cfg); 203 - return v; 204 - } 191 + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; 192 + 193 + if (event->pmu->scope == PERF_PMU_SCOPE_CORE) 194 + hw_unit = rapl_core_hw_unit; 195 + 205 196 /* 206 197 * scale delta to smallest unit (1/2^32) 207 198 * users must then scale back: count * 1/(1e9*2^32) to get Joules 208 199 * or use ldexp(count, -32). 
209 200 * Watts = Joules/Time delta 210 201 */ 211 - return v << (32 - rapl_hw_unit[cfg - 1]); 202 + return v << (32 - hw_unit); 212 203 } 213 204 214 205 static u64 rapl_event_update(struct perf_event *event) ··· 236 225 delta = (new_raw_count << shift) - (prev_raw_count << shift); 237 226 delta >>= shift; 238 227 239 - sdelta = rapl_scale(delta, event->hw.config); 228 + sdelta = rapl_scale(delta, event); 240 229 241 230 local64_add(sdelta, &event->count); 242 231 ··· 251 240 252 241 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) 253 242 { 254 - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); 243 + struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); 255 244 struct perf_event *event; 256 245 unsigned long flags; 257 246 258 - if (!pmu->n_active) 247 + if (!rapl_pmu->n_active) 259 248 return HRTIMER_NORESTART; 260 249 261 - raw_spin_lock_irqsave(&pmu->lock, flags); 250 + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); 262 251 263 - list_for_each_entry(event, &pmu->active_list, active_entry) 252 + list_for_each_entry(event, &rapl_pmu->active_list, active_entry) 264 253 rapl_event_update(event); 265 254 266 - raw_spin_unlock_irqrestore(&pmu->lock, flags); 255 + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); 267 256 268 - hrtimer_forward_now(hrtimer, pmu->timer_interval); 257 + hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); 269 258 270 259 return HRTIMER_RESTART; 271 260 } 272 261 273 - static void rapl_hrtimer_init(struct rapl_pmu *pmu) 262 + static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) 274 263 { 275 - struct hrtimer *hr = &pmu->hrtimer; 264 + struct hrtimer *hr = &rapl_pmu->hrtimer; 276 265 277 266 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 278 267 hr->function = rapl_hrtimer_handle; 279 268 } 280 269 281 - static void __rapl_pmu_event_start(struct rapl_pmu *pmu, 270 + static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, 282 271 struct perf_event 
*event) 283 272 { 284 273 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) ··· 286 275 287 276 event->hw.state = 0; 288 277 289 - list_add_tail(&event->active_entry, &pmu->active_list); 278 + list_add_tail(&event->active_entry, &rapl_pmu->active_list); 290 279 291 280 local64_set(&event->hw.prev_count, rapl_read_counter(event)); 292 281 293 - pmu->n_active++; 294 - if (pmu->n_active == 1) 295 - rapl_start_hrtimer(pmu); 282 + rapl_pmu->n_active++; 283 + if (rapl_pmu->n_active == 1) 284 + rapl_start_hrtimer(rapl_pmu); 296 285 } 297 286 298 287 static void rapl_pmu_event_start(struct perf_event *event, int mode) 299 288 { 300 - struct rapl_pmu *pmu = event->pmu_private; 289 + struct rapl_pmu *rapl_pmu = event->pmu_private; 301 290 unsigned long flags; 302 291 303 - raw_spin_lock_irqsave(&pmu->lock, flags); 304 - __rapl_pmu_event_start(pmu, event); 305 - raw_spin_unlock_irqrestore(&pmu->lock, flags); 292 + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); 293 + __rapl_pmu_event_start(rapl_pmu, event); 294 + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); 306 295 } 307 296 308 297 static void rapl_pmu_event_stop(struct perf_event *event, int mode) 309 298 { 310 - struct rapl_pmu *pmu = event->pmu_private; 299 + struct rapl_pmu *rapl_pmu = event->pmu_private; 311 300 struct hw_perf_event *hwc = &event->hw; 312 301 unsigned long flags; 313 302 314 - raw_spin_lock_irqsave(&pmu->lock, flags); 303 + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); 315 304 316 305 /* mark event as deactivated and stopped */ 317 306 if (!(hwc->state & PERF_HES_STOPPED)) { 318 - WARN_ON_ONCE(pmu->n_active <= 0); 319 - pmu->n_active--; 320 - if (pmu->n_active == 0) 321 - hrtimer_cancel(&pmu->hrtimer); 307 + WARN_ON_ONCE(rapl_pmu->n_active <= 0); 308 + rapl_pmu->n_active--; 309 + if (rapl_pmu->n_active == 0) 310 + hrtimer_cancel(&rapl_pmu->hrtimer); 322 311 323 312 list_del(&event->active_entry); 324 313 ··· 336 325 hwc->state |= PERF_HES_UPTODATE; 337 326 } 338 327 339 - 
raw_spin_unlock_irqrestore(&pmu->lock, flags); 328 + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); 340 329 } 341 330 342 331 static int rapl_pmu_event_add(struct perf_event *event, int mode) 343 332 { 344 - struct rapl_pmu *pmu = event->pmu_private; 333 + struct rapl_pmu *rapl_pmu = event->pmu_private; 345 334 struct hw_perf_event *hwc = &event->hw; 346 335 unsigned long flags; 347 336 348 - raw_spin_lock_irqsave(&pmu->lock, flags); 337 + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); 349 338 350 339 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 351 340 352 341 if (mode & PERF_EF_START) 353 - __rapl_pmu_event_start(pmu, event); 342 + __rapl_pmu_event_start(rapl_pmu, event); 354 343 355 - raw_spin_unlock_irqrestore(&pmu->lock, flags); 344 + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); 356 345 357 346 return 0; 358 347 } ··· 365 354 static int rapl_pmu_event_init(struct perf_event *event) 366 355 { 367 356 u64 cfg = event->attr.config & RAPL_EVENT_MASK; 368 - int bit, ret = 0; 369 - struct rapl_pmu *pmu; 357 + int bit, rapl_pmus_scope, ret = 0; 358 + struct rapl_pmu *rapl_pmu; 359 + unsigned int rapl_pmu_idx; 360 + struct rapl_pmus *rapl_pmus; 370 361 371 - /* only look at RAPL events */ 372 - if (event->attr.type != rapl_pmus->pmu.type) 373 - return -ENOENT; 362 + /* unsupported modes and filters */ 363 + if (event->attr.sample_period) /* no sampling */ 364 + return -EINVAL; 374 365 375 366 /* check only supported bits are set */ 376 367 if (event->attr.config & ~RAPL_EVENT_MASK) ··· 381 368 if (event->cpu < 0) 382 369 return -EINVAL; 383 370 384 - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) 371 + rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); 372 + if (!rapl_pmus) 385 373 return -EINVAL; 374 + rapl_pmus_scope = rapl_pmus->pmu.scope; 386 375 387 - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); 388 - bit = cfg - 1; 376 + if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { 377 + /* only look at 
RAPL package events */ 378 + if (event->attr.type != rapl_pmus_pkg->pmu.type) 379 + return -ENOENT; 380 + 381 + cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); 382 + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) 383 + return -EINVAL; 384 + 385 + bit = cfg - 1; 386 + event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; 387 + } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { 388 + /* only look at RAPL core events */ 389 + if (event->attr.type != rapl_pmus_core->pmu.type) 390 + return -ENOENT; 391 + 392 + cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); 393 + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) 394 + return -EINVAL; 395 + 396 + bit = cfg - 1; 397 + event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; 398 + } else 399 + return -EINVAL; 389 400 390 401 /* check event supported */ 391 - if (!(rapl_cntr_mask & (1 << bit))) 402 + if (!(rapl_pmus->cntr_mask & (1 << bit))) 392 403 return -EINVAL; 393 404 394 - /* unsupported modes and filters */ 395 - if (event->attr.sample_period) /* no sampling */ 405 + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); 406 + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) 396 407 return -EINVAL; 397 - 398 408 /* must be done before validate_group */ 399 - pmu = cpu_to_rapl_pmu(event->cpu); 400 - if (!pmu) 409 + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; 410 + if (!rapl_pmu) 401 411 return -EINVAL; 402 - event->pmu_private = pmu; 403 - event->hw.event_base = rapl_msrs[bit].msr; 412 + 413 + event->pmu_private = rapl_pmu; 404 414 event->hw.config = cfg; 405 415 event->hw.idx = bit; 406 416 ··· 440 404 RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); 441 405 RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); 442 406 RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); 407 + RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); 443 408 444 409 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); 445 410 RAPL_EVENT_ATTR_STR(energy-pkg.unit , 
rapl_pkg_unit, "Joules"); 446 411 RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); 447 412 RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); 448 413 RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); 414 + RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); 449 415 450 416 /* 451 417 * we compute in 0.23 nJ increments regardless of MSR ··· 457 419 RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); 458 420 RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); 459 421 RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); 422 + RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); 460 423 461 424 /* 462 425 * There are no default events, but we need to create ··· 485 446 }; 486 447 487 448 static const struct attribute_group *rapl_attr_groups[] = { 449 + &rapl_pmu_format_group, 450 + &rapl_pmu_events_group, 451 + NULL, 452 + }; 453 + 454 + static const struct attribute_group *rapl_core_attr_groups[] = { 488 455 &rapl_pmu_format_group, 489 456 &rapl_pmu_events_group, 490 457 NULL, ··· 556 511 .attrs = rapl_events_psys, 557 512 }; 558 513 514 + static struct attribute *rapl_events_core[] = { 515 + EVENT_PTR(rapl_core), 516 + EVENT_PTR(rapl_core_unit), 517 + EVENT_PTR(rapl_core_scale), 518 + NULL, 519 + }; 520 + 521 + static struct attribute_group rapl_events_core_group = { 522 + .name = "events", 523 + .attrs = rapl_events_core, 524 + }; 525 + 559 526 static bool test_msr(int idx, void *data) 560 527 { 561 528 return test_bit(idx, (unsigned long *) data); ··· 593 536 }; 594 537 595 538 /* 596 - * Force to PERF_RAPL_MAX size due to: 597 - * - perf_msr_probe(PERF_RAPL_MAX) 539 + * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: 540 + * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) 598 541 * - want to use same event codes across both architectures 599 542 */ 600 - static struct 
perf_msr amd_rapl_msrs[] = { 543 + static struct perf_msr amd_rapl_pkg_msrs[] = { 601 544 [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, 602 545 [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, 603 546 [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, ··· 605 548 [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, 606 549 }; 607 550 608 - static int rapl_check_hw_unit(struct rapl_model *rm) 551 + static struct perf_msr amd_rapl_core_msrs[] = { 552 + [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, 553 + test_msr, false, RAPL_MSR_MASK }, 554 + }; 555 + 556 + static int rapl_check_hw_unit(void) 609 557 { 610 558 u64 msr_rapl_power_unit_bits; 611 559 int i; 612 560 613 561 /* protect rdmsrl() to handle virtualization */ 614 - if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) 562 + if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) 615 563 return -1; 616 - for (i = 0; i < NR_RAPL_DOMAINS; i++) 617 - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; 564 + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) 565 + rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; 618 566 619 - switch (rm->unit_quirk) { 567 + rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; 568 + 569 + switch (rapl_model->unit_quirk) { 620 570 /* 621 571 * DRAM domain on HSW server and KNL has fixed energy unit which can be 622 572 * different than the unit from power unit MSR. See ··· 631 567 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " 632 568 */ 633 569 case RAPL_UNIT_QUIRK_INTEL_HSW: 634 - rapl_hw_unit[PERF_RAPL_RAM] = 16; 570 + rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; 635 571 break; 636 572 /* SPR uses a fixed energy unit for Psys domain. 
*/ 637 573 case RAPL_UNIT_QUIRK_INTEL_SPR: 638 - rapl_hw_unit[PERF_RAPL_PSYS] = 0; 574 + rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; 639 575 break; 640 576 default: 641 577 break; 642 578 } 643 - 644 579 645 580 /* 646 581 * Calculate the timer rate: ··· 649 586 * if hw unit is 32, then we use 2 ms 1/200/2 650 587 */ 651 588 rapl_timer_ms = 2; 652 - if (rapl_hw_unit[0] < 32) { 589 + if (rapl_pkg_hw_unit[0] < 32) { 653 590 rapl_timer_ms = (1000 / (2 * 100)); 654 - rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); 591 + rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); 655 592 } 656 593 return 0; 657 594 } ··· 659 596 static void __init rapl_advertise(void) 660 597 { 661 598 int i; 599 + int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); 600 + 601 + if (rapl_pmus_core) 602 + num_counters += hweight32(rapl_pmus_core->cntr_mask); 662 603 663 604 pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", 664 - hweight32(rapl_cntr_mask), rapl_timer_ms); 605 + num_counters, rapl_timer_ms); 665 606 666 - for (i = 0; i < NR_RAPL_DOMAINS; i++) { 667 - if (rapl_cntr_mask & (1 << i)) { 607 + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { 608 + if (rapl_pmus_pkg->cntr_mask & (1 << i)) { 668 609 pr_info("hw unit of domain %s 2^-%d Joules\n", 669 - rapl_domain_names[i], rapl_hw_unit[i]); 610 + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); 670 611 } 671 612 } 613 + 614 + if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) 615 + pr_info("hw unit of domain %s 2^-%d Joules\n", 616 + rapl_core_domain_name, rapl_core_hw_unit); 672 617 } 673 618 674 - static void cleanup_rapl_pmus(void) 619 + static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) 675 620 { 676 621 int i; 677 622 678 623 for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) 679 - kfree(rapl_pmus->pmus[i]); 624 + kfree(rapl_pmus->rapl_pmu[i]); 680 625 kfree(rapl_pmus); 681 626 } 682 627 ··· 697 626 NULL, 698 627 }; 699 628 700 - static int __init init_rapl_pmu(void) 629 + 
static const struct attribute_group *rapl_core_attr_update[] = { 630 + &rapl_events_core_group, 631 + NULL, 632 + }; 633 + 634 + static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) 701 635 { 702 - struct rapl_pmu *pmu; 636 + struct rapl_pmu *rapl_pmu; 703 637 int idx; 704 638 705 639 for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { 706 - pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); 707 - if (!pmu) 640 + rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); 641 + if (!rapl_pmu) 708 642 goto free; 709 643 710 - raw_spin_lock_init(&pmu->lock); 711 - INIT_LIST_HEAD(&pmu->active_list); 712 - pmu->pmu = &rapl_pmus->pmu; 713 - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); 714 - rapl_hrtimer_init(pmu); 644 + raw_spin_lock_init(&rapl_pmu->lock); 645 + INIT_LIST_HEAD(&rapl_pmu->active_list); 646 + rapl_pmu->pmu = &rapl_pmus->pmu; 647 + rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); 648 + rapl_hrtimer_init(rapl_pmu); 715 649 716 - rapl_pmus->pmus[idx] = pmu; 650 + rapl_pmus->rapl_pmu[idx] = rapl_pmu; 717 651 } 718 652 719 653 return 0; 720 654 free: 721 655 for (; idx > 0; idx--) 722 - kfree(rapl_pmus->pmus[idx - 1]); 656 + kfree(rapl_pmus->rapl_pmu[idx - 1]); 723 657 return -ENOMEM; 724 658 } 725 659 726 - static int __init init_rapl_pmus(void) 660 + static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, 661 + const struct attribute_group **rapl_attr_groups, 662 + const struct attribute_group **rapl_attr_update) 727 663 { 728 664 int nr_rapl_pmu = topology_max_packages(); 729 - int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; 665 + struct rapl_pmus *rapl_pmus; 730 666 731 - if (!rapl_pmu_is_pkg_scope()) { 732 - nr_rapl_pmu *= topology_max_dies_per_package(); 733 - rapl_pmu_scope = PERF_PMU_SCOPE_DIE; 734 - } 667 + /* 668 + * rapl_pmu_scope must be either PKG, DIE or CORE 669 + */ 670 + if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) 671 + nr_rapl_pmu *= topology_max_dies_per_package(); 672 + else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) 
673 + nr_rapl_pmu *= topology_num_cores_per_package(); 674 + else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) 675 + return -EINVAL; 735 676 736 - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); 677 + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); 737 678 if (!rapl_pmus) 738 679 return -ENOMEM; 680 + 681 + *rapl_pmus_ptr = rapl_pmus; 739 682 740 683 rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; 741 684 rapl_pmus->pmu.attr_groups = rapl_attr_groups; ··· 765 680 rapl_pmus->pmu.module = THIS_MODULE; 766 681 rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; 767 682 768 - return init_rapl_pmu(); 683 + return init_rapl_pmu(rapl_pmus); 769 684 } 770 685 771 686 static struct rapl_model model_snb = { 772 - .events = BIT(PERF_RAPL_PP0) | 687 + .pkg_events = BIT(PERF_RAPL_PP0) | 773 688 BIT(PERF_RAPL_PKG) | 774 689 BIT(PERF_RAPL_PP1), 775 690 .msr_power_unit = MSR_RAPL_POWER_UNIT, 776 - .rapl_msrs = intel_rapl_msrs, 691 + .rapl_pkg_msrs = intel_rapl_msrs, 777 692 }; 778 693 779 694 static struct rapl_model model_snbep = { 780 - .events = BIT(PERF_RAPL_PP0) | 695 + .pkg_events = BIT(PERF_RAPL_PP0) | 781 696 BIT(PERF_RAPL_PKG) | 782 697 BIT(PERF_RAPL_RAM), 783 698 .msr_power_unit = MSR_RAPL_POWER_UNIT, 784 - .rapl_msrs = intel_rapl_msrs, 699 + .rapl_pkg_msrs = intel_rapl_msrs, 785 700 }; 786 701 787 702 static struct rapl_model model_hsw = { 788 - .events = BIT(PERF_RAPL_PP0) | 703 + .pkg_events = BIT(PERF_RAPL_PP0) | 789 704 BIT(PERF_RAPL_PKG) | 790 705 BIT(PERF_RAPL_RAM) | 791 706 BIT(PERF_RAPL_PP1), 792 707 .msr_power_unit = MSR_RAPL_POWER_UNIT, 793 - .rapl_msrs = intel_rapl_msrs, 708 + .rapl_pkg_msrs = intel_rapl_msrs, 794 709 }; 795 710 796 711 static struct rapl_model model_hsx = { 797 - .events = BIT(PERF_RAPL_PP0) | 712 + .pkg_events = BIT(PERF_RAPL_PP0) | 798 713 BIT(PERF_RAPL_PKG) | 799 714 BIT(PERF_RAPL_RAM), 800 715 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, 801 716 .msr_power_unit = MSR_RAPL_POWER_UNIT, 802 - 
.rapl_msrs = intel_rapl_msrs, 717 + .rapl_pkg_msrs = intel_rapl_msrs, 803 718 }; 804 719 805 720 static struct rapl_model model_knl = { 806 - .events = BIT(PERF_RAPL_PKG) | 721 + .pkg_events = BIT(PERF_RAPL_PKG) | 807 722 BIT(PERF_RAPL_RAM), 808 723 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, 809 724 .msr_power_unit = MSR_RAPL_POWER_UNIT, 810 - .rapl_msrs = intel_rapl_msrs, 725 + .rapl_pkg_msrs = intel_rapl_msrs, 811 726 }; 812 727 813 728 static struct rapl_model model_skl = { 814 - .events = BIT(PERF_RAPL_PP0) | 729 + .pkg_events = BIT(PERF_RAPL_PP0) | 815 730 BIT(PERF_RAPL_PKG) | 816 731 BIT(PERF_RAPL_RAM) | 817 732 BIT(PERF_RAPL_PP1) | 818 733 BIT(PERF_RAPL_PSYS), 819 734 .msr_power_unit = MSR_RAPL_POWER_UNIT, 820 - .rapl_msrs = intel_rapl_msrs, 735 + .rapl_pkg_msrs = intel_rapl_msrs, 821 736 }; 822 737 823 738 static struct rapl_model model_spr = { 824 - .events = BIT(PERF_RAPL_PP0) | 739 + .pkg_events = BIT(PERF_RAPL_PP0) | 825 740 BIT(PERF_RAPL_PKG) | 826 741 BIT(PERF_RAPL_RAM) | 827 742 BIT(PERF_RAPL_PSYS), 828 743 .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, 829 744 .msr_power_unit = MSR_RAPL_POWER_UNIT, 830 - .rapl_msrs = intel_rapl_spr_msrs, 745 + .rapl_pkg_msrs = intel_rapl_spr_msrs, 831 746 }; 832 747 833 748 static struct rapl_model model_amd_hygon = { 834 - .events = BIT(PERF_RAPL_PKG), 749 + .pkg_events = BIT(PERF_RAPL_PKG), 750 + .core_events = BIT(PERF_RAPL_CORE), 835 751 .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, 836 - .rapl_msrs = amd_rapl_msrs, 752 + .rapl_pkg_msrs = amd_rapl_pkg_msrs, 753 + .rapl_core_msrs = amd_rapl_core_msrs, 837 754 }; 838 755 839 756 static const struct x86_cpu_id rapl_model_match[] __initconst = { ··· 891 804 static int __init rapl_pmu_init(void) 892 805 { 893 806 const struct x86_cpu_id *id; 894 - struct rapl_model *rm; 807 + int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; 895 808 int ret; 809 + 810 + if (rapl_pkg_pmu_is_pkg_scope()) 811 + rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; 896 812 897 813 id = 
x86_match_cpu(rapl_model_match); 898 814 if (!id) 899 815 return -ENODEV; 900 816 901 - rm = (struct rapl_model *) id->driver_data; 817 + rapl_model = (struct rapl_model *) id->driver_data; 902 818 903 - rapl_msrs = rm->rapl_msrs; 904 - 905 - rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, 906 - false, (void *) &rm->events); 907 - 908 - ret = rapl_check_hw_unit(rm); 819 + ret = rapl_check_hw_unit(); 909 820 if (ret) 910 821 return ret; 911 822 912 - ret = init_rapl_pmus(); 823 + ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, 824 + rapl_attr_update); 913 825 if (ret) 914 826 return ret; 915 827 916 - ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); 828 + rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, 829 + PERF_RAPL_PKG_EVENTS_MAX, false, 830 + (void *) &rapl_model->pkg_events); 831 + 832 + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); 917 833 if (ret) 918 834 goto out; 919 835 836 + if (rapl_model->core_events) { 837 + ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, 838 + rapl_core_attr_groups, 839 + rapl_core_attr_update); 840 + if (ret) { 841 + pr_warn("power-core PMU initialization failed (%d)\n", ret); 842 + goto core_init_failed; 843 + } 844 + 845 + rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, 846 + PERF_RAPL_CORE_EVENTS_MAX, false, 847 + (void *) &rapl_model->core_events); 848 + 849 + ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); 850 + if (ret) { 851 + pr_warn("power-core PMU registration failed (%d)\n", ret); 852 + cleanup_rapl_pmus(rapl_pmus_core); 853 + } 854 + } 855 + 856 + core_init_failed: 920 857 rapl_advertise(); 921 858 return 0; 922 859 923 860 out: 924 861 pr_warn("Initialization failed (%d), disabled\n", ret); 925 - cleanup_rapl_pmus(); 862 + cleanup_rapl_pmus(rapl_pmus_pkg); 926 863 return ret; 927 864 } 928 865 module_init(rapl_pmu_init); 929 866 930 867 static void __exit intel_rapl_exit(void) 931 868 { 932 - 
perf_pmu_unregister(&rapl_pmus->pmu); 933 - cleanup_rapl_pmus(); 869 + if (rapl_pmus_core) { 870 + perf_pmu_unregister(&rapl_pmus_core->pmu); 871 + cleanup_rapl_pmus(rapl_pmus_core); 872 + } 873 + perf_pmu_unregister(&rapl_pmus_pkg->pmu); 874 + cleanup_rapl_pmus(rapl_pmus_pkg); 934 875 } 935 876 module_exit(intel_rapl_exit);
+18 -2
arch/x86/include/asm/perf_event.h
··· 41 41 #define INTEL_FIXED_0_USER (1ULL << 1) 42 42 #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) 43 43 #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) 44 + #define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) 44 45 45 46 #define HSW_IN_TX (1ULL << 32) 46 47 #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) ··· 373 372 #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND 374 373 #define INTEL_TD_METRIC_NUM 8 375 374 375 + #define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 376 + #define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) 377 + 376 378 static inline bool is_metric_idx(int idx) 377 379 { 378 380 return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; ··· 426 422 */ 427 423 428 424 struct pebs_basic { 429 - u64 format_size; 425 + u64 format_group:32, 426 + retire_latency:16, 427 + format_size:16; 430 428 u64 ip; 431 429 u64 applicable_counters; 432 430 u64 tsc; ··· 437 431 struct pebs_meminfo { 438 432 u64 address; 439 433 u64 aux; 440 - u64 latency; 434 + union { 435 + /* pre Alder Lake */ 436 + u64 mem_latency; 437 + /* Alder Lake and later */ 438 + struct { 439 + u64 instr_latency:16; 440 + u64 pad2:16; 441 + u64 cache_latency:16; 442 + u64 pad3:16; 443 + }; 444 + }; 441 445 u64 tsx_tuning; 442 446 }; 443 447
+1
arch/x86/include/asm/processor.h
··· 98 98 // Logical ID mappings 99 99 u32 logical_pkg_id; 100 100 u32 logical_die_id; 101 + u32 logical_core_id; 101 102 102 103 // AMD Node ID and Nodes per Package info 103 104 u32 amd_node_id;
+1
arch/x86/include/asm/topology.h
··· 143 143 #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) 144 144 #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) 145 145 #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) 146 + #define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) 146 147 #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) 147 148 #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) 148 149 #define topology_ppin(cpu) (cpu_data(cpu).ppin)
+1
arch/x86/kernel/cpu/debugfs.c
··· 25 25 seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); 26 26 seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); 27 27 seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); 28 + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); 28 29 seq_printf(m, "llc_id: %u\n", c->topo.llc_id); 29 30 seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); 30 31 seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+1
arch/x86/kernel/cpu/topology_common.c
··· 185 185 if (!early) { 186 186 c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); 187 187 c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); 188 + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); 188 189 } 189 190 190 191 /* Package relative core ID */
+6 -6
include/linux/mm.h
··· 711 711 * we don't rely on for anything - the mm_lock_seq read against which we 712 712 * need ordering is below. 713 713 */ 714 - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) 714 + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) 715 715 return false; 716 716 717 717 if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) ··· 728 728 * after it has been unlocked. 729 729 * This pairs with RELEASE semantics in vma_end_write_all(). 730 730 */ 731 - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { 731 + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { 732 732 up_read(&vma->vm_lock->lock); 733 733 return false; 734 734 } ··· 743 743 } 744 744 745 745 /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ 746 - static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) 746 + static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) 747 747 { 748 748 mmap_assert_write_locked(vma->vm_mm); 749 749 ··· 751 751 * current task is holding mmap_write_lock, both vma->vm_lock_seq and 752 752 * mm->mm_lock_seq can't be concurrently modified. 753 753 */ 754 - *mm_lock_seq = vma->vm_mm->mm_lock_seq; 754 + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; 755 755 return (vma->vm_lock_seq == *mm_lock_seq); 756 756 } 757 757 ··· 762 762 */ 763 763 static inline void vma_start_write(struct vm_area_struct *vma) 764 764 { 765 - int mm_lock_seq; 765 + unsigned int mm_lock_seq; 766 766 767 767 if (__is_vma_write_locked(vma, &mm_lock_seq)) 768 768 return; ··· 780 780 781 781 static inline void vma_assert_write_locked(struct vm_area_struct *vma) 782 782 { 783 - int mm_lock_seq; 783 + unsigned int mm_lock_seq; 784 784 785 785 VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); 786 786 }
+5 -2
include/linux/mm_types.h
··· 727 727 * counter reuse can only lead to occasional unnecessary use of the 728 728 * slowpath. 729 729 */ 730 - int vm_lock_seq; 730 + unsigned int vm_lock_seq; 731 731 /* Unstable RCU readers are allowed to read this. */ 732 732 struct vma_lock *vm_lock; 733 733 #endif ··· 921 921 * Roughly speaking, incrementing the sequence number is 922 922 * equivalent to releasing locks on VMAs; reading the sequence 923 923 * number can be part of taking a read lock on a VMA. 924 + * Incremented every time mmap_lock is write-locked/unlocked. 925 + * Initialized to 0, therefore odd values indicate mmap_lock 926 + * is write-locked and even values that it's released. 924 927 * 925 928 * Can be modified under write mmap_lock using RELEASE 926 929 * semantics. ··· 932 929 * Can be read with ACQUIRE semantics if not holding write 933 930 * mmap_lock. 934 931 */ 935 - int mm_lock_seq; 932 + seqcount_t mm_lock_seq; 936 933 #endif 937 934 938 935
+86 -40
include/linux/mmap_lock.h
··· 71 71 } 72 72 73 73 #ifdef CONFIG_PER_VMA_LOCK 74 + 75 + static inline void mm_lock_seqcount_init(struct mm_struct *mm) 76 + { 77 + seqcount_init(&mm->mm_lock_seq); 78 + } 79 + 80 + static inline void mm_lock_seqcount_begin(struct mm_struct *mm) 81 + { 82 + do_raw_write_seqcount_begin(&mm->mm_lock_seq); 83 + } 84 + 85 + static inline void mm_lock_seqcount_end(struct mm_struct *mm) 86 + { 87 + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); 88 + do_raw_write_seqcount_end(&mm->mm_lock_seq); 89 + } 90 + 91 + static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) 92 + { 93 + /* 94 + * Since mmap_lock is a sleeping lock, and waiting for it to become 95 + * unlocked is more or less equivalent with taking it ourselves, don't 96 + * bother with the speculative path if mmap_lock is already write-locked 97 + * and take the slow path, which takes the lock. 98 + */ 99 + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); 100 + } 101 + 102 + static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) 103 + { 104 + return read_seqcount_retry(&mm->mm_lock_seq, seq); 105 + } 106 + 107 + #else /* CONFIG_PER_VMA_LOCK */ 108 + 109 + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} 110 + static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} 111 + static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} 112 + 113 + static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) 114 + { 115 + return false; 116 + } 117 + 118 + static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) 119 + { 120 + return true; 121 + } 122 + 123 + #endif /* CONFIG_PER_VMA_LOCK */ 124 + 125 + static inline void mmap_init_lock(struct mm_struct *mm) 126 + { 127 + init_rwsem(&mm->mmap_lock); 128 + mm_lock_seqcount_init(mm); 129 + } 130 + 131 + static inline void mmap_write_lock(struct mm_struct *mm) 132 + { 133 + __mmap_lock_trace_start_locking(mm, true); 134 
+ down_write(&mm->mmap_lock); 135 + mm_lock_seqcount_begin(mm); 136 + __mmap_lock_trace_acquire_returned(mm, true, true); 137 + } 138 + 139 + static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) 140 + { 141 + __mmap_lock_trace_start_locking(mm, true); 142 + down_write_nested(&mm->mmap_lock, subclass); 143 + mm_lock_seqcount_begin(mm); 144 + __mmap_lock_trace_acquire_returned(mm, true, true); 145 + } 146 + 147 + static inline int mmap_write_lock_killable(struct mm_struct *mm) 148 + { 149 + int ret; 150 + 151 + __mmap_lock_trace_start_locking(mm, true); 152 + ret = down_write_killable(&mm->mmap_lock); 153 + if (!ret) 154 + mm_lock_seqcount_begin(mm); 155 + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); 156 + return ret; 157 + } 158 + 74 159 /* 75 160 * Drop all currently-held per-VMA locks. 76 161 * This is called from the mmap_lock implementation directly before releasing ··· 167 82 static inline void vma_end_write_all(struct mm_struct *mm) 168 83 { 169 84 mmap_assert_write_locked(mm); 170 - /* 171 - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive 172 - * mmap_lock being held. 173 - * We need RELEASE semantics here to ensure that preceding stores into 174 - * the VMA take effect before we unlock it with this store. 175 - * Pairs with ACQUIRE semantics in vma_start_read(). 
176 - */ 177 - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); 178 - } 179 - #else 180 - static inline void vma_end_write_all(struct mm_struct *mm) {} 181 - #endif 182 - 183 - static inline void mmap_init_lock(struct mm_struct *mm) 184 - { 185 - init_rwsem(&mm->mmap_lock); 186 - } 187 - 188 - static inline void mmap_write_lock(struct mm_struct *mm) 189 - { 190 - __mmap_lock_trace_start_locking(mm, true); 191 - down_write(&mm->mmap_lock); 192 - __mmap_lock_trace_acquire_returned(mm, true, true); 193 - } 194 - 195 - static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) 196 - { 197 - __mmap_lock_trace_start_locking(mm, true); 198 - down_write_nested(&mm->mmap_lock, subclass); 199 - __mmap_lock_trace_acquire_returned(mm, true, true); 200 - } 201 - 202 - static inline int mmap_write_lock_killable(struct mm_struct *mm) 203 - { 204 - int ret; 205 - 206 - __mmap_lock_trace_start_locking(mm, true); 207 - ret = down_write_killable(&mm->mmap_lock); 208 - __mmap_lock_trace_acquire_returned(mm, true, ret == 0); 209 - return ret; 85 + mm_lock_seqcount_end(mm); 210 86 } 211 87 212 88 static inline void mmap_write_unlock(struct mm_struct *mm)
+27 -5
include/linux/perf_event.h
··· 1279 1279 { 1280 1280 int size = 1; 1281 1281 1282 + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 1283 + return; 1284 + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) 1285 + return; 1286 + 1282 1287 data->callchain = perf_callchain(event, regs); 1283 1288 size += data->callchain->nr; 1284 1289 ··· 1292 1287 } 1293 1288 1294 1289 static inline void perf_sample_save_raw_data(struct perf_sample_data *data, 1290 + struct perf_event *event, 1295 1291 struct perf_raw_record *raw) 1296 1292 { 1297 1293 struct perf_raw_frag *frag = &raw->frag; 1298 1294 u32 sum = 0; 1299 1295 int size; 1296 + 1297 + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) 1298 + return; 1299 + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) 1300 + return; 1300 1301 1301 1302 do { 1302 1303 sum += frag->size; ··· 1320 1309 data->sample_flags |= PERF_SAMPLE_RAW; 1321 1310 } 1322 1311 1312 + static inline bool has_branch_stack(struct perf_event *event) 1313 + { 1314 + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; 1315 + } 1316 + 1323 1317 static inline void perf_sample_save_brstack(struct perf_sample_data *data, 1324 1318 struct perf_event *event, 1325 1319 struct perf_branch_stack *brs, 1326 1320 u64 *brs_cntr) 1327 1321 { 1328 1322 int size = sizeof(u64); /* nr */ 1323 + 1324 + if (!has_branch_stack(event)) 1325 + return; 1326 + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) 1327 + return; 1329 1328 1330 1329 if (branch_sample_hw_index(event)) 1331 1330 size += sizeof(u64); ··· 1690 1669 return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); 1691 1670 } 1692 1671 1672 + extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); 1673 + 1693 1674 extern void perf_event_init(void); 1694 1675 extern void perf_tp_event(u16 event_type, u64 count, void *record, 1695 1676 int entry_size, struct pt_regs *regs, ··· 1727 1704 } 1728 1705 # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) 
1729 1706 #endif 1730 - 1731 - static inline bool has_branch_stack(struct perf_event *event) 1732 - { 1733 - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; 1734 - } 1735 1707 1736 1708 static inline bool needs_branch_stack(struct perf_event *event) 1737 1709 { ··· 1894 1876 return -EINVAL; 1895 1877 } 1896 1878 static inline u64 perf_event_pause(struct perf_event *event, bool reset) 1879 + { 1880 + return 0; 1881 + } 1882 + static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) 1897 1883 { 1898 1884 return 0; 1899 1885 }
+22
include/linux/seqlock.h
··· 319 319 }) 320 320 321 321 /** 322 + * raw_seqcount_try_begin() - begin a seqcount_t read critical section 323 + * w/o lockdep and w/o counter stabilization 324 + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants 325 + * 326 + * Similar to raw_seqcount_begin(), except it enables eliding the critical 327 + * section entirely if odd, instead of doing the speculation knowing it will 328 + * fail. 329 + * 330 + * Useful when counter stabilization is more or less equivalent to taking 331 + * the lock and there is a slowpath that does that. 332 + * 333 + * If true, start will be set to the (even) sequence count read. 334 + * 335 + * Return: true when a read critical section is started. 336 + */ 337 + #define raw_seqcount_try_begin(s, start) \ 338 + ({ \ 339 + start = raw_read_seqcount(s); \ 340 + !(start & 1); \ 341 + }) 342 + 343 + /** 322 344 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o 323 345 * lockdep and w/o counter stabilization 324 346 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
+13 -3
include/linux/uprobes.h
··· 16 16 #include <linux/types.h> 17 17 #include <linux/wait.h> 18 18 #include <linux/timer.h> 19 + #include <linux/seqlock.h> 19 20 20 21 struct uprobe; 21 22 struct vm_area_struct; ··· 125 124 unsigned int depth; 126 125 struct return_instance *return_instances; 127 126 127 + struct return_instance *ri_pool; 128 + struct timer_list ri_timer; 129 + seqcount_t ri_seqcount; 130 + 128 131 union { 129 132 struct { 130 133 struct arch_uprobe_task autask; ··· 142 137 }; 143 138 144 139 struct uprobe *active_uprobe; 145 - struct timer_list ri_timer; 146 140 unsigned long xol_vaddr; 147 141 148 142 struct arch_uprobe *auprobe; ··· 158 154 unsigned long stack; /* stack pointer */ 159 155 unsigned long orig_ret_vaddr; /* original return address */ 160 156 bool chained; /* true, if instance is nested */ 161 - int consumers_cnt; 157 + int cons_cnt; /* total number of session consumers */ 162 158 163 159 struct return_instance *next; /* keep as stack */ 164 160 struct rcu_head rcu; 165 161 166 - struct return_consumer consumers[] __counted_by(consumers_cnt); 162 + /* singular pre-allocated return_consumer instance for common case */ 163 + struct return_consumer consumer; 164 + /* 165 + * extra return_consumer instances for rare cases of multiple session consumers, 166 + * contains (cons_cnt - 1) elements 167 + */ 168 + struct return_consumer *extra_consumers; 167 169 } ____cacheline_aligned; 168 170 169 171 enum rp_check {
+100 -56
kernel/events/core.c
··· 6277 6277 } 6278 6278 EXPORT_SYMBOL_GPL(perf_event_update_userpage); 6279 6279 6280 - static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) 6281 - { 6282 - struct perf_event *event = vmf->vma->vm_file->private_data; 6283 - struct perf_buffer *rb; 6284 - vm_fault_t ret = VM_FAULT_SIGBUS; 6285 - 6286 - if (vmf->flags & FAULT_FLAG_MKWRITE) { 6287 - if (vmf->pgoff == 0) 6288 - ret = 0; 6289 - return ret; 6290 - } 6291 - 6292 - rcu_read_lock(); 6293 - rb = rcu_dereference(event->rb); 6294 - if (!rb) 6295 - goto unlock; 6296 - 6297 - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 6298 - goto unlock; 6299 - 6300 - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); 6301 - if (!vmf->page) 6302 - goto unlock; 6303 - 6304 - get_page(vmf->page); 6305 - vmf->page->mapping = vmf->vma->vm_file->f_mapping; 6306 - vmf->page->index = vmf->pgoff; 6307 - 6308 - ret = 0; 6309 - unlock: 6310 - rcu_read_unlock(); 6311 - 6312 - return ret; 6313 - } 6314 - 6315 6280 static void ring_buffer_attach(struct perf_event *event, 6316 6281 struct perf_buffer *rb) 6317 6282 { ··· 6516 6551 ring_buffer_put(rb); /* could be last */ 6517 6552 } 6518 6553 6554 + static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) 6555 + { 6556 + /* The first page is the user control page, others are read-only. */ 6557 + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; 6558 + } 6559 + 6519 6560 static const struct vm_operations_struct perf_mmap_vmops = { 6520 6561 .open = perf_mmap_open, 6521 6562 .close = perf_mmap_close, /* non mergeable */ 6522 - .fault = perf_mmap_fault, 6523 - .page_mkwrite = perf_mmap_fault, 6563 + .pfn_mkwrite = perf_mmap_pfn_mkwrite, 6524 6564 }; 6565 + 6566 + static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) 6567 + { 6568 + unsigned long nr_pages = vma_pages(vma); 6569 + int err = 0; 6570 + unsigned long pagenum; 6571 + 6572 + /* 6573 + * We map this as a VM_PFNMAP VMA. 
6574 + * 6575 + * This is not ideal as this is designed broadly for mappings of PFNs 6576 + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which 6577 + * !pfn_valid(pfn). 6578 + * 6579 + * We are mapping kernel-allocated memory (memory we manage ourselves) 6580 + * which would more ideally be mapped using vm_insert_page() or a 6581 + * similar mechanism, that is as a VM_MIXEDMAP mapping. 6582 + * 6583 + * However this won't work here, because: 6584 + * 6585 + * 1. It uses vma->vm_page_prot, but this field has not been completely 6586 + * set up at the point of the f_op->mmap() hook, so we are unable to 6587 + * indicate that this should be mapped CoW in order that the 6588 + * mkwrite() hook can be invoked to make the first page R/W and the 6589 + * rest R/O as desired. 6590 + * 6591 + * 2. Anything other than a VM_PFNMAP of valid PFNs will result in 6592 + * vm_normal_page() returning a struct page * pointer, which means 6593 + * vm_ops->page_mkwrite() will be invoked rather than 6594 + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping 6595 + * to work around retry logic in the fault handler, however this 6596 + * field is no longer allowed to be used within struct page. 6597 + * 6598 + * 3. Having a struct page * made available in the fault logic also 6599 + * means that the page gets put on the rmap and becomes 6600 + * inappropriately accessible and subject to map and ref counting. 6601 + * 6602 + * Ideally we would have a mechanism that could explicitly express our 6603 + * desires, but this is not currently the case, so we instead use 6604 + * VM_PFNMAP. 6605 + * 6606 + * We manage the lifetime of these mappings with internal refcounts (see 6607 + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of 6608 + * this mapping is maintained correctly. 
6609 + */ 6610 + for (pagenum = 0; pagenum < nr_pages; pagenum++) { 6611 + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; 6612 + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); 6613 + 6614 + if (page == NULL) { 6615 + err = -EINVAL; 6616 + break; 6617 + } 6618 + 6619 + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ 6620 + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, 6621 + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); 6622 + if (err) 6623 + break; 6624 + } 6625 + 6626 + #ifdef CONFIG_MMU 6627 + /* Clear any partial mappings on error. */ 6628 + if (err) 6629 + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); 6630 + #endif 6631 + 6632 + return err; 6633 + } 6525 6634 6526 6635 static int perf_mmap(struct file *file, struct vm_area_struct *vma) 6527 6636 { ··· 6721 6682 goto again; 6722 6683 } 6723 6684 6685 + /* We need the rb to map pages. */ 6686 + rb = event->rb; 6724 6687 goto unlock; 6725 6688 } 6726 6689 ··· 6816 6775 */ 6817 6776 vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); 6818 6777 vma->vm_ops = &perf_mmap_vmops; 6778 + 6779 + if (!ret) 6780 + ret = map_range(rb, vma); 6819 6781 6820 6782 if (event->pmu->event_mapped) 6821 6783 event->pmu->event_mapped(event, vma->vm_mm); ··· 10083 10039 perf_swevent_overflow(event, 0, data, regs); 10084 10040 } 10085 10041 10086 - static int perf_exclude_event(struct perf_event *event, 10087 - struct pt_regs *regs) 10042 + int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) 10088 10043 { 10089 10044 if (event->hw.state & PERF_HES_STOPPED) 10090 10045 return 1; ··· 10468 10425 }; 10469 10426 10470 10427 static int perf_tp_filter_match(struct perf_event *event, 10471 - struct perf_sample_data *data) 10428 + struct perf_raw_record *raw) 10472 10429 { 10473 - void *record = data->raw->frag.data; 10430 + void *record = raw->frag.data; 10474 10431 10475 10432 /* only top level events have filters set */ 
10476 10433 if (event->parent) ··· 10482 10439 } 10483 10440 10484 10441 static int perf_tp_event_match(struct perf_event *event, 10485 - struct perf_sample_data *data, 10442 + struct perf_raw_record *raw, 10486 10443 struct pt_regs *regs) 10487 10444 { 10488 10445 if (event->hw.state & PERF_HES_STOPPED) ··· 10493 10450 if (event->attr.exclude_kernel && !user_mode(regs)) 10494 10451 return 0; 10495 10452 10496 - if (!perf_tp_filter_match(event, data)) 10453 + if (!perf_tp_filter_match(event, raw)) 10497 10454 return 0; 10498 10455 10499 10456 return 1; ··· 10519 10476 static void __perf_tp_event_target_task(u64 count, void *record, 10520 10477 struct pt_regs *regs, 10521 10478 struct perf_sample_data *data, 10479 + struct perf_raw_record *raw, 10522 10480 struct perf_event *event) 10523 10481 { 10524 10482 struct trace_entry *entry = record; ··· 10529 10485 /* Cannot deliver synchronous signal to other task. */ 10530 10486 if (event->attr.sigtrap) 10531 10487 return; 10532 - if (perf_tp_event_match(event, data, regs)) 10488 + if (perf_tp_event_match(event, raw, regs)) { 10489 + perf_sample_data_init(data, 0, 0); 10490 + perf_sample_save_raw_data(data, event, raw); 10533 10491 perf_swevent_event(event, count, data, regs); 10492 + } 10534 10493 } 10535 10494 10536 10495 static void perf_tp_event_target_task(u64 count, void *record, 10537 10496 struct pt_regs *regs, 10538 10497 struct perf_sample_data *data, 10498 + struct perf_raw_record *raw, 10539 10499 struct perf_event_context *ctx) 10540 10500 { 10541 10501 unsigned int cpu = smp_processor_id(); ··· 10547 10499 struct perf_event *event, *sibling; 10548 10500 10549 10501 perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { 10550 - __perf_tp_event_target_task(count, record, regs, data, event); 10502 + __perf_tp_event_target_task(count, record, regs, data, raw, event); 10551 10503 for_each_sibling_event(sibling, event) 10552 - __perf_tp_event_target_task(count, record, regs, data, sibling); 10504 + 
__perf_tp_event_target_task(count, record, regs, data, raw, sibling); 10553 10505 } 10554 10506 10555 10507 perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { 10556 - __perf_tp_event_target_task(count, record, regs, data, event); 10508 + __perf_tp_event_target_task(count, record, regs, data, raw, event); 10557 10509 for_each_sibling_event(sibling, event) 10558 - __perf_tp_event_target_task(count, record, regs, data, sibling); 10510 + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); 10559 10511 } 10560 10512 } 10561 10513 ··· 10573 10525 }, 10574 10526 }; 10575 10527 10576 - perf_sample_data_init(&data, 0, 0); 10577 - perf_sample_save_raw_data(&data, &raw); 10578 - 10579 10528 perf_trace_buf_update(record, event_type); 10580 10529 10581 10530 hlist_for_each_entry_rcu(event, head, hlist_entry) { 10582 - if (perf_tp_event_match(event, &data, regs)) { 10583 - perf_swevent_event(event, count, &data, regs); 10584 - 10531 + if (perf_tp_event_match(event, &raw, regs)) { 10585 10532 /* 10586 10533 * Here use the same on-stack perf_sample_data, 10587 10534 * some members in data are event-specific and ··· 10586 10543 * because data->sample_flags is set. 10587 10544 */ 10588 10545 perf_sample_data_init(&data, 0, 0); 10589 - perf_sample_save_raw_data(&data, &raw); 10546 + perf_sample_save_raw_data(&data, event, &raw); 10547 + perf_swevent_event(event, count, &data, regs); 10590 10548 } 10591 10549 } 10592 10550 ··· 10604 10560 goto unlock; 10605 10561 10606 10562 raw_spin_lock(&ctx->lock); 10607 - perf_tp_event_target_task(count, record, regs, &data, ctx); 10563 + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); 10608 10564 raw_spin_unlock(&ctx->lock); 10609 10565 unlock: 10610 10566 rcu_read_unlock();
+1 -18
kernel/events/ring_buffer.c
··· 643 643 struct page *page = virt_to_page(rb->aux_pages[idx]); 644 644 645 645 ClearPagePrivate(page); 646 - page->mapping = NULL; 647 646 __free_page(page); 648 647 } 649 648 ··· 818 819 { 819 820 struct page *page = virt_to_page(addr); 820 821 821 - page->mapping = NULL; 822 822 __free_page(page); 823 823 } 824 824 ··· 888 890 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 889 891 } 890 892 891 - static void perf_mmap_unmark_page(void *addr) 892 - { 893 - struct page *page = vmalloc_to_page(addr); 894 - 895 - page->mapping = NULL; 896 - } 897 - 898 893 static void rb_free_work(struct work_struct *work) 899 894 { 900 895 struct perf_buffer *rb; 901 - void *base; 902 - int i, nr; 903 896 904 897 rb = container_of(work, struct perf_buffer, work); 905 - nr = data_page_nr(rb); 906 898 907 - base = rb->user_page; 908 - /* The '<=' counts in the user page. */ 909 - for (i = 0; i <= nr; i++) 910 - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 911 - 912 - vfree(base); 899 + vfree(rb->user_page); 913 900 kfree(rb); 914 901 } 915 902
+168 -57
kernel/events/uprobes.c
··· 1888 1888 return instruction_pointer(regs); 1889 1889 } 1890 1890 1891 - static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) 1891 + static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) 1892 1892 { 1893 - struct return_instance *next = ri->next; 1893 + ri->cons_cnt = 0; 1894 + ri->next = utask->ri_pool; 1895 + utask->ri_pool = ri; 1896 + } 1897 + 1898 + static struct return_instance *ri_pool_pop(struct uprobe_task *utask) 1899 + { 1900 + struct return_instance *ri = utask->ri_pool; 1901 + 1902 + if (likely(ri)) 1903 + utask->ri_pool = ri->next; 1904 + 1905 + return ri; 1906 + } 1907 + 1908 + static void ri_free(struct return_instance *ri) 1909 + { 1910 + kfree(ri->extra_consumers); 1911 + kfree_rcu(ri, rcu); 1912 + } 1913 + 1914 + static void free_ret_instance(struct uprobe_task *utask, 1915 + struct return_instance *ri, bool cleanup_hprobe) 1916 + { 1917 + unsigned seq; 1894 1918 1895 1919 if (cleanup_hprobe) { 1896 1920 enum hprobe_state hstate; ··· 1923 1899 hprobe_finalize(&ri->hprobe, hstate); 1924 1900 } 1925 1901 1926 - kfree_rcu(ri, rcu); 1927 - return next; 1902 + /* 1903 + * At this point return_instance is unlinked from utask's 1904 + * return_instances list and this has become visible to ri_timer(). 1905 + * If seqcount now indicates that ri_timer's return instance 1906 + * processing loop isn't active, we can return ri into the pool of 1907 + * to-be-reused return instances for future uretprobes. If ri_timer() 1908 + * happens to be running right now, though, we fall back to safety and 1909 + * just perform RCU-delayed freeing of ri. 
1910 + */ 1911 + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { 1912 + /* immediate reuse of ri without RCU GP is OK */ 1913 + ri_pool_push(utask, ri); 1914 + } else { 1915 + /* we might be racing with ri_timer(), so play it safe */ 1916 + ri_free(ri); 1917 + } 1928 1918 } 1929 1919 1930 1920 /* ··· 1948 1910 void uprobe_free_utask(struct task_struct *t) 1949 1911 { 1950 1912 struct uprobe_task *utask = t->utask; 1951 - struct return_instance *ri; 1913 + struct return_instance *ri, *ri_next; 1952 1914 1953 1915 if (!utask) 1954 1916 return; ··· 1959 1921 timer_delete_sync(&utask->ri_timer); 1960 1922 1961 1923 ri = utask->return_instances; 1962 - while (ri) 1963 - ri = free_ret_instance(ri, true /* cleanup_hprobe */); 1924 + while (ri) { 1925 + ri_next = ri->next; 1926 + free_ret_instance(utask, ri, true /* cleanup_hprobe */); 1927 + ri = ri_next; 1928 + } 1929 + 1930 + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ 1931 + ri = utask->ri_pool; 1932 + while (ri) { 1933 + ri_next = ri->next; 1934 + ri_free(ri); 1935 + ri = ri_next; 1936 + } 1964 1937 1965 1938 kfree(utask); 1966 1939 } ··· 1991 1942 /* RCU protects return_instance from freeing. 
*/ 1992 1943 guard(rcu)(); 1993 1944 1945 + write_seqcount_begin(&utask->ri_seqcount); 1946 + 1994 1947 for_each_ret_instance_rcu(ri, utask->return_instances) 1995 1948 hprobe_expire(&ri->hprobe, false); 1949 + 1950 + write_seqcount_end(&utask->ri_seqcount); 1996 1951 } 1997 1952 1998 1953 static struct uprobe_task *alloc_utask(void) ··· 2008 1955 return NULL; 2009 1956 2010 1957 timer_setup(&utask->ri_timer, ri_timer, 0); 1958 + seqcount_init(&utask->ri_seqcount); 2011 1959 2012 1960 return utask; 2013 1961 } ··· 2028 1974 return current->utask; 2029 1975 } 2030 1976 2031 - static size_t ri_size(int consumers_cnt) 1977 + static struct return_instance *alloc_return_instance(struct uprobe_task *utask) 2032 1978 { 2033 1979 struct return_instance *ri; 2034 1980 2035 - return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; 2036 - } 1981 + ri = ri_pool_pop(utask); 1982 + if (ri) 1983 + return ri; 2037 1984 2038 - #define DEF_CNT 4 2039 - 2040 - static struct return_instance *alloc_return_instance(void) 2041 - { 2042 - struct return_instance *ri; 2043 - 2044 - ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); 1985 + ri = kzalloc(sizeof(*ri), GFP_KERNEL); 2045 1986 if (!ri) 2046 1987 return ZERO_SIZE_PTR; 2047 1988 2048 - ri->consumers_cnt = DEF_CNT; 2049 1989 return ri; 2050 1990 } 2051 1991 2052 1992 static struct return_instance *dup_return_instance(struct return_instance *old) 2053 1993 { 2054 - size_t size = ri_size(old->consumers_cnt); 1994 + struct return_instance *ri; 2055 1995 2056 - return kmemdup(old, size, GFP_KERNEL); 1996 + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); 1997 + if (!ri) 1998 + return NULL; 1999 + 2000 + if (unlikely(old->cons_cnt > 1)) { 2001 + ri->extra_consumers = kmemdup(old->extra_consumers, 2002 + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), 2003 + GFP_KERNEL); 2004 + if (!ri->extra_consumers) { 2005 + kfree(ri); 2006 + return NULL; 2007 + } 2008 + } 2009 + 2010 + return ri; 2057 2011 } 2058 2012 2059 2013 static int 
dup_utask(struct task_struct *t, struct uprobe_task *o_utask) ··· 2170 2108 static void cleanup_return_instances(struct uprobe_task *utask, bool chained, 2171 2109 struct pt_regs *regs) 2172 2110 { 2173 - struct return_instance *ri = utask->return_instances; 2111 + struct return_instance *ri = utask->return_instances, *ri_next; 2174 2112 enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; 2175 2113 2176 2114 while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { 2177 - ri = free_ret_instance(ri, true /* cleanup_hprobe */); 2115 + ri_next = ri->next; 2116 + rcu_assign_pointer(utask->return_instances, ri_next); 2178 2117 utask->depth--; 2118 + 2119 + free_ret_instance(utask, ri, true /* cleanup_hprobe */); 2120 + ri = ri_next; 2179 2121 } 2180 - rcu_assign_pointer(utask->return_instances, ri); 2181 2122 } 2182 2123 2183 2124 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, ··· 2245 2180 2246 2181 return; 2247 2182 free: 2248 - kfree(ri); 2183 + ri_free(ri); 2249 2184 } 2250 2185 2251 2186 /* Prepare to single-step probed instruction out of line. 
*/ ··· 2359 2294 return is_trap_insn(&opcode); 2360 2295 } 2361 2296 2297 + static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) 2298 + { 2299 + struct mm_struct *mm = current->mm; 2300 + struct uprobe *uprobe = NULL; 2301 + struct vm_area_struct *vma; 2302 + struct file *vm_file; 2303 + loff_t offset; 2304 + unsigned int seq; 2305 + 2306 + guard(rcu)(); 2307 + 2308 + if (!mmap_lock_speculate_try_begin(mm, &seq)) 2309 + return NULL; 2310 + 2311 + vma = vma_lookup(mm, bp_vaddr); 2312 + if (!vma) 2313 + return NULL; 2314 + 2315 + /* 2316 + * vm_file memory can be reused for another instance of struct file, 2317 + * but can't be freed from under us, so it's safe to read fields from 2318 + * it, even if the values are some garbage values; ultimately 2319 + * find_uprobe_rcu() + mmap_lock_speculate_retry() check will ensure 2320 + * that whatever we speculatively found is correct 2321 + */ 2322 + vm_file = READ_ONCE(vma->vm_file); 2323 + if (!vm_file) 2324 + return NULL; 2325 + 2326 + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); 2327 + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); 2328 + if (!uprobe) 2329 + return NULL; 2330 + 2331 + /* now double check that nothing about MM changed */ 2332 + if (mmap_lock_speculate_retry(mm, seq)) 2333 + return NULL; 2334 + 2335 + return uprobe; 2336 + } 2337 + 2362 2338 /* assumes being inside RCU protected region */ 2363 2339 static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) 2364 2340 { ··· 2407 2301 struct uprobe *uprobe = NULL; 2408 2302 struct vm_area_struct *vma; 2409 2303 2304 + uprobe = find_active_uprobe_speculative(bp_vaddr); 2305 + if (uprobe) 2306 + return uprobe; 2307 + 2308 mmap_read_lock(mm); 2411 2309 vma = vma_lookup(mm, bp_vaddr); 2412 2310 if (vma) { 2413 - if (valid_vma(vma, false)) { 2311 + if (vma->vm_file) { 2414 2312 struct inode *inode = file_inode(vma->vm_file); 2415 2313 loff_t offset = vaddr_to_offset(vma, 
bp_vaddr); 2416 2314 ··· 2434 2324 return uprobe; 2435 2325 } 2436 2326 2437 - static struct return_instance* 2438 - push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) 2327 + static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) 2439 2328 { 2329 + struct return_consumer *ric; 2330 + 2440 2331 if (unlikely(ri == ZERO_SIZE_PTR)) 2441 2332 return ri; 2442 2333 2443 - if (unlikely(idx >= ri->consumers_cnt)) { 2444 - struct return_instance *old_ri = ri; 2445 - 2446 - ri->consumers_cnt += DEF_CNT; 2447 - ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); 2448 - if (!ri) { 2449 - kfree(old_ri); 2334 + if (unlikely(ri->cons_cnt > 0)) { 2335 + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); 2336 + if (!ric) { 2337 + ri_free(ri); 2450 2338 return ZERO_SIZE_PTR; 2451 2339 } 2340 + ri->extra_consumers = ric; 2452 2341 } 2453 2342 2454 - ri->consumers[idx].id = id; 2455 - ri->consumers[idx].cookie = cookie; 2343 + ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; 2344 + ric->id = id; 2345 + ric->cookie = cookie; 2346 + 2347 + ri->cons_cnt++; 2456 2348 return ri; 2457 2349 } 2458 2350 ··· 2462 2350 return_consumer_find(struct return_instance *ri, int *iter, int id) 2463 2351 { 2464 2352 struct return_consumer *ric; 2465 - int idx = *iter; 2353 + int idx; 2466 2354 2467 - for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { 2355 + for (idx = *iter; idx < ri->cons_cnt; idx++) 2356 + { 2357 + ric = likely(idx == 0) ? 
&ri->consumer : &ri->extra_consumers[idx - 1]; 2468 2358 if (ric->id == id) { 2469 2359 *iter = idx + 1; 2470 2360 return ric; 2471 2361 } 2472 2362 } 2363 + 2473 2364 return NULL; 2474 2365 } 2475 2366 ··· 2486 2371 struct uprobe_consumer *uc; 2487 2372 bool has_consumers = false, remove = true; 2488 2373 struct return_instance *ri = NULL; 2489 - int push_idx = 0; 2374 + struct uprobe_task *utask = current->utask; 2490 2375 2491 - current->utask->auprobe = &uprobe->arch; 2376 + utask->auprobe = &uprobe->arch; 2492 2377 2493 2378 list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { 2494 2379 bool session = uc->handler && uc->ret_handler; ··· 2508 2393 continue; 2509 2394 2510 2395 if (!ri) 2511 - ri = alloc_return_instance(); 2396 + ri = alloc_return_instance(utask); 2512 2397 2513 2398 if (session) 2514 - ri = push_consumer(ri, push_idx++, uc->id, cookie); 2399 + ri = push_consumer(ri, uc->id, cookie); 2515 2400 } 2516 - current->utask->auprobe = NULL; 2401 + utask->auprobe = NULL; 2517 2402 2518 - if (!ZERO_OR_NULL_PTR(ri)) { 2519 - /* 2520 - * The push_idx value has the final number of return consumers, 2521 - * and ri->consumers_cnt has number of allocated consumers. 2522 - */ 2523 - ri->consumers_cnt = push_idx; 2403 + if (!ZERO_OR_NULL_PTR(ri)) 2524 2404 prepare_uretprobe(uprobe, regs, ri); 2525 - } 2526 2405 2527 2406 if (remove && has_consumers) { 2528 2407 down_read(&uprobe->register_rwsem); ··· 2570 2461 void uprobe_handle_trampoline(struct pt_regs *regs) 2571 2462 { 2572 2463 struct uprobe_task *utask; 2573 - struct return_instance *ri, *next; 2464 + struct return_instance *ri, *ri_next, *next_chain; 2574 2465 struct uprobe *uprobe; 2575 2466 enum hprobe_state hstate; 2576 2467 bool valid; ··· 2590 2481 * or NULL; the latter case means that nobody but ri->func 2591 2482 * could hit this trampoline on return. TODO: sigaltstack(). 
2592 2483 */ 2593 - next = find_next_ret_chain(ri); 2594 - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); 2484 + next_chain = find_next_ret_chain(ri); 2485 + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); 2595 2486 2596 2487 instruction_pointer_set(regs, ri->orig_ret_vaddr); 2597 2488 do { ··· 2603 2494 * trampoline addresses on the stack are replaced with correct 2604 2495 * original return addresses 2605 2496 */ 2606 - rcu_assign_pointer(utask->return_instances, ri->next); 2497 + ri_next = ri->next; 2498 + rcu_assign_pointer(utask->return_instances, ri_next); 2499 + utask->depth--; 2607 2500 2608 2501 uprobe = hprobe_consume(&ri->hprobe, &hstate); 2609 2502 if (valid) ··· 2613 2502 hprobe_finalize(&ri->hprobe, hstate); 2614 2503 2615 2504 /* We already took care of hprobe, no need to waste more time on that. */ 2616 - ri = free_ret_instance(ri, false /* !cleanup_hprobe */); 2617 - utask->depth--; 2618 - } while (ri != next); 2505 + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); 2506 + ri = ri_next; 2507 + } while (ri != next_chain); 2619 2508 } while (!valid); 2620 2509 2621 2510 return;
+1 -4
kernel/fork.c
··· 448 448 return false; 449 449 450 450 init_rwsem(&vma->vm_lock->lock); 451 - vma->vm_lock_seq = -1; 451 + vma->vm_lock_seq = UINT_MAX; 452 452 453 453 return true; 454 454 } ··· 1262 1262 seqcount_init(&mm->write_protect_seq); 1263 1263 mmap_init_lock(mm); 1264 1264 INIT_LIST_HEAD(&mm->mmlist); 1265 - #ifdef CONFIG_PER_VMA_LOCK 1266 - mm->mm_lock_seq = 0; 1267 - #endif 1268 1265 mm_pgtables_bytes_init(mm); 1269 1266 mm->map_count = 0; 1270 1267 mm->locked_vm = 0;
+6 -5
kernel/trace/bpf_trace.c
··· 619 619 620 620 static __always_inline u64 621 621 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 622 - u64 flags, struct perf_sample_data *sd) 622 + u64 flags, struct perf_raw_record *raw, 623 + struct perf_sample_data *sd) 623 624 { 624 625 struct bpf_array *array = container_of(map, struct bpf_array, map); 625 626 unsigned int cpu = smp_processor_id(); ··· 644 643 645 644 if (unlikely(event->oncpu != cpu)) 646 645 return -EOPNOTSUPP; 646 + 647 + perf_sample_save_raw_data(sd, event, raw); 647 648 648 649 return perf_event_output(event, sd, regs); 649 650 } ··· 690 687 } 691 688 692 689 perf_sample_data_init(sd, 0, 0); 693 - perf_sample_save_raw_data(sd, &raw); 694 690 695 - err = __bpf_perf_event_output(regs, map, flags, sd); 691 + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); 696 692 out: 697 693 this_cpu_dec(bpf_trace_nest_level); 698 694 preempt_enable(); ··· 750 748 751 749 perf_fetch_caller_regs(regs); 752 750 perf_sample_data_init(sd, 0, 0); 753 - perf_sample_save_raw_data(sd, &raw); 754 751 755 - ret = __bpf_perf_event_output(regs, map, flags, sd); 752 + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); 756 753 out: 757 754 this_cpu_dec(bpf_event_output_nest_level); 758 755 preempt_enable();
+1 -2
mm/gup.c
··· 3360 3360 return 0; 3361 3361 3362 3362 if (gup_flags & FOLL_PIN) { 3363 - seq = raw_read_seqcount(&current->mm->write_protect_seq); 3364 - if (seq & 1) 3363 + if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq)) 3365 3364 return 0; 3366 3365 } 3367 3366
+1 -1
mm/init-mm.c
··· 40 40 .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), 41 41 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 42 42 #ifdef CONFIG_PER_VMA_LOCK 43 - .mm_lock_seq = 0, 43 + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), 44 44 #endif 45 45 .user_ns = &init_user_ns, 46 46 .cpu_bitmap = CPU_BITS_NONE,
+2 -2
tools/testing/vma/vma.c
··· 89 89 * begun. Linking to the tree will have caused this to be incremented, 90 90 * which means we will get a false positive otherwise. 91 91 */ 92 - vma->vm_lock_seq = -1; 92 + vma->vm_lock_seq = UINT_MAX; 93 93 94 94 return vma; 95 95 } ··· 214 214 int seq = vma->vm_lock_seq; 215 215 216 216 /* We reset after each check. */ 217 - vma->vm_lock_seq = -1; 217 + vma->vm_lock_seq = UINT_MAX; 218 218 219 219 /* The vma_start_write() stub simply increments this value. */ 220 220 return seq > -1;
+2 -2
tools/testing/vma/vma_internal.h
··· 241 241 * counter reuse can only lead to occasional unnecessary use of the 242 242 * slowpath. 243 243 */ 244 - int vm_lock_seq; 244 + unsigned int vm_lock_seq; 245 245 struct vma_lock *vm_lock; 246 246 #endif 247 247 ··· 416 416 return false; 417 417 418 418 init_rwsem(&vma->vm_lock->lock); 419 - vma->vm_lock_seq = -1; 419 + vma->vm_lock_seq = UINT_MAX; 420 420 421 421 return true; 422 422 }