Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:
"PMU driver updates:

- Add AMD Last Branch Record Extension Version 2 (LbrExtV2) feature
support for Zen 4 processors.

- Extend the perf ABI to provide branch speculation information, if
available, and use this on CPUs that have it (e.g. LbrExtV2).

- Improve Intel PEBS TSC timestamp handling & integration.

- Add Intel Raptor Lake S CPU support.

- Add 'perf mem' and 'perf c2c' memory profiling support on AMD CPUs
by utilizing IBS tagged load/store samples.

- Clean up & optimize various x86 PMU details.

HW breakpoints:

- Big rework to optimize the code for systems with hundreds of CPUs
and thousands of breakpoints:

- Replace the nr_bp_mutex global mutex with the bp_cpuinfo_sem
per-CPU rwsem that is read-locked during most of the key
operations.

- Improve the O(#cpus * #tasks) logic in toggle_bp_slot() and
fetch_bp_busy_slots().

- Apply micro-optimizations & cleanups.

- Misc cleanups & enhancements"

* tag 'perf-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (75 commits)
perf/hw_breakpoint: Annotate tsk->perf_event_mutex vs ctx->mutex
perf: Fix pmu_filter_match()
perf: Fix lockdep_assert_event_ctx()
perf/x86/amd/lbr: Adjust LBR regardless of filtering
perf/x86/utils: Fix uninitialized var in get_branch_type()
perf/uapi: Define PERF_MEM_SNOOPX_PEER in kernel header file
perf/x86/amd: Support PERF_SAMPLE_PHY_ADDR
perf/x86/amd: Support PERF_SAMPLE_ADDR
perf/x86/amd: Support PERF_SAMPLE_{WEIGHT|WEIGHT_STRUCT}
perf/x86/amd: Support PERF_SAMPLE_DATA_SRC
perf/x86/amd: Add IBS OP_DATA2 DataSrc bit definitions
perf/mem: Introduce PERF_MEM_LVLNUM_{EXTN_MEM|IO}
perf/x86/uncore: Add new Raptor Lake S support
perf/x86/cstate: Add new Raptor Lake S support
perf/x86/msr: Add new Raptor Lake S support
perf/x86: Add new Raptor Lake S support
bpf: Check flags for branch stack in bpf_read_branch_records helper
perf, hw_breakpoint: Fix use-after-free if perf_event_open() fails
perf: Use sample_flags for raw_data
perf: Use sample_flags for addr
...

+2613 -754
+40 -13
arch/powerpc/kernel/hw_breakpoint.c
··· 15 15 #include <linux/kernel.h> 16 16 #include <linux/sched.h> 17 17 #include <linux/smp.h> 18 + #include <linux/spinlock.h> 18 19 #include <linux/debugfs.h> 19 20 #include <linux/init.h> 20 21 ··· 130 129 bool ptrace_bp; 131 130 }; 132 131 132 + /* 133 + * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot 134 + * rely on it safely synchronizing internals here; however, we can rely on it 135 + * not requesting more breakpoints than available. 136 + */ 137 + static DEFINE_SPINLOCK(cpu_bps_lock); 133 138 static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]); 139 + static DEFINE_SPINLOCK(task_bps_lock); 134 140 static LIST_HEAD(task_bps); 135 141 136 142 static struct breakpoint *alloc_breakpoint(struct perf_event *bp) ··· 182 174 if (IS_ERR(tmp)) 183 175 return PTR_ERR(tmp); 184 176 177 + spin_lock(&task_bps_lock); 185 178 list_add(&tmp->list, &task_bps); 179 + spin_unlock(&task_bps_lock); 186 180 return 0; 187 181 } 188 182 ··· 192 182 { 193 183 struct list_head *pos, *q; 194 184 185 + spin_lock(&task_bps_lock); 195 186 list_for_each_safe(pos, q, &task_bps) { 196 187 struct breakpoint *tmp = list_entry(pos, struct breakpoint, list); 197 188 ··· 202 191 break; 203 192 } 204 193 } 194 + spin_unlock(&task_bps_lock); 205 195 } 206 196 207 197 /* ··· 212 200 static bool all_task_bps_check(struct perf_event *bp) 213 201 { 214 202 struct breakpoint *tmp; 203 + bool ret = false; 215 204 205 + spin_lock(&task_bps_lock); 216 206 list_for_each_entry(tmp, &task_bps, list) { 217 - if (!can_co_exist(tmp, bp)) 218 - return true; 207 + if (!can_co_exist(tmp, bp)) { 208 + ret = true; 209 + break; 210 + } 219 211 } 220 - return false; 212 + spin_unlock(&task_bps_lock); 213 + return ret; 221 214 } 222 215 223 216 /* ··· 232 215 static bool same_task_bps_check(struct perf_event *bp) 233 216 { 234 217 struct breakpoint *tmp; 218 + bool ret = false; 235 219 220 + spin_lock(&task_bps_lock); 236 221 list_for_each_entry(tmp, &task_bps, list) { 237 
222 if (tmp->bp->hw.target == bp->hw.target && 238 - !can_co_exist(tmp, bp)) 239 - return true; 223 + !can_co_exist(tmp, bp)) { 224 + ret = true; 225 + break; 226 + } 240 227 } 241 - return false; 228 + spin_unlock(&task_bps_lock); 229 + return ret; 242 230 } 243 231 244 232 static int cpu_bps_add(struct perf_event *bp) ··· 256 234 if (IS_ERR(tmp)) 257 235 return PTR_ERR(tmp); 258 236 237 + spin_lock(&cpu_bps_lock); 259 238 cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); 260 239 for (i = 0; i < nr_wp_slots(); i++) { 261 240 if (!cpu_bp[i]) { ··· 264 241 break; 265 242 } 266 243 } 244 + spin_unlock(&cpu_bps_lock); 267 245 return 0; 268 246 } 269 247 ··· 273 249 struct breakpoint **cpu_bp; 274 250 int i = 0; 275 251 252 + spin_lock(&cpu_bps_lock); 276 253 cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); 277 254 for (i = 0; i < nr_wp_slots(); i++) { 278 255 if (!cpu_bp[i]) ··· 285 260 break; 286 261 } 287 262 } 263 + spin_unlock(&cpu_bps_lock); 288 264 } 289 265 290 266 static bool cpu_bps_check(int cpu, struct perf_event *bp) 291 267 { 292 268 struct breakpoint **cpu_bp; 269 + bool ret = false; 293 270 int i; 294 271 272 + spin_lock(&cpu_bps_lock); 295 273 cpu_bp = per_cpu_ptr(cpu_bps, cpu); 296 274 for (i = 0; i < nr_wp_slots(); i++) { 297 - if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) 298 - return true; 275 + if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) { 276 + ret = true; 277 + break; 278 + } 299 279 } 300 - return false; 280 + spin_unlock(&cpu_bps_lock); 281 + return ret; 301 282 } 302 283 303 284 static bool all_cpu_bps_check(struct perf_event *bp) ··· 317 286 return false; 318 287 } 319 288 320 - /* 321 - * We don't use any locks to serialize accesses to cpu_bps or task_bps 322 - * because are already inside nr_bp_mutex. 323 - */ 324 289 int arch_reserve_bp_slot(struct perf_event *bp) 325 290 { 326 291 int ret;
+7 -3
arch/powerpc/perf/core-book3s.c
··· 2314 2314 cpuhw = this_cpu_ptr(&cpu_hw_events); 2315 2315 power_pmu_bhrb_read(event, cpuhw); 2316 2316 data.br_stack = &cpuhw->bhrb_stack; 2317 + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; 2317 2318 } 2318 2319 2319 2320 if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && 2320 - ppmu->get_mem_data_src) 2321 + ppmu->get_mem_data_src) { 2321 2322 ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); 2323 + data.sample_flags |= PERF_SAMPLE_DATA_SRC; 2324 + } 2322 2325 2323 2326 if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && 2324 - ppmu->get_mem_weight) 2327 + ppmu->get_mem_weight) { 2325 2328 ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); 2326 - 2329 + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 2330 + } 2327 2331 if (perf_event_overflow(event, &data, regs)) 2328 2332 power_pmu_stop(event, 0); 2329 2333 } else if (period) {
+1
arch/s390/kernel/perf_cpum_cf.c
··· 664 664 raw.frag.data = cpuhw->stop; 665 665 raw.size = raw.frag.size; 666 666 data.raw = &raw; 667 + data.sample_flags |= PERF_SAMPLE_RAW; 667 668 } 668 669 669 670 overflow = perf_event_overflow(event, &data, &regs);
+1
arch/s390/kernel/perf_pai_crypto.c
··· 366 366 raw.frag.data = cpump->save; 367 367 raw.size = raw.frag.size; 368 368 data.raw = &raw; 369 + data.sample_flags |= PERF_SAMPLE_RAW; 369 370 } 370 371 371 372 overflow = perf_event_overflow(event, &data, &regs);
+1 -4
arch/sh/include/asm/hw_breakpoint.h
··· 48 48 /* Maximum number of UBC channels */ 49 49 #define HBP_NUM 2 50 50 51 - static inline int hw_breakpoint_slots(int type) 52 - { 53 - return HBP_NUM; 54 - } 51 + #define hw_breakpoint_slots(type) (HBP_NUM) 55 52 56 53 /* arch/sh/kernel/hw_breakpoint.c */ 57 54 extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
+1 -1
arch/x86/events/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - obj-y += core.o probe.o 2 + obj-y += core.o probe.o utils.o 3 3 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o 4 4 obj-y += amd/ 5 5 obj-$(CONFIG_X86_LOCAL_APIC) += msr.o
+1 -1
arch/x86/events/amd/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 - obj-$(CONFIG_CPU_SUP_AMD) += core.o 2 + obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o 3 3 obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o 4 4 obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o 5 5 obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
+68 -1
arch/x86/events/amd/brs.c
··· 81 81 * a br_sel_map. Software filtering is not supported because it would not correlate well 82 82 * with a sampling period. 83 83 */ 84 - int amd_brs_setup_filter(struct perf_event *event) 84 + static int amd_brs_setup_filter(struct perf_event *event) 85 85 { 86 86 u64 type = event->attr.branch_sample_type; 87 87 ··· 94 94 return -EINVAL; 95 95 96 96 return 0; 97 + } 98 + 99 + static inline int amd_is_brs_event(struct perf_event *e) 100 + { 101 + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; 102 + } 103 + 104 + int amd_brs_hw_config(struct perf_event *event) 105 + { 106 + int ret = 0; 107 + 108 + /* 109 + * Due to interrupt holding, BRS is not recommended in 110 + * counting mode. 111 + */ 112 + if (!is_sampling_event(event)) 113 + return -EINVAL; 114 + 115 + /* 116 + * Due to the way BRS operates by holding the interrupt until 117 + * lbr_nr entries have been captured, it does not make sense 118 + * to allow sampling on BRS with an event that does not match 119 + * what BRS is capturing, i.e., retired taken branches. 120 + * Otherwise the correlation with the event's period is even 121 + * more loose: 122 + * 123 + * With retired taken branch: 124 + * Effective P = P + 16 + X 125 + * With any other event: 126 + * Effective P = P + Y + X 127 + * 128 + * Where X is the number of taken branches due to interrupt 129 + * skid. Skid is large. 130 + * 131 + * Where Y is the occurences of the event while BRS is 132 + * capturing the lbr_nr entries. 133 + * 134 + * By using retired taken branches, we limit the impact on the 135 + * Y variable. We know it cannot be more than the depth of 136 + * BRS. 137 + */ 138 + if (!amd_is_brs_event(event)) 139 + return -EINVAL; 140 + 141 + /* 142 + * BRS implementation does not work with frequency mode 143 + * reprogramming of the period. 144 + */ 145 + if (event->attr.freq) 146 + return -EINVAL; 147 + /* 148 + * The kernel subtracts BRS depth from period, so it must 149 + * be big enough. 
150 + */ 151 + if (event->attr.sample_period <= x86_pmu.lbr_nr) 152 + return -EINVAL; 153 + 154 + /* 155 + * Check if we can allow PERF_SAMPLE_BRANCH_STACK 156 + */ 157 + ret = amd_brs_setup_filter(event); 158 + 159 + /* only set in case of success */ 160 + if (!ret) 161 + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; 162 + 163 + return ret; 97 164 } 98 165 99 166 /* tos = top of stack, i.e., last valid entry written */
+102 -108
arch/x86/events/amd/core.c
··· 330 330 } 331 331 } 332 332 333 - #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ 334 - static inline int amd_is_brs_event(struct perf_event *e) 335 - { 336 - return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; 337 - } 333 + DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); 338 334 339 335 static int amd_core_hw_config(struct perf_event *event) 340 336 { 341 - int ret = 0; 342 - 343 337 if (event->attr.exclude_host && event->attr.exclude_guest) 344 338 /* 345 339 * When HO == GO == 1 the hardware treats that as GO == HO == 0 ··· 350 356 if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) 351 357 event->hw.flags |= PERF_X86_EVENT_PAIR; 352 358 353 - /* 354 - * if branch stack is requested 355 - */ 356 - if (has_branch_stack(event)) { 357 - /* 358 - * Due to interrupt holding, BRS is not recommended in 359 - * counting mode. 360 - */ 361 - if (!is_sampling_event(event)) 362 - return -EINVAL; 359 + if (has_branch_stack(event)) 360 + return static_call(amd_pmu_branch_hw_config)(event); 363 361 364 - /* 365 - * Due to the way BRS operates by holding the interrupt until 366 - * lbr_nr entries have been captured, it does not make sense 367 - * to allow sampling on BRS with an event that does not match 368 - * what BRS is capturing, i.e., retired taken branches. 369 - * Otherwise the correlation with the event's period is even 370 - * more loose: 371 - * 372 - * With retired taken branch: 373 - * Effective P = P + 16 + X 374 - * With any other event: 375 - * Effective P = P + Y + X 376 - * 377 - * Where X is the number of taken branches due to interrupt 378 - * skid. Skid is large. 379 - * 380 - * Where Y is the occurences of the event while BRS is 381 - * capturing the lbr_nr entries. 382 - * 383 - * By using retired taken branches, we limit the impact on the 384 - * Y variable. We know it cannot be more than the depth of 385 - * BRS. 
386 - */ 387 - if (!amd_is_brs_event(event)) 388 - return -EINVAL; 389 - 390 - /* 391 - * BRS implementation does not work with frequency mode 392 - * reprogramming of the period. 393 - */ 394 - if (event->attr.freq) 395 - return -EINVAL; 396 - /* 397 - * The kernel subtracts BRS depth from period, so it must 398 - * be big enough. 399 - */ 400 - if (event->attr.sample_period <= x86_pmu.lbr_nr) 401 - return -EINVAL; 402 - 403 - /* 404 - * Check if we can allow PERF_SAMPLE_BRANCH_STACK 405 - */ 406 - ret = amd_brs_setup_filter(event); 407 - 408 - /* only set in case of success */ 409 - if (!ret) 410 - event->hw.flags |= PERF_X86_EVENT_AMD_BRS; 411 - } 412 - return ret; 362 + return 0; 413 363 } 414 364 415 365 static inline int amd_is_nb_event(struct hw_perf_event *hwc) ··· 520 582 return nb; 521 583 } 522 584 585 + typedef void (amd_pmu_branch_reset_t)(void); 586 + DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); 587 + 523 588 static void amd_pmu_cpu_reset(int cpu) 524 589 { 590 + if (x86_pmu.lbr_nr) 591 + static_call(amd_pmu_branch_reset)(); 592 + 525 593 if (x86_pmu.version < 2) 526 594 return; 527 595 ··· 542 598 { 543 599 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 544 600 601 + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, 602 + cpu_to_node(cpu)); 603 + if (!cpuc->lbr_sel) 604 + return -ENOMEM; 605 + 545 606 WARN_ON_ONCE(cpuc->amd_nb); 546 607 547 608 if (!x86_pmu.amd_nb_constraints) 548 609 return 0; 549 610 550 611 cpuc->amd_nb = amd_alloc_nb(cpu); 551 - if (!cpuc->amd_nb) 552 - return -ENOMEM; 612 + if (cpuc->amd_nb) 613 + return 0; 553 614 554 - return 0; 615 + kfree(cpuc->lbr_sel); 616 + cpuc->lbr_sel = NULL; 617 + 618 + return -ENOMEM; 555 619 } 556 620 557 621 static void amd_pmu_cpu_starting(int cpu) ··· 592 640 cpuc->amd_nb->nb_id = nb_id; 593 641 cpuc->amd_nb->refcnt++; 594 642 595 - amd_brs_reset(); 596 643 amd_pmu_cpu_reset(cpu); 597 644 } 598 645 599 646 static void amd_pmu_cpu_dead(int 
cpu) 600 647 { 601 - struct cpu_hw_events *cpuhw; 648 + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); 649 + 650 + kfree(cpuhw->lbr_sel); 651 + cpuhw->lbr_sel = NULL; 602 652 603 653 if (!x86_pmu.amd_nb_constraints) 604 654 return; 605 - 606 - cpuhw = &per_cpu(cpu_hw_events, cpu); 607 655 608 656 if (cpuhw->amd_nb) { 609 657 struct amd_nb *nb = cpuhw->amd_nb; ··· 629 677 /* PerfCntrGlobalStatus is read-only */ 630 678 rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); 631 679 632 - return status & amd_pmu_global_cntr_mask; 680 + return status; 633 681 } 634 682 635 683 static inline void amd_pmu_ack_global_status(u64 status) ··· 640 688 * clears the same bit in PerfCntrGlobalStatus 641 689 */ 642 690 643 - /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ 644 - status &= amd_pmu_global_cntr_mask; 645 691 wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); 646 692 } 647 693 ··· 749 799 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); 750 800 } 751 801 752 - static void amd_pmu_v2_enable_all(int added) 802 + static __always_inline void amd_pmu_core_enable_all(void) 753 803 { 754 804 amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); 805 + } 806 + 807 + static void amd_pmu_v2_enable_all(int added) 808 + { 809 + amd_pmu_lbr_enable_all(); 810 + amd_pmu_core_enable_all(); 755 811 } 756 812 757 813 static void amd_pmu_disable_event(struct perf_event *event) ··· 784 828 amd_pmu_check_overflow(); 785 829 } 786 830 831 + static __always_inline void amd_pmu_core_disable_all(void) 832 + { 833 + amd_pmu_set_global_ctl(0); 834 + } 835 + 787 836 static void amd_pmu_v2_disable_all(void) 788 837 { 789 - /* Disable all PMCs */ 790 - amd_pmu_set_global_ctl(0); 838 + amd_pmu_core_disable_all(); 839 + amd_pmu_lbr_disable_all(); 791 840 amd_pmu_check_overflow(); 792 841 } 842 + 843 + DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); 793 844 794 845 static void amd_pmu_add_event(struct perf_event *event) 795 846 { 796 847 if 
(needs_branch_stack(event)) 797 - amd_pmu_brs_add(event); 848 + static_call(amd_pmu_branch_add)(event); 798 849 } 850 + 851 + DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); 799 852 800 853 static void amd_pmu_del_event(struct perf_event *event) 801 854 { 802 855 if (needs_branch_stack(event)) 803 - amd_pmu_brs_del(event); 856 + static_call(amd_pmu_branch_del)(event); 804 857 } 805 858 806 859 /* ··· 895 930 pmu_enabled = cpuc->enabled; 896 931 cpuc->enabled = 0; 897 932 898 - /* Stop counting */ 899 - amd_pmu_v2_disable_all(); 933 + /* Stop counting but do not disable LBR */ 934 + amd_pmu_core_disable_all(); 900 935 901 936 status = amd_pmu_get_global_status(); 902 937 903 938 /* Check if any overflows are pending */ 904 939 if (!status) 905 940 goto done; 941 + 942 + /* Read branch records before unfreezing */ 943 + if (status & GLOBAL_STATUS_LBRS_FROZEN) { 944 + amd_pmu_lbr_read(); 945 + status &= ~GLOBAL_STATUS_LBRS_FROZEN; 946 + } 906 947 907 948 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 908 949 if (!test_bit(idx, cpuc->active_mask)) ··· 929 958 if (!x86_perf_event_set_period(event)) 930 959 continue; 931 960 961 + if (has_branch_stack(event)) { 962 + data.br_stack = &cpuc->lbr_stack; 963 + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; 964 + } 965 + 932 966 if (perf_event_overflow(event, &data, regs)) 933 967 x86_pmu_stop(event, 0); 934 968 ··· 947 971 */ 948 972 WARN_ON(status > 0); 949 973 950 - /* Clear overflow bits */ 974 + /* Clear overflow and freeze bits */ 951 975 amd_pmu_ack_global_status(~status); 952 976 953 977 /* ··· 961 985 962 986 /* Resume counting only if PMU is active */ 963 987 if (pmu_enabled) 964 - amd_pmu_v2_enable_all(0); 988 + amd_pmu_core_enable_all(); 965 989 966 990 return amd_pmu_adjust_nmi_window(handled); 967 991 } ··· 1224 1248 return x86_event_sysfs_show(page, config, event); 1225 1249 } 1226 1250 1227 - static void amd_pmu_sched_task(struct perf_event_context *ctx, 1228 - bool sched_in) 1229 - { 1230 - if 
(sched_in && x86_pmu.lbr_nr) 1231 - amd_pmu_brs_sched_task(ctx, sched_in); 1232 - } 1233 - 1234 - static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) 1251 + static void amd_pmu_limit_period(struct perf_event *event, s64 *left) 1235 1252 { 1236 1253 /* 1237 1254 * Decrease period by the depth of the BRS feature to get the last N 1238 1255 * taken branches and approximate the desired period 1239 1256 */ 1240 - if (has_branch_stack(event) && left > x86_pmu.lbr_nr) 1241 - left -= x86_pmu.lbr_nr; 1242 - 1243 - return left; 1257 + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) 1258 + *left -= x86_pmu.lbr_nr; 1244 1259 } 1245 1260 1246 1261 static __initconst const struct x86_pmu amd_pmu = { ··· 1278 1311 1279 1312 static DEVICE_ATTR_RO(branches); 1280 1313 1281 - static struct attribute *amd_pmu_brs_attrs[] = { 1314 + static struct attribute *amd_pmu_branches_attrs[] = { 1282 1315 &dev_attr_branches.attr, 1283 1316 NULL, 1284 1317 }; 1285 1318 1286 1319 static umode_t 1287 - amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) 1320 + amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) 1288 1321 { 1289 1322 return x86_pmu.lbr_nr ? attr->mode : 0; 1290 1323 } 1291 1324 1292 - static struct attribute_group group_caps_amd_brs = { 1325 + static struct attribute_group group_caps_amd_branches = { 1293 1326 .name = "caps", 1294 - .attrs = amd_pmu_brs_attrs, 1295 - .is_visible = amd_brs_is_visible, 1327 + .attrs = amd_pmu_branches_attrs, 1328 + .is_visible = amd_branches_is_visible, 1296 1329 }; 1330 + 1331 + #ifdef CONFIG_PERF_EVENTS_AMD_BRS 1297 1332 1298 1333 EVENT_ATTR_STR(branch-brs, amd_branch_brs, 1299 1334 "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); ··· 1305 1336 NULL, 1306 1337 }; 1307 1338 1339 + static umode_t 1340 + amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) 1341 + { 1342 + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? 
1343 + attr->mode : 0; 1344 + } 1345 + 1308 1346 static struct attribute_group group_events_amd_brs = { 1309 1347 .name = "events", 1310 1348 .attrs = amd_brs_events_attrs, 1311 1349 .is_visible = amd_brs_is_visible, 1312 1350 }; 1313 1351 1352 + #endif /* CONFIG_PERF_EVENTS_AMD_BRS */ 1353 + 1314 1354 static const struct attribute_group *amd_attr_update[] = { 1315 - &group_caps_amd_brs, 1355 + &group_caps_amd_branches, 1356 + #ifdef CONFIG_PERF_EVENTS_AMD_BRS 1316 1357 &group_events_amd_brs, 1358 + #endif 1317 1359 NULL, 1318 1360 }; 1319 1361 ··· 1401 1421 x86_pmu.flags |= PMU_FL_PAIR; 1402 1422 } 1403 1423 1404 - /* 1405 - * BRS requires special event constraints and flushing on ctxsw. 1406 - */ 1407 - if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { 1424 + /* LBR and BRS are mutually exclusive features */ 1425 + if (!amd_pmu_lbr_init()) { 1426 + /* LBR requires flushing on context switch */ 1427 + x86_pmu.sched_task = amd_pmu_lbr_sched_task; 1428 + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); 1429 + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); 1430 + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); 1431 + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); 1432 + } else if (!amd_brs_init()) { 1433 + /* 1434 + * BRS requires special event constraints and flushing on ctxsw. 1435 + */ 1408 1436 x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; 1409 - x86_pmu.sched_task = amd_pmu_sched_task; 1437 + x86_pmu.sched_task = amd_pmu_brs_sched_task; 1410 1438 x86_pmu.limit_period = amd_pmu_limit_period; 1439 + 1440 + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); 1441 + static_call_update(amd_pmu_branch_reset, amd_brs_reset); 1442 + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); 1443 + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); 1444 + 1411 1445 /* 1412 1446 * put_event_constraints callback same as Fam17h, set above 1413 1447 */
+343 -17
arch/x86/events/amd/ibs.c
··· 300 300 hwc->config_base = perf_ibs->msr; 301 301 hwc->config = config; 302 302 303 - /* 304 - * rip recorded by IbsOpRip will not be consistent with rsp and rbp 305 - * recorded as part of interrupt regs. Thus we need to use rip from 306 - * interrupt regs while unwinding call stack. Setting _EARLY flag 307 - * makes sure we unwind call-stack before perf sample rip is set to 308 - * IbsOpRip. 309 - */ 310 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 311 - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; 312 - 313 303 return 0; 314 304 } 315 305 ··· 678 688 .get_count = get_ibs_op_count, 679 689 }; 680 690 691 + static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, 692 + struct perf_sample_data *data) 693 + { 694 + union perf_mem_data_src *data_src = &data->data_src; 695 + 696 + data_src->mem_op = PERF_MEM_OP_NA; 697 + 698 + if (op_data3->ld_op) 699 + data_src->mem_op = PERF_MEM_OP_LOAD; 700 + else if (op_data3->st_op) 701 + data_src->mem_op = PERF_MEM_OP_STORE; 702 + } 703 + 704 + /* 705 + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has 706 + * more fine granular DataSrc encodings. Others have coarse. 707 + */ 708 + static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) 709 + { 710 + if (ibs_caps & IBS_CAPS_ZEN4) 711 + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; 712 + 713 + return op_data2->data_src_lo; 714 + } 715 + 716 + static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, 717 + union ibs_op_data3 *op_data3, 718 + struct perf_sample_data *data) 719 + { 720 + union perf_mem_data_src *data_src = &data->data_src; 721 + u8 ibs_data_src = perf_ibs_data_src(op_data2); 722 + 723 + data_src->mem_lvl = 0; 724 + 725 + /* 726 + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached 727 + * memory accesses. So, check DcUcMemAcc bit early. 
728 + */ 729 + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { 730 + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; 731 + return; 732 + } 733 + 734 + /* L1 Hit */ 735 + if (op_data3->dc_miss == 0) { 736 + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; 737 + return; 738 + } 739 + 740 + /* L2 Hit */ 741 + if (op_data3->l2_miss == 0) { 742 + /* Erratum #1293 */ 743 + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || 744 + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { 745 + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; 746 + return; 747 + } 748 + } 749 + 750 + /* 751 + * OP_DATA2 is valid only for load ops. Skip all checks which 752 + * uses OP_DATA2[DataSrc]. 753 + */ 754 + if (data_src->mem_op != PERF_MEM_OP_LOAD) 755 + goto check_mab; 756 + 757 + /* L3 Hit */ 758 + if (ibs_caps & IBS_CAPS_ZEN4) { 759 + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { 760 + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; 761 + return; 762 + } 763 + } else { 764 + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { 765 + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | 766 + PERF_MEM_LVL_HIT; 767 + return; 768 + } 769 + } 770 + 771 + /* A peer cache in a near CCX */ 772 + if (ibs_caps & IBS_CAPS_ZEN4 && 773 + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { 774 + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; 775 + return; 776 + } 777 + 778 + /* A peer cache in a far CCX */ 779 + if (ibs_caps & IBS_CAPS_ZEN4) { 780 + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { 781 + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; 782 + return; 783 + } 784 + } else { 785 + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { 786 + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; 787 + return; 788 + } 789 + } 790 + 791 + /* DRAM */ 792 + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { 793 + if (op_data2->rmt_node == 0) 794 + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | 
PERF_MEM_LVL_HIT; 795 + else 796 + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; 797 + return; 798 + } 799 + 800 + /* PMEM */ 801 + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { 802 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; 803 + if (op_data2->rmt_node) { 804 + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 805 + /* IBS doesn't provide Remote socket detail */ 806 + data_src->mem_hops = PERF_MEM_HOPS_1; 807 + } 808 + return; 809 + } 810 + 811 + /* Extension Memory */ 812 + if (ibs_caps & IBS_CAPS_ZEN4 && 813 + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { 814 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; 815 + if (op_data2->rmt_node) { 816 + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 817 + /* IBS doesn't provide Remote socket detail */ 818 + data_src->mem_hops = PERF_MEM_HOPS_1; 819 + } 820 + return; 821 + } 822 + 823 + /* IO */ 824 + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { 825 + data_src->mem_lvl = PERF_MEM_LVL_IO; 826 + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; 827 + if (op_data2->rmt_node) { 828 + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; 829 + /* IBS doesn't provide Remote socket detail */ 830 + data_src->mem_hops = PERF_MEM_HOPS_1; 831 + } 832 + return; 833 + } 834 + 835 + check_mab: 836 + /* 837 + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding 838 + * DC misses. However, such data may come from any level in mem 839 + * hierarchy. IBS provides detail about both MAB as well as actual 840 + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set 841 + * MAB only when IBS fails to provide DataSrc. 
842 + */ 843 + if (op_data3->dc_miss_no_mab_alloc) { 844 + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; 845 + return; 846 + } 847 + 848 + data_src->mem_lvl = PERF_MEM_LVL_NA; 849 + } 850 + 851 + static bool perf_ibs_cache_hit_st_valid(void) 852 + { 853 + /* 0: Uninitialized, 1: Valid, -1: Invalid */ 854 + static int cache_hit_st_valid; 855 + 856 + if (unlikely(!cache_hit_st_valid)) { 857 + if (boot_cpu_data.x86 == 0x19 && 858 + (boot_cpu_data.x86_model <= 0xF || 859 + (boot_cpu_data.x86_model >= 0x20 && 860 + boot_cpu_data.x86_model <= 0x5F))) { 861 + cache_hit_st_valid = -1; 862 + } else { 863 + cache_hit_st_valid = 1; 864 + } 865 + } 866 + 867 + return cache_hit_st_valid == 1; 868 + } 869 + 870 + static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, 871 + struct perf_sample_data *data) 872 + { 873 + union perf_mem_data_src *data_src = &data->data_src; 874 + u8 ibs_data_src; 875 + 876 + data_src->mem_snoop = PERF_MEM_SNOOP_NA; 877 + 878 + if (!perf_ibs_cache_hit_st_valid() || 879 + data_src->mem_op != PERF_MEM_OP_LOAD || 880 + data_src->mem_lvl & PERF_MEM_LVL_L1 || 881 + data_src->mem_lvl & PERF_MEM_LVL_L2 || 882 + op_data2->cache_hit_st) 883 + return; 884 + 885 + ibs_data_src = perf_ibs_data_src(op_data2); 886 + 887 + if (ibs_caps & IBS_CAPS_ZEN4) { 888 + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || 889 + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || 890 + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) 891 + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 892 + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { 893 + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; 894 + } 895 + } 896 + 897 + static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, 898 + struct perf_sample_data *data) 899 + { 900 + union perf_mem_data_src *data_src = &data->data_src; 901 + 902 + data_src->mem_dtlb = PERF_MEM_TLB_NA; 903 + 904 + if (!op_data3->dc_lin_addr_valid) 905 + return; 906 + 907 + if (!op_data3->dc_l1tlb_miss) { 908 + data_src->mem_dtlb = 
PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; 909 + return; 910 + } 911 + 912 + if (!op_data3->dc_l2tlb_miss) { 913 + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; 914 + return; 915 + } 916 + 917 + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; 918 + } 919 + 920 + static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, 921 + struct perf_sample_data *data) 922 + { 923 + union perf_mem_data_src *data_src = &data->data_src; 924 + 925 + data_src->mem_lock = PERF_MEM_LOCK_NA; 926 + 927 + if (op_data3->dc_locked_op) 928 + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; 929 + } 930 + 931 + #define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) 932 + 933 + static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, 934 + struct perf_sample_data *data, 935 + union ibs_op_data2 *op_data2, 936 + union ibs_op_data3 *op_data3) 937 + { 938 + perf_ibs_get_mem_lvl(op_data2, op_data3, data); 939 + perf_ibs_get_mem_snoop(op_data2, data); 940 + perf_ibs_get_tlb_lvl(op_data3, data); 941 + perf_ibs_get_mem_lock(op_data3, data); 942 + } 943 + 944 + static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, 945 + union ibs_op_data3 *op_data3) 946 + { 947 + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; 948 + 949 + /* Erratum #1293 */ 950 + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && 951 + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { 952 + /* 953 + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. 954 + * DataSrc=0 is 'No valid status' and RmtNode is invalid when 955 + * DataSrc=0. 
956 + */ 957 + val = 0; 958 + } 959 + return val; 960 + } 961 + 962 + static void perf_ibs_parse_ld_st_data(__u64 sample_type, 963 + struct perf_ibs_data *ibs_data, 964 + struct perf_sample_data *data) 965 + { 966 + union ibs_op_data3 op_data3; 967 + union ibs_op_data2 op_data2; 968 + union ibs_op_data op_data; 969 + 970 + data->data_src.val = PERF_MEM_NA; 971 + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; 972 + 973 + perf_ibs_get_mem_op(&op_data3, data); 974 + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && 975 + data->data_src.mem_op != PERF_MEM_OP_STORE) 976 + return; 977 + 978 + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); 979 + 980 + if (sample_type & PERF_SAMPLE_DATA_SRC) { 981 + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); 982 + data->sample_flags |= PERF_SAMPLE_DATA_SRC; 983 + } 984 + 985 + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && 986 + data->data_src.mem_op == PERF_MEM_OP_LOAD) { 987 + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; 988 + 989 + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 990 + data->weight.var1_dw = op_data3.dc_miss_lat; 991 + data->weight.var2_w = op_data.tag_to_ret_ctr; 992 + } else if (sample_type & PERF_SAMPLE_WEIGHT) { 993 + data->weight.full = op_data3.dc_miss_lat; 994 + } 995 + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 996 + } 997 + 998 + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { 999 + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; 1000 + data->sample_flags |= PERF_SAMPLE_ADDR; 1001 + } 1002 + 1003 + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { 1004 + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; 1005 + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; 1006 + } 1007 + } 1008 + 1009 + static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, 1010 + int check_rip) 1011 + { 1012 + if (sample_type & PERF_SAMPLE_RAW || 
1013 + (perf_ibs == &perf_ibs_op && 1014 + (sample_type & PERF_SAMPLE_DATA_SRC || 1015 + sample_type & PERF_SAMPLE_WEIGHT_TYPE || 1016 + sample_type & PERF_SAMPLE_ADDR || 1017 + sample_type & PERF_SAMPLE_PHYS_ADDR))) 1018 + return perf_ibs->offset_max; 1019 + else if (check_rip) 1020 + return 3; 1021 + return 1; 1022 + } 1023 + 681 1024 static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) 682 1025 { 683 1026 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); ··· 1058 735 size = 1; 1059 736 offset = 1; 1060 737 check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); 1061 - if (event->attr.sample_type & PERF_SAMPLE_RAW) 1062 - offset_max = perf_ibs->offset_max; 1063 - else if (check_rip) 1064 - offset_max = 3; 1065 - else 1066 - offset_max = 1; 738 + 739 + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); 740 + 1067 741 do { 1068 742 rdmsrl(msr + offset, *buf++); 1069 743 size++; ··· 1111 791 }, 1112 792 }; 1113 793 data.raw = &raw; 794 + data.sample_flags |= PERF_SAMPLE_RAW; 1114 795 } 796 + 797 + if (perf_ibs == &perf_ibs_op) 798 + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); 1115 799 1116 800 /* 1117 801 * rip recorded by IbsOpRip will not be consistent with rsp and rbp 1118 802 * recorded as part of interrupt regs. Thus we need to use rip from 1119 803 * interrupt regs while unwinding call stack. 1120 804 */ 1121 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 805 + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 1122 806 data.callchain = perf_callchain(event, iregs); 807 + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; 808 + } 1123 809 1124 810 throttle = perf_event_overflow(event, &data, &regs); 1125 811 out:
+439
arch/x86/events/amd/lbr.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/perf_event.h> 3 + #include <asm/perf_event.h> 4 + 5 + #include "../perf_event.h" 6 + 7 + /* LBR Branch Select valid bits */ 8 + #define LBR_SELECT_MASK 0x1ff 9 + 10 + /* 11 + * LBR Branch Select filter bits which when set, ensures that the 12 + * corresponding type of branches are not recorded 13 + */ 14 + #define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ 15 + #define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ 16 + #define LBR_SELECT_JCC 2 /* Conditional branches */ 17 + #define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ 18 + #define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ 19 + #define LBR_SELECT_RET_NEAR 5 /* Near returns */ 20 + #define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ 21 + #define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */ 22 + #define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ 23 + 24 + #define LBR_KERNEL BIT(LBR_SELECT_KERNEL) 25 + #define LBR_USER BIT(LBR_SELECT_USER) 26 + #define LBR_JCC BIT(LBR_SELECT_JCC) 27 + #define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) 28 + #define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) 29 + #define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) 30 + #define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) 31 + #define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) 32 + #define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) 33 + #define LBR_NOT_SUPP -1 /* unsupported filter */ 34 + #define LBR_IGNORE 0 35 + 36 + #define LBR_ANY \ 37 + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ 38 + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) 39 + 40 + struct branch_entry { 41 + union { 42 + struct { 43 + u64 ip:58; 44 + u64 ip_sign_ext:5; 45 + u64 mispredict:1; 46 + } split; 47 + u64 full; 48 + } from; 49 + 50 + union { 51 + struct { 52 + u64 ip:58; 53 + u64 ip_sign_ext:3; 54 + u64 reserved:1; 55 + u64 spec:1; 56 + u64 valid:1; 57 + } split; 58 + u64 full; 59 + } to; 60 + }; 61 + 62 + static 
__always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) 63 + { 64 + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); 65 + } 66 + 67 + static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) 68 + { 69 + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); 70 + } 71 + 72 + static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) 73 + { 74 + u64 val; 75 + 76 + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); 77 + 78 + return val; 79 + } 80 + 81 + static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) 82 + { 83 + u64 val; 84 + 85 + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); 86 + 87 + return val; 88 + } 89 + 90 + static __always_inline u64 sign_ext_branch_ip(u64 ip) 91 + { 92 + u32 shift = 64 - boot_cpu_data.x86_virt_bits; 93 + 94 + return (u64)(((s64)ip << shift) >> shift); 95 + } 96 + 97 + static void amd_pmu_lbr_filter(void) 98 + { 99 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 100 + int br_sel = cpuc->br_sel, offset, type, i, j; 101 + bool compress = false; 102 + bool fused_only = false; 103 + u64 from, to; 104 + 105 + /* If sampling all branches, there is nothing to filter */ 106 + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && 107 + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) 108 + fused_only = true; 109 + 110 + for (i = 0; i < cpuc->lbr_stack.nr; i++) { 111 + from = cpuc->lbr_entries[i].from; 112 + to = cpuc->lbr_entries[i].to; 113 + type = branch_type_fused(from, to, 0, &offset); 114 + 115 + /* 116 + * Adjust the branch from address in case of instruction 117 + * fusion where it points to an instruction preceding the 118 + * actual branch 119 + */ 120 + if (offset) { 121 + cpuc->lbr_entries[i].from += offset; 122 + if (fused_only) 123 + continue; 124 + } 125 + 126 + /* If type does not correspond, then discard */ 127 + if (type == X86_BR_NONE || (br_sel & type) != type) { 128 + cpuc->lbr_entries[i].from = 0; /* mark invalid */ 129 + compress = true; 130 + } 131 + 132 + if ((br_sel & 
X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) 133 + cpuc->lbr_entries[i].type = common_branch_type(type); 134 + } 135 + 136 + if (!compress) 137 + return; 138 + 139 + /* Remove all invalid entries */ 140 + for (i = 0; i < cpuc->lbr_stack.nr; ) { 141 + if (!cpuc->lbr_entries[i].from) { 142 + j = i; 143 + while (++j < cpuc->lbr_stack.nr) 144 + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; 145 + cpuc->lbr_stack.nr--; 146 + if (!cpuc->lbr_entries[i].from) 147 + continue; 148 + } 149 + i++; 150 + } 151 + } 152 + 153 + static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { 154 + PERF_BR_SPEC_NA, 155 + PERF_BR_SPEC_WRONG_PATH, 156 + PERF_BR_NON_SPEC_CORRECT_PATH, 157 + PERF_BR_SPEC_CORRECT_PATH, 158 + }; 159 + 160 + void amd_pmu_lbr_read(void) 161 + { 162 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 163 + struct perf_branch_entry *br = cpuc->lbr_entries; 164 + struct branch_entry entry; 165 + int out = 0, idx, i; 166 + 167 + if (!cpuc->lbr_users) 168 + return; 169 + 170 + for (i = 0; i < x86_pmu.lbr_nr; i++) { 171 + entry.from.full = amd_pmu_lbr_get_from(i); 172 + entry.to.full = amd_pmu_lbr_get_to(i); 173 + 174 + /* 175 + * Check if a branch has been logged; if valid = 0, spec = 0 176 + * then no branch was recorded 177 + */ 178 + if (!entry.to.split.valid && !entry.to.split.spec) 179 + continue; 180 + 181 + perf_clear_branch_entry_bitfields(br + out); 182 + 183 + br[out].from = sign_ext_branch_ip(entry.from.split.ip); 184 + br[out].to = sign_ext_branch_ip(entry.to.split.ip); 185 + br[out].mispred = entry.from.split.mispredict; 186 + br[out].predicted = !br[out].mispred; 187 + 188 + /* 189 + * Set branch speculation information using the status of 190 + * the valid and spec bits. 191 + * 192 + * When valid = 0, spec = 0, no branch was recorded and the 193 + * entry is discarded as seen above. 194 + * 195 + * When valid = 0, spec = 1, the recorded branch was 196 + * speculative but took the wrong path. 
197 + * 198 + * When valid = 1, spec = 0, the recorded branch was 199 + * non-speculative but took the correct path. 200 + * 201 + * When valid = 1, spec = 1, the recorded branch was 202 + * speculative and took the correct path 203 + */ 204 + idx = (entry.to.split.valid << 1) | entry.to.split.spec; 205 + br[out].spec = lbr_spec_map[idx]; 206 + out++; 207 + } 208 + 209 + cpuc->lbr_stack.nr = out; 210 + 211 + /* 212 + * Internal register renaming always ensures that LBR From[0] and 213 + * LBR To[0] always represent the TOS 214 + */ 215 + cpuc->lbr_stack.hw_idx = 0; 216 + 217 + /* Perform further software filtering */ 218 + amd_pmu_lbr_filter(); 219 + } 220 + 221 + static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { 222 + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, 223 + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, 224 + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, 225 + 226 + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, 227 + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, 228 + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, 229 + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, 230 + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, 231 + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, 232 + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, 233 + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, 234 + 235 + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, 236 + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, 237 + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, 238 + 239 + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, 240 + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, 241 + }; 242 + 243 + static int amd_pmu_lbr_setup_filter(struct perf_event *event) 244 + { 245 + struct hw_perf_event_extra *reg = &event->hw.branch_reg; 246 + u64 br_type = event->attr.branch_sample_type; 247 + u64 mask = 0, v; 248 + int i; 249 + 250 + /* No LBR support */ 251 + if (!x86_pmu.lbr_nr) 252 + return 
-EOPNOTSUPP; 253 + 254 + if (br_type & PERF_SAMPLE_BRANCH_USER) 255 + mask |= X86_BR_USER; 256 + 257 + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) 258 + mask |= X86_BR_KERNEL; 259 + 260 + /* Ignore BRANCH_HV here */ 261 + 262 + if (br_type & PERF_SAMPLE_BRANCH_ANY) 263 + mask |= X86_BR_ANY; 264 + 265 + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) 266 + mask |= X86_BR_ANY_CALL; 267 + 268 + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) 269 + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; 270 + 271 + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) 272 + mask |= X86_BR_IND_CALL; 273 + 274 + if (br_type & PERF_SAMPLE_BRANCH_COND) 275 + mask |= X86_BR_JCC; 276 + 277 + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) 278 + mask |= X86_BR_IND_JMP; 279 + 280 + if (br_type & PERF_SAMPLE_BRANCH_CALL) 281 + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; 282 + 283 + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) 284 + mask |= X86_BR_TYPE_SAVE; 285 + 286 + reg->reg = mask; 287 + mask = 0; 288 + 289 + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { 290 + if (!(br_type & BIT_ULL(i))) 291 + continue; 292 + 293 + v = lbr_select_map[i]; 294 + if (v == LBR_NOT_SUPP) 295 + return -EOPNOTSUPP; 296 + 297 + if (v != LBR_IGNORE) 298 + mask |= v; 299 + } 300 + 301 + /* Filter bits operate in suppress mode */ 302 + reg->config = mask ^ LBR_SELECT_MASK; 303 + 304 + return 0; 305 + } 306 + 307 + int amd_pmu_lbr_hw_config(struct perf_event *event) 308 + { 309 + int ret = 0; 310 + 311 + /* LBR is not recommended in counting mode */ 312 + if (!is_sampling_event(event)) 313 + return -EINVAL; 314 + 315 + ret = amd_pmu_lbr_setup_filter(event); 316 + if (!ret) 317 + event->attach_state |= PERF_ATTACH_SCHED_CB; 318 + 319 + return ret; 320 + } 321 + 322 + void amd_pmu_lbr_reset(void) 323 + { 324 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 325 + int i; 326 + 327 + if (!x86_pmu.lbr_nr) 328 + return; 329 + 330 + /* Reset all branch records individually */ 331 + for (i = 0; i < x86_pmu.lbr_nr; i++) { 332 + 
amd_pmu_lbr_set_from(i, 0); 333 + amd_pmu_lbr_set_to(i, 0); 334 + } 335 + 336 + cpuc->last_task_ctx = NULL; 337 + cpuc->last_log_id = 0; 338 + wrmsrl(MSR_AMD64_LBR_SELECT, 0); 339 + } 340 + 341 + void amd_pmu_lbr_add(struct perf_event *event) 342 + { 343 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 344 + struct hw_perf_event_extra *reg = &event->hw.branch_reg; 345 + 346 + if (!x86_pmu.lbr_nr) 347 + return; 348 + 349 + if (has_branch_stack(event)) { 350 + cpuc->lbr_select = 1; 351 + cpuc->lbr_sel->config = reg->config; 352 + cpuc->br_sel = reg->reg; 353 + } 354 + 355 + perf_sched_cb_inc(event->ctx->pmu); 356 + 357 + if (!cpuc->lbr_users++ && !event->total_time_running) 358 + amd_pmu_lbr_reset(); 359 + } 360 + 361 + void amd_pmu_lbr_del(struct perf_event *event) 362 + { 363 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 364 + 365 + if (!x86_pmu.lbr_nr) 366 + return; 367 + 368 + if (has_branch_stack(event)) 369 + cpuc->lbr_select = 0; 370 + 371 + cpuc->lbr_users--; 372 + WARN_ON_ONCE(cpuc->lbr_users < 0); 373 + perf_sched_cb_dec(event->ctx->pmu); 374 + } 375 + 376 + void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) 377 + { 378 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 379 + 380 + /* 381 + * A context switch can flip the address space and LBR entries are 382 + * not tagged with an identifier. Hence, branches cannot be resolved 383 + * from the old address space and the LBR records should be wiped. 
384 + */ 385 + if (cpuc->lbr_users && sched_in) 386 + amd_pmu_lbr_reset(); 387 + } 388 + 389 + void amd_pmu_lbr_enable_all(void) 390 + { 391 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 392 + u64 lbr_select, dbg_ctl, dbg_extn_cfg; 393 + 394 + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) 395 + return; 396 + 397 + /* Set hardware branch filter */ 398 + if (cpuc->lbr_select) { 399 + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; 400 + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); 401 + } 402 + 403 + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); 404 + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); 405 + 406 + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 407 + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); 408 + } 409 + 410 + void amd_pmu_lbr_disable_all(void) 411 + { 412 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 413 + u64 dbg_ctl, dbg_extn_cfg; 414 + 415 + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) 416 + return; 417 + 418 + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); 419 + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); 420 + 421 + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); 422 + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 423 + } 424 + 425 + __init int amd_pmu_lbr_init(void) 426 + { 427 + union cpuid_0x80000022_ebx ebx; 428 + 429 + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) 430 + return -EOPNOTSUPP; 431 + 432 + /* Set number of entries */ 433 + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); 434 + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; 435 + 436 + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); 437 + 438 + return 0; 439 + }
+31 -30
arch/x86/events/core.c
··· 72 72 DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); 73 73 DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); 74 74 75 + DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); 76 + DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); 77 + DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); 78 + 75 79 DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); 76 80 DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); 77 81 DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); ··· 119 115 120 116 if (unlikely(!hwc->event_base)) 121 117 return 0; 122 - 123 - if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) 124 - return x86_pmu.update_topdown_event(event); 125 118 126 119 /* 127 120 * Careful: an NMI might modify the previous event value. ··· 622 621 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; 623 622 624 623 if (event->attr.sample_period && x86_pmu.limit_period) { 625 - if (x86_pmu.limit_period(event, event->attr.sample_period) > 626 - event->attr.sample_period) 624 + s64 left = event->attr.sample_period; 625 + x86_pmu.limit_period(event, &left); 626 + if (left > event->attr.sample_period) 627 627 return -EINVAL; 628 628 } 629 629 ··· 1356 1354 static_call(x86_pmu_enable_all)(added); 1357 1355 } 1358 1356 1359 - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1357 + DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1360 1358 1361 1359 /* 1362 1360 * Set the next IRQ period, based on the hwc->period_left value. 
··· 1371 1369 1372 1370 if (unlikely(!hwc->event_base)) 1373 1371 return 0; 1374 - 1375 - if (unlikely(is_topdown_count(event)) && 1376 - x86_pmu.set_topdown_event_period) 1377 - return x86_pmu.set_topdown_event_period(event); 1378 1372 1379 1373 /* 1380 1374 * If we are way outside a reasonable range then just skip forward: ··· 1397 1399 if (left > x86_pmu.max_period) 1398 1400 left = x86_pmu.max_period; 1399 1401 1400 - if (x86_pmu.limit_period) 1401 - left = x86_pmu.limit_period(event, left); 1402 + static_call_cond(x86_pmu_limit_period)(event, &left); 1402 1403 1403 - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; 1404 + this_cpu_write(pmc_prev_left[idx], left); 1404 1405 1405 1406 /* 1406 1407 * The hw event starts counting from this event offset, ··· 1415 1418 */ 1416 1419 if (is_counter_pair(hwc)) 1417 1420 wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); 1418 - 1419 - /* 1420 - * Due to erratum on certan cpu we need 1421 - * a second write to be sure the register 1422 - * is updated properly 1423 - */ 1424 - if (x86_pmu.perfctr_second_write) { 1425 - wrmsrl(hwc->event_base, 1426 - (u64)(-left) & x86_pmu.cntval_mask); 1427 - } 1428 1421 1429 1422 perf_event_update_userpage(event); 1430 1423 ··· 1505 1518 1506 1519 if (flags & PERF_EF_RELOAD) { 1507 1520 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); 1508 - x86_perf_event_set_period(event); 1521 + static_call(x86_pmu_set_period)(event); 1509 1522 } 1510 1523 1511 1524 event->hw.state = 0; ··· 1597 1610 * Drain the remaining delta count out of a event 1598 1611 * that we are disabling: 1599 1612 */ 1600 - x86_perf_event_update(event); 1613 + static_call(x86_pmu_update)(event); 1601 1614 hwc->state |= PERF_HES_UPTODATE; 1602 1615 } 1603 1616 } ··· 1687 1700 1688 1701 event = cpuc->events[idx]; 1689 1702 1690 - val = x86_perf_event_update(event); 1703 + val = static_call(x86_pmu_update)(event); 1691 1704 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1692 1705 continue; 1693 1706 ··· 1696 1709 */ 
1697 1710 handled++; 1698 1711 1699 - if (!x86_perf_event_set_period(event)) 1712 + if (!static_call(x86_pmu_set_period)(event)) 1700 1713 continue; 1701 1714 1702 1715 perf_sample_data_init(&data, 0, event->hw.last_period); 1703 1716 1704 - if (has_branch_stack(event)) 1717 + if (has_branch_stack(event)) { 1705 1718 data.br_stack = &cpuc->lbr_stack; 1719 + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; 1720 + } 1706 1721 1707 1722 if (perf_event_overflow(event, &data, regs)) 1708 1723 x86_pmu_stop(event, 0); ··· 2012 2023 static_call_update(x86_pmu_del, x86_pmu.del); 2013 2024 static_call_update(x86_pmu_read, x86_pmu.read); 2014 2025 2026 + static_call_update(x86_pmu_set_period, x86_pmu.set_period); 2027 + static_call_update(x86_pmu_update, x86_pmu.update); 2028 + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); 2029 + 2015 2030 static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); 2016 2031 static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); 2017 2032 static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); ··· 2035 2042 2036 2043 static void _x86_pmu_read(struct perf_event *event) 2037 2044 { 2038 - x86_perf_event_update(event); 2045 + static_call(x86_pmu_update)(event); 2039 2046 } 2040 2047 2041 2048 void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, ··· 2141 2148 2142 2149 if (!x86_pmu.guest_get_msrs) 2143 2150 x86_pmu.guest_get_msrs = (void *)&__static_call_return0; 2151 + 2152 + if (!x86_pmu.set_period) 2153 + x86_pmu.set_period = x86_perf_event_set_period; 2154 + 2155 + if (!x86_pmu.update) 2156 + x86_pmu.update = x86_perf_event_update; 2144 2157 2145 2158 x86_pmu_static_call_update(); 2146 2159 ··· 2669 2670 return -EINVAL; 2670 2671 2671 2672 if (value && x86_pmu.limit_period) { 2672 - if (x86_pmu.limit_period(event, value) > value) 2673 + s64 left = value; 2674 + x86_pmu.limit_period(event, &left); 2675 + if (left > value) 2673 2676 return -EINVAL; 
2674 2677 } 2675 2678
+64 -37
arch/x86/events/intel/core.c
··· 2199 2199 u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); 2200 2200 2201 2201 intel_pmu_lbr_enable_all(pmi); 2202 + 2203 + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { 2204 + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); 2205 + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; 2206 + } 2207 + 2202 2208 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 2203 2209 intel_ctrl & ~cpuc->intel_ctrl_guest_mask); 2204 2210 ··· 2317 2311 for (i = 0; i < 4; i++) { 2318 2312 event = cpuc->events[i]; 2319 2313 if (event) 2320 - x86_perf_event_update(event); 2314 + static_call(x86_pmu_update)(event); 2321 2315 } 2322 2316 2323 2317 for (i = 0; i < 4; i++) { ··· 2332 2326 event = cpuc->events[i]; 2333 2327 2334 2328 if (event) { 2335 - x86_perf_event_set_period(event); 2329 + static_call(x86_pmu_set_period)(event); 2336 2330 __x86_pmu_enable_event(&event->hw, 2337 2331 ARCH_PERFMON_EVENTSEL_ENABLE); 2338 2332 } else ··· 2422 2416 2423 2417 static void intel_pmu_disable_fixed(struct perf_event *event) 2424 2418 { 2419 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2425 2420 struct hw_perf_event *hwc = &event->hw; 2426 - u64 ctrl_val, mask; 2427 2421 int idx = hwc->idx; 2422 + u64 mask; 2428 2423 2429 2424 if (is_topdown_idx(idx)) { 2430 2425 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); ··· 2442 2435 intel_clear_masks(event, idx); 2443 2436 2444 2437 mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); 2445 - rdmsrl(hwc->config_base, ctrl_val); 2446 - ctrl_val &= ~mask; 2447 - wrmsrl(hwc->config_base, ctrl_val); 2438 + cpuc->fixed_ctrl_val &= ~mask; 2448 2439 } 2449 2440 2450 2441 static void intel_pmu_disable_event(struct perf_event *event) ··· 2534 2529 2535 2530 return icl_set_topdown_event_period(event); 2536 2531 } 2532 + 2533 + DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); 2537 2534 2538 2535 static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) 2539 2536 { ··· 2687 
2680 return icl_update_topdown_event(event); 2688 2681 } 2689 2682 2683 + DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); 2690 2684 2691 2685 static void intel_pmu_read_topdown_event(struct perf_event *event) 2692 2686 { ··· 2699 2691 return; 2700 2692 2701 2693 perf_pmu_disable(event->pmu); 2702 - x86_pmu.update_topdown_event(event); 2694 + static_call(intel_pmu_update_topdown_event)(event); 2703 2695 perf_pmu_enable(event->pmu); 2704 2696 } 2705 2697 ··· 2707 2699 { 2708 2700 if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) 2709 2701 intel_pmu_auto_reload_read(event); 2710 - else if (is_topdown_count(event) && x86_pmu.update_topdown_event) 2702 + else if (is_topdown_count(event)) 2711 2703 intel_pmu_read_topdown_event(event); 2712 2704 else 2713 2705 x86_perf_event_update(event); ··· 2715 2707 2716 2708 static void intel_pmu_enable_fixed(struct perf_event *event) 2717 2709 { 2710 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2718 2711 struct hw_perf_event *hwc = &event->hw; 2719 - u64 ctrl_val, mask, bits = 0; 2712 + u64 mask, bits = 0; 2720 2713 int idx = hwc->idx; 2721 2714 2722 2715 if (is_topdown_idx(idx)) { ··· 2761 2752 mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); 2762 2753 } 2763 2754 2764 - rdmsrl(hwc->config_base, ctrl_val); 2765 - ctrl_val &= ~mask; 2766 - ctrl_val |= bits; 2767 - wrmsrl(hwc->config_base, ctrl_val); 2755 + cpuc->fixed_ctrl_val &= ~mask; 2756 + cpuc->fixed_ctrl_val |= bits; 2768 2757 } 2769 2758 2770 2759 static void intel_pmu_enable_event(struct perf_event *event) ··· 2810 2803 */ 2811 2804 int intel_pmu_save_and_restart(struct perf_event *event) 2812 2805 { 2813 - x86_perf_event_update(event); 2806 + static_call(x86_pmu_update)(event); 2814 2807 /* 2815 2808 * For a checkpointed counter always reset back to 0. 
This 2816 2809 * avoids a situation where the counter overflows, aborts the ··· 2822 2815 wrmsrl(event->hw.event_base, 0); 2823 2816 local64_set(&event->hw.prev_count, 0); 2824 2817 } 2818 + return static_call(x86_pmu_set_period)(event); 2819 + } 2820 + 2821 + static int intel_pmu_set_period(struct perf_event *event) 2822 + { 2823 + if (unlikely(is_topdown_count(event))) 2824 + return static_call(intel_pmu_set_topdown_event_period)(event); 2825 + 2825 2826 return x86_perf_event_set_period(event); 2827 + } 2828 + 2829 + static u64 intel_pmu_update(struct perf_event *event) 2830 + { 2831 + if (unlikely(is_topdown_count(event))) 2832 + return static_call(intel_pmu_update_topdown_event)(event); 2833 + 2834 + return x86_perf_event_update(event); 2826 2835 } 2827 2836 2828 2837 static void intel_pmu_reset(void) ··· 3003 2980 */ 3004 2981 if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { 3005 2982 handled++; 3006 - if (x86_pmu.update_topdown_event) 3007 - x86_pmu.update_topdown_event(NULL); 2983 + static_call(intel_pmu_update_topdown_event)(NULL); 3008 2984 } 3009 2985 3010 2986 /* ··· 3026 3004 3027 3005 perf_sample_data_init(&data, 0, event->hw.last_period); 3028 3006 3029 - if (has_branch_stack(event)) 3007 + if (has_branch_stack(event)) { 3030 3008 data.br_stack = &cpuc->lbr_stack; 3009 + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; 3010 + } 3031 3011 3032 3012 if (perf_event_overflow(event, &data, regs)) 3033 3013 x86_pmu_stop(event, 0); ··· 3877 3853 } 3878 3854 if (x86_pmu.pebs_aliases) 3879 3855 x86_pmu.pebs_aliases(event); 3880 - 3881 - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3882 - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; 3883 3856 } 3884 3857 3885 3858 if (needs_branch_stack(event)) { ··· 4355 4334 * Therefore the effective (average) period matches the requested period, 4356 4335 * despite coarser hardware granularity. 
4357 4336 */ 4358 - static u64 bdw_limit_period(struct perf_event *event, u64 left) 4337 + static void bdw_limit_period(struct perf_event *event, s64 *left) 4359 4338 { 4360 4339 if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == 4361 4340 X86_CONFIG(.event=0xc0, .umask=0x01)) { 4362 - if (left < 128) 4363 - left = 128; 4364 - left &= ~0x3fULL; 4341 + if (*left < 128) 4342 + *left = 128; 4343 + *left &= ~0x3fULL; 4365 4344 } 4366 - return left; 4367 4345 } 4368 4346 4369 - static u64 nhm_limit_period(struct perf_event *event, u64 left) 4347 + static void nhm_limit_period(struct perf_event *event, s64 *left) 4370 4348 { 4371 - return max(left, 32ULL); 4349 + *left = max(*left, 32LL); 4372 4350 } 4373 4351 4374 - static u64 spr_limit_period(struct perf_event *event, u64 left) 4352 + static void spr_limit_period(struct perf_event *event, s64 *left) 4375 4353 { 4376 4354 if (event->attr.precise_ip == 3) 4377 - return max(left, 128ULL); 4378 - 4379 - return left; 4355 + *left = max(*left, 128LL); 4380 4356 } 4381 4357 4382 4358 PMU_FORMAT_ATTR(event, "config:0-7" ); ··· 4812 4794 .add = intel_pmu_add_event, 4813 4795 .del = intel_pmu_del_event, 4814 4796 .read = intel_pmu_read_event, 4797 + .set_period = intel_pmu_set_period, 4798 + .update = intel_pmu_update, 4815 4799 .hw_config = intel_pmu_hw_config, 4816 4800 .schedule_events = x86_schedule_events, 4817 4801 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, ··· 6332 6312 x86_pmu.lbr_pt_coexist = true; 6333 6313 intel_pmu_pebs_data_source_skl(pmem); 6334 6314 x86_pmu.num_topdown_events = 4; 6335 - x86_pmu.update_topdown_event = icl_update_topdown_event; 6336 - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; 6315 + static_call_update(intel_pmu_update_topdown_event, 6316 + &icl_update_topdown_event); 6317 + static_call_update(intel_pmu_set_topdown_event_period, 6318 + &icl_set_topdown_event_period); 6337 6319 pr_cont("Icelake events, "); 6338 6320 name = "icelake"; 6339 6321 break; ··· 6370 6348 
x86_pmu.lbr_pt_coexist = true; 6371 6349 intel_pmu_pebs_data_source_skl(pmem); 6372 6350 x86_pmu.num_topdown_events = 8; 6373 - x86_pmu.update_topdown_event = icl_update_topdown_event; 6374 - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; 6351 + static_call_update(intel_pmu_update_topdown_event, 6352 + &icl_update_topdown_event); 6353 + static_call_update(intel_pmu_set_topdown_event_period, 6354 + &icl_set_topdown_event_period); 6375 6355 pr_cont("Sapphire Rapids events, "); 6376 6356 name = "sapphire_rapids"; 6377 6357 break; ··· 6382 6358 case INTEL_FAM6_ALDERLAKE_L: 6383 6359 case INTEL_FAM6_RAPTORLAKE: 6384 6360 case INTEL_FAM6_RAPTORLAKE_P: 6361 + case INTEL_FAM6_RAPTORLAKE_S: 6385 6362 /* 6386 6363 * Alder Lake has 2 types of CPU, core and atom. 6387 6364 * ··· 6407 6382 intel_pmu_pebs_data_source_adl(); 6408 6383 x86_pmu.pebs_latency_data = adl_latency_data_small; 6409 6384 x86_pmu.num_topdown_events = 8; 6410 - x86_pmu.update_topdown_event = adl_update_topdown_event; 6411 - x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; 6385 + static_call_update(intel_pmu_update_topdown_event, 6386 + &adl_update_topdown_event); 6387 + static_call_update(intel_pmu_set_topdown_event_period, 6388 + &adl_set_topdown_event_period); 6412 6389 6413 6390 x86_pmu.filter_match = intel_pmu_filter_match; 6414 6391 x86_pmu.get_event_constraints = adl_get_event_constraints;
+1
arch/x86/events/intel/cstate.c
··· 685 685 X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), 686 686 X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), 687 687 X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), 688 + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), 688 689 { }, 689 690 }; 690 691 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
+41 -14
arch/x86/events/intel/ds.c
··· 1540 1540 /* 1541 1541 * Use latency for weight (only avail with PEBS-LL) 1542 1542 */ 1543 - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) 1543 + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { 1544 1544 data->weight.full = pebs->lat; 1545 + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 1546 + } 1545 1547 1546 1548 /* 1547 1549 * data.data_src encodes the data source 1548 1550 */ 1549 - if (sample_type & PERF_SAMPLE_DATA_SRC) 1551 + if (sample_type & PERF_SAMPLE_DATA_SRC) { 1550 1552 data->data_src.val = get_data_src(event, pebs->dse); 1553 + data->sample_flags |= PERF_SAMPLE_DATA_SRC; 1554 + } 1551 1555 1552 1556 /* 1553 1557 * We must however always use iregs for the unwinder to stay sane; the ··· 1559 1555 * previous PMI context or an (I)RET happened between the record and 1560 1556 * PMI. 1561 1557 */ 1562 - if (sample_type & PERF_SAMPLE_CALLCHAIN) 1558 + if (sample_type & PERF_SAMPLE_CALLCHAIN) { 1563 1559 data->callchain = perf_callchain(event, iregs); 1560 + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; 1561 + } 1564 1562 1565 1563 /* 1566 1564 * We use the interrupt regs as a base because the PEBS record does not ··· 1634 1628 1635 1629 1636 1630 if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && 1637 - x86_pmu.intel_cap.pebs_format >= 1) 1631 + x86_pmu.intel_cap.pebs_format >= 1) { 1638 1632 data->addr = pebs->dla; 1633 + data->sample_flags |= PERF_SAMPLE_ADDR; 1634 + } 1639 1635 1640 1636 if (x86_pmu.intel_cap.pebs_format >= 2) { 1641 1637 /* Only set the TSX weight when no memory weight. 
*/ 1642 - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) 1638 + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { 1643 1639 data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); 1644 - 1645 - if (sample_type & PERF_SAMPLE_TRANSACTION) 1640 + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 1641 + } 1642 + if (sample_type & PERF_SAMPLE_TRANSACTION) { 1646 1643 data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, 1647 1644 pebs->ax); 1645 + data->sample_flags |= PERF_SAMPLE_TRANSACTION; 1646 + } 1648 1647 } 1649 1648 1650 1649 /* ··· 1659 1648 * We can only do this for the default trace clock. 1660 1649 */ 1661 1650 if (x86_pmu.intel_cap.pebs_format >= 3 && 1662 - event->attr.use_clockid == 0) 1651 + event->attr.use_clockid == 0) { 1663 1652 data->time = native_sched_clock_from_tsc(pebs->tsc); 1653 + data->sample_flags |= PERF_SAMPLE_TIME; 1654 + } 1664 1655 1665 - if (has_branch_stack(event)) 1656 + if (has_branch_stack(event)) { 1666 1657 data->br_stack = &cpuc->lbr_stack; 1658 + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; 1659 + } 1667 1660 } 1668 1661 1669 1662 static void adaptive_pebs_save_regs(struct pt_regs *regs, ··· 1725 1710 perf_sample_data_init(data, 0, event->hw.last_period); 1726 1711 data->period = event->hw.last_period; 1727 1712 1728 - if (event->attr.use_clockid == 0) 1713 + if (event->attr.use_clockid == 0) { 1729 1714 data->time = native_sched_clock_from_tsc(basic->tsc); 1715 + data->sample_flags |= PERF_SAMPLE_TIME; 1716 + } 1730 1717 1731 1718 /* 1732 1719 * We must however always use iregs for the unwinder to stay sane; the ··· 1736 1719 * previous PMI context or an (I)RET happened between the record and 1737 1720 * PMI. 
1738 1721 */ 1739 - if (sample_type & PERF_SAMPLE_CALLCHAIN) 1722 + if (sample_type & PERF_SAMPLE_CALLCHAIN) { 1740 1723 data->callchain = perf_callchain(event, iregs); 1724 + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; 1725 + } 1741 1726 1742 1727 *regs = *iregs; 1743 1728 /* The ip in basic is EventingIP */ ··· 1790 1771 data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: 1791 1772 intel_get_tsx_weight(meminfo->tsx_tuning); 1792 1773 } 1774 + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 1793 1775 } 1794 1776 1795 - if (sample_type & PERF_SAMPLE_DATA_SRC) 1777 + if (sample_type & PERF_SAMPLE_DATA_SRC) { 1796 1778 data->data_src.val = get_data_src(event, meminfo->aux); 1779 + data->sample_flags |= PERF_SAMPLE_DATA_SRC; 1780 + } 1797 1781 1798 - if (sample_type & PERF_SAMPLE_ADDR_TYPE) 1782 + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { 1799 1783 data->addr = meminfo->address; 1784 + data->sample_flags |= PERF_SAMPLE_ADDR; 1785 + } 1800 1786 1801 - if (sample_type & PERF_SAMPLE_TRANSACTION) 1787 + if (sample_type & PERF_SAMPLE_TRANSACTION) { 1802 1788 data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, 1803 1789 gprs ? gprs->ax : 0); 1790 + data->sample_flags |= PERF_SAMPLE_TRANSACTION; 1791 + } 1804 1792 } 1805 1793 1806 1794 if (format_size & PEBS_DATACFG_XMMS) { ··· 1826 1800 if (has_branch_stack(event)) { 1827 1801 intel_pmu_store_pebs_lbrs(lbr); 1828 1802 data->br_stack = &cpuc->lbr_stack; 1803 + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; 1829 1804 } 1830 1805 } 1831 1806
-273
arch/x86/events/intel/lbr.c
··· 4 4 5 5 #include <asm/perf_event.h> 6 6 #include <asm/msr.h> 7 - #include <asm/insn.h> 8 7 9 8 #include "../perf_event.h" 10 9 ··· 63 64 #define LBR_FROM_FLAG_ABORT BIT_ULL(61) 64 65 65 66 #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) 66 - 67 - /* 68 - * x86control flow change classification 69 - * x86control flow changes include branches, interrupts, traps, faults 70 - */ 71 - enum { 72 - X86_BR_NONE = 0, /* unknown */ 73 - 74 - X86_BR_USER = 1 << 0, /* branch target is user */ 75 - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ 76 - 77 - X86_BR_CALL = 1 << 2, /* call */ 78 - X86_BR_RET = 1 << 3, /* return */ 79 - X86_BR_SYSCALL = 1 << 4, /* syscall */ 80 - X86_BR_SYSRET = 1 << 5, /* syscall return */ 81 - X86_BR_INT = 1 << 6, /* sw interrupt */ 82 - X86_BR_IRET = 1 << 7, /* return from interrupt */ 83 - X86_BR_JCC = 1 << 8, /* conditional */ 84 - X86_BR_JMP = 1 << 9, /* jump */ 85 - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ 86 - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ 87 - X86_BR_ABORT = 1 << 12,/* transaction abort */ 88 - X86_BR_IN_TX = 1 << 13,/* in transaction */ 89 - X86_BR_NO_TX = 1 << 14,/* not in transaction */ 90 - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ 91 - X86_BR_CALL_STACK = 1 << 16,/* call stack */ 92 - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ 93 - 94 - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ 95 - 96 - }; 97 - 98 - #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) 99 - #define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) 100 - 101 - #define X86_BR_ANY \ 102 - (X86_BR_CALL |\ 103 - X86_BR_RET |\ 104 - X86_BR_SYSCALL |\ 105 - X86_BR_SYSRET |\ 106 - X86_BR_INT |\ 107 - X86_BR_IRET |\ 108 - X86_BR_JCC |\ 109 - X86_BR_JMP |\ 110 - X86_BR_IRQ |\ 111 - X86_BR_ABORT |\ 112 - X86_BR_IND_CALL |\ 113 - X86_BR_IND_JMP |\ 114 - X86_BR_ZERO_CALL) 115 - 116 - #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) 117 - 118 - #define X86_BR_ANY_CALL \ 119 - (X86_BR_CALL |\ 120 - X86_BR_IND_CALL |\ 
121 - X86_BR_ZERO_CALL |\ 122 - X86_BR_SYSCALL |\ 123 - X86_BR_IRQ |\ 124 - X86_BR_INT) 125 67 126 68 /* 127 69 * Intel LBR_CTL bits ··· 1089 1149 ret = intel_pmu_setup_hw_lbr_filter(event); 1090 1150 1091 1151 return ret; 1092 - } 1093 - 1094 - /* 1095 - * return the type of control flow change at address "from" 1096 - * instruction is not necessarily a branch (in case of interrupt). 1097 - * 1098 - * The branch type returned also includes the priv level of the 1099 - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). 1100 - * 1101 - * If a branch type is unknown OR the instruction cannot be 1102 - * decoded (e.g., text page not present), then X86_BR_NONE is 1103 - * returned. 1104 - */ 1105 - static int branch_type(unsigned long from, unsigned long to, int abort) 1106 - { 1107 - struct insn insn; 1108 - void *addr; 1109 - int bytes_read, bytes_left; 1110 - int ret = X86_BR_NONE; 1111 - int ext, to_plm, from_plm; 1112 - u8 buf[MAX_INSN_SIZE]; 1113 - int is64 = 0; 1114 - 1115 - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; 1116 - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; 1117 - 1118 - /* 1119 - * maybe zero if lbr did not fill up after a reset by the time 1120 - * we get a PMU interrupt 1121 - */ 1122 - if (from == 0 || to == 0) 1123 - return X86_BR_NONE; 1124 - 1125 - if (abort) 1126 - return X86_BR_ABORT | to_plm; 1127 - 1128 - if (from_plm == X86_BR_USER) { 1129 - /* 1130 - * can happen if measuring at the user level only 1131 - * and we interrupt in a kernel thread, e.g., idle. 1132 - */ 1133 - if (!current->mm) 1134 - return X86_BR_NONE; 1135 - 1136 - /* may fail if text not present */ 1137 - bytes_left = copy_from_user_nmi(buf, (void __user *)from, 1138 - MAX_INSN_SIZE); 1139 - bytes_read = MAX_INSN_SIZE - bytes_left; 1140 - if (!bytes_read) 1141 - return X86_BR_NONE; 1142 - 1143 - addr = buf; 1144 - } else { 1145 - /* 1146 - * The LBR logs any address in the IP, even if the IP just 1147 - * faulted. 
This means userspace can control the from address. 1148 - * Ensure we don't blindly read any address by validating it is 1149 - * a known text address. 1150 - */ 1151 - if (kernel_text_address(from)) { 1152 - addr = (void *)from; 1153 - /* 1154 - * Assume we can get the maximum possible size 1155 - * when grabbing kernel data. This is not 1156 - * _strictly_ true since we could possibly be 1157 - * executing up next to a memory hole, but 1158 - * it is very unlikely to be a problem. 1159 - */ 1160 - bytes_read = MAX_INSN_SIZE; 1161 - } else { 1162 - return X86_BR_NONE; 1163 - } 1164 - } 1165 - 1166 - /* 1167 - * decoder needs to know the ABI especially 1168 - * on 64-bit systems running 32-bit apps 1169 - */ 1170 - #ifdef CONFIG_X86_64 1171 - is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); 1172 - #endif 1173 - insn_init(&insn, addr, bytes_read, is64); 1174 - if (insn_get_opcode(&insn)) 1175 - return X86_BR_ABORT; 1176 - 1177 - switch (insn.opcode.bytes[0]) { 1178 - case 0xf: 1179 - switch (insn.opcode.bytes[1]) { 1180 - case 0x05: /* syscall */ 1181 - case 0x34: /* sysenter */ 1182 - ret = X86_BR_SYSCALL; 1183 - break; 1184 - case 0x07: /* sysret */ 1185 - case 0x35: /* sysexit */ 1186 - ret = X86_BR_SYSRET; 1187 - break; 1188 - case 0x80 ... 0x8f: /* conditional */ 1189 - ret = X86_BR_JCC; 1190 - break; 1191 - default: 1192 - ret = X86_BR_NONE; 1193 - } 1194 - break; 1195 - case 0x70 ... 0x7f: /* conditional */ 1196 - ret = X86_BR_JCC; 1197 - break; 1198 - case 0xc2: /* near ret */ 1199 - case 0xc3: /* near ret */ 1200 - case 0xca: /* far ret */ 1201 - case 0xcb: /* far ret */ 1202 - ret = X86_BR_RET; 1203 - break; 1204 - case 0xcf: /* iret */ 1205 - ret = X86_BR_IRET; 1206 - break; 1207 - case 0xcc ... 
0xce: /* int */ 1208 - ret = X86_BR_INT; 1209 - break; 1210 - case 0xe8: /* call near rel */ 1211 - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { 1212 - /* zero length call */ 1213 - ret = X86_BR_ZERO_CALL; 1214 - break; 1215 - } 1216 - fallthrough; 1217 - case 0x9a: /* call far absolute */ 1218 - ret = X86_BR_CALL; 1219 - break; 1220 - case 0xe0 ... 0xe3: /* loop jmp */ 1221 - ret = X86_BR_JCC; 1222 - break; 1223 - case 0xe9 ... 0xeb: /* jmp */ 1224 - ret = X86_BR_JMP; 1225 - break; 1226 - case 0xff: /* call near absolute, call far absolute ind */ 1227 - if (insn_get_modrm(&insn)) 1228 - return X86_BR_ABORT; 1229 - 1230 - ext = (insn.modrm.bytes[0] >> 3) & 0x7; 1231 - switch (ext) { 1232 - case 2: /* near ind call */ 1233 - case 3: /* far ind call */ 1234 - ret = X86_BR_IND_CALL; 1235 - break; 1236 - case 4: 1237 - case 5: 1238 - ret = X86_BR_IND_JMP; 1239 - break; 1240 - } 1241 - break; 1242 - default: 1243 - ret = X86_BR_NONE; 1244 - } 1245 - /* 1246 - * interrupts, traps, faults (and thus ring transition) may 1247 - * occur on any instructions. Thus, to classify them correctly, 1248 - * we need to first look at the from and to priv levels. If they 1249 - * are different and to is in the kernel, then it indicates 1250 - * a ring transition. If the from instruction is not a ring 1251 - * transition instr (syscall, systenter, int), then it means 1252 - * it was a irq, trap or fault. 1253 - * 1254 - * we have no way of detecting kernel to kernel faults. 
1255 - */ 1256 - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL 1257 - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) 1258 - ret = X86_BR_IRQ; 1259 - 1260 - /* 1261 - * branch priv level determined by target as 1262 - * is done by HW when LBR_SELECT is implemented 1263 - */ 1264 - if (ret != X86_BR_NONE) 1265 - ret |= to_plm; 1266 - 1267 - return ret; 1268 - } 1269 - 1270 - #define X86_BR_TYPE_MAP_MAX 16 1271 - 1272 - static int branch_map[X86_BR_TYPE_MAP_MAX] = { 1273 - PERF_BR_CALL, /* X86_BR_CALL */ 1274 - PERF_BR_RET, /* X86_BR_RET */ 1275 - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ 1276 - PERF_BR_SYSRET, /* X86_BR_SYSRET */ 1277 - PERF_BR_UNKNOWN, /* X86_BR_INT */ 1278 - PERF_BR_ERET, /* X86_BR_IRET */ 1279 - PERF_BR_COND, /* X86_BR_JCC */ 1280 - PERF_BR_UNCOND, /* X86_BR_JMP */ 1281 - PERF_BR_IRQ, /* X86_BR_IRQ */ 1282 - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ 1283 - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ 1284 - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ 1285 - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ 1286 - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ 1287 - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ 1288 - PERF_BR_IND, /* X86_BR_IND_JMP */ 1289 - }; 1290 - 1291 - static int 1292 - common_branch_type(int type) 1293 - { 1294 - int i; 1295 - 1296 - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ 1297 - 1298 - if (type) { 1299 - i = __ffs(type); 1300 - if (i < X86_BR_TYPE_MAP_MAX) 1301 - return branch_map[i]; 1302 - } 1303 - 1304 - return PERF_BR_UNKNOWN; 1305 1152 } 1306 1153 1307 1154 enum {
+27 -10
arch/x86/events/intel/p4.c
··· 1006 1006 } 1007 1007 } 1008 1008 1009 + static int p4_pmu_set_period(struct perf_event *event) 1010 + { 1011 + struct hw_perf_event *hwc = &event->hw; 1012 + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); 1013 + int ret; 1014 + 1015 + ret = x86_perf_event_set_period(event); 1016 + 1017 + if (hwc->event_base) { 1018 + /* 1019 + * This handles erratum N15 in intel doc 249199-029, 1020 + * the counter may not be updated correctly on write 1021 + * so we need a second write operation to do the trick 1022 + * (the official workaround didn't work) 1023 + * 1024 + * the former idea is taken from OProfile code 1025 + */ 1026 + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); 1027 + } 1028 + 1029 + return ret; 1030 + } 1031 + 1009 1032 static int p4_pmu_handle_irq(struct pt_regs *regs) 1010 1033 { 1011 1034 struct perf_sample_data data; ··· 1067 1044 /* event overflow for sure */ 1068 1045 perf_sample_data_init(&data, 0, hwc->last_period); 1069 1046 1070 - if (!x86_perf_event_set_period(event)) 1047 + if (!static_call(x86_pmu_set_period)(event)) 1071 1048 continue; 1072 1049 1073 1050 ··· 1339 1316 .enable_all = p4_pmu_enable_all, 1340 1317 .enable = p4_pmu_enable_event, 1341 1318 .disable = p4_pmu_disable_event, 1319 + 1320 + .set_period = p4_pmu_set_period, 1321 + 1342 1322 .eventsel = MSR_P4_BPU_CCCR0, 1343 1323 .perfctr = MSR_P4_BPU_PERFCTR0, 1344 1324 .event_map = p4_pmu_event_map, ··· 1360 1334 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, 1361 1335 .hw_config = p4_hw_config, 1362 1336 .schedule_events = p4_pmu_schedule_events, 1363 - /* 1364 - * This handles erratum N15 in intel doc 249199-029, 1365 - * the counter may not be updated correctly on write 1366 - * so we need a second write operation to do the trick 1367 - * (the official workaround didn't work) 1368 - * 1369 - * the former idea is taken from OProfile code 1370 - */ 1371 - .perfctr_second_write = 1, 1372 1337 1373 1338 .format_attrs = intel_p4_formats_attr, 1374 1339 };
+1
arch/x86/events/intel/uncore.c
··· 1831 1831 X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), 1832 1832 X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), 1833 1833 X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), 1834 + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), 1834 1835 X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), 1835 1836 X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), 1836 1837 {},
+1
arch/x86/events/msr.c
··· 106 106 case INTEL_FAM6_ALDERLAKE_N: 107 107 case INTEL_FAM6_RAPTORLAKE: 108 108 case INTEL_FAM6_RAPTORLAKE_P: 109 + case INTEL_FAM6_RAPTORLAKE_S: 109 110 if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) 110 111 return true; 111 112 break;
+105 -23
arch/x86/events/perf_event.h
··· 64 64 return ((ecode & c->cmask) - c->code) <= (u64)c->size; 65 65 } 66 66 67 + #define PERF_ARCH(name, val) \ 68 + PERF_X86_EVENT_##name = val, 69 + 67 70 /* 68 71 * struct hw_perf_event.flags flags 69 72 */ 70 - #define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ 71 - #define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ 72 - #define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ 73 - #define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ 74 - #define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ 75 - #define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ 76 - #define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ 73 + enum { 74 + #include "perf_event_flags.h" 75 + }; 77 76 78 - #define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ 79 - #define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ 80 - #define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ 81 - #define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ 82 - #define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ 83 - #define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ 84 - #define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ 85 - #define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ 86 - #define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ 87 - #define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ 77 + #undef PERF_ARCH 78 + 79 + #define PERF_ARCH(name, val) \ 80 + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ 81 + PERF_X86_EVENT_##name); 82 + 83 + #include "perf_event_flags.h" 84 + 85 + #undef PERF_ARCH 88 86 89 87 static inline bool is_topdown_count(struct perf_event *event) 90 88 { ··· 269 271 u64 pebs_data_cfg; 270 272 u64 active_pebs_data_cfg; 271 273 int 
pebs_record_size; 274 + 275 + /* Intel Fixed counter configuration */ 276 + u64 fixed_ctrl_val; 277 + u64 active_fixed_ctrl_val; 272 278 273 279 /* 274 280 * Intel LBR bits ··· 747 745 void (*add)(struct perf_event *); 748 746 void (*del)(struct perf_event *); 749 747 void (*read)(struct perf_event *event); 748 + int (*set_period)(struct perf_event *event); 749 + u64 (*update)(struct perf_event *event); 750 750 int (*hw_config)(struct perf_event *event); 751 751 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); 752 752 unsigned eventsel; ··· 784 780 785 781 struct event_constraint *event_constraints; 786 782 struct x86_pmu_quirk *quirks; 787 - int perfctr_second_write; 788 - u64 (*limit_period)(struct perf_event *event, u64 l); 783 + void (*limit_period)(struct perf_event *event, s64 *l); 789 784 790 785 /* PMI handler bits */ 791 786 unsigned int late_ack :1, ··· 892 889 * Intel perf metrics 893 890 */ 894 891 int num_topdown_events; 895 - u64 (*update_topdown_event)(struct perf_event *event); 896 - int (*set_topdown_event_period)(struct perf_event *event); 897 892 898 893 /* 899 894 * perf task context (i.e. 
struct perf_event_context::task_ctx_data) ··· 1045 1044 struct pmu *x86_get_pmu(unsigned int cpu); 1046 1045 extern struct x86_pmu x86_pmu __read_mostly; 1047 1046 1047 + DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); 1048 + DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); 1049 + 1048 1050 static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) 1049 1051 { 1050 1052 if (static_cpu_has(X86_FEATURE_ARCH_LBR)) ··· 1063 1059 } 1064 1060 1065 1061 DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); 1062 + DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1066 1063 1067 1064 int x86_perf_event_set_period(struct perf_event *event); 1068 1065 ··· 1215 1210 regs->ip = ip; 1216 1211 } 1217 1212 1213 + /* 1214 + * x86control flow change classification 1215 + * x86control flow changes include branches, interrupts, traps, faults 1216 + */ 1217 + enum { 1218 + X86_BR_NONE = 0, /* unknown */ 1219 + 1220 + X86_BR_USER = 1 << 0, /* branch target is user */ 1221 + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ 1222 + 1223 + X86_BR_CALL = 1 << 2, /* call */ 1224 + X86_BR_RET = 1 << 3, /* return */ 1225 + X86_BR_SYSCALL = 1 << 4, /* syscall */ 1226 + X86_BR_SYSRET = 1 << 5, /* syscall return */ 1227 + X86_BR_INT = 1 << 6, /* sw interrupt */ 1228 + X86_BR_IRET = 1 << 7, /* return from interrupt */ 1229 + X86_BR_JCC = 1 << 8, /* conditional */ 1230 + X86_BR_JMP = 1 << 9, /* jump */ 1231 + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ 1232 + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ 1233 + X86_BR_ABORT = 1 << 12,/* transaction abort */ 1234 + X86_BR_IN_TX = 1 << 13,/* in transaction */ 1235 + X86_BR_NO_TX = 1 << 14,/* not in transaction */ 1236 + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ 1237 + X86_BR_CALL_STACK = 1 << 16,/* call stack */ 1238 + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ 1239 + 1240 + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ 1241 + 1242 + }; 1243 + 1244 + #define 
X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) 1245 + #define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) 1246 + 1247 + #define X86_BR_ANY \ 1248 + (X86_BR_CALL |\ 1249 + X86_BR_RET |\ 1250 + X86_BR_SYSCALL |\ 1251 + X86_BR_SYSRET |\ 1252 + X86_BR_INT |\ 1253 + X86_BR_IRET |\ 1254 + X86_BR_JCC |\ 1255 + X86_BR_JMP |\ 1256 + X86_BR_IRQ |\ 1257 + X86_BR_ABORT |\ 1258 + X86_BR_IND_CALL |\ 1259 + X86_BR_IND_JMP |\ 1260 + X86_BR_ZERO_CALL) 1261 + 1262 + #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) 1263 + 1264 + #define X86_BR_ANY_CALL \ 1265 + (X86_BR_CALL |\ 1266 + X86_BR_IND_CALL |\ 1267 + X86_BR_ZERO_CALL |\ 1268 + X86_BR_SYSCALL |\ 1269 + X86_BR_IRQ |\ 1270 + X86_BR_INT) 1271 + 1272 + int common_branch_type(int type); 1273 + int branch_type(unsigned long from, unsigned long to, int abort); 1274 + int branch_type_fused(unsigned long from, unsigned long to, int abort, 1275 + int *offset); 1276 + 1218 1277 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); 1219 1278 ssize_t intel_event_sysfs_show(char *page, u64 config); 1220 1279 ··· 1301 1232 1302 1233 int amd_pmu_init(void); 1303 1234 1235 + int amd_pmu_lbr_init(void); 1236 + void amd_pmu_lbr_reset(void); 1237 + void amd_pmu_lbr_read(void); 1238 + void amd_pmu_lbr_add(struct perf_event *event); 1239 + void amd_pmu_lbr_del(struct perf_event *event); 1240 + void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); 1241 + void amd_pmu_lbr_enable_all(void); 1242 + void amd_pmu_lbr_disable_all(void); 1243 + int amd_pmu_lbr_hw_config(struct perf_event *event); 1244 + 1304 1245 #ifdef CONFIG_PERF_EVENTS_AMD_BRS 1246 + 1247 + #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ 1248 + 1305 1249 int amd_brs_init(void); 1306 1250 void amd_brs_disable(void); 1307 1251 void amd_brs_enable(void); ··· 1323 1241 void amd_brs_drain(void); 1324 1242 void amd_brs_lopwr_init(void); 1325 1243 void amd_brs_disable_all(void); 1326 - int amd_brs_setup_filter(struct perf_event *event); 1244 + 
int amd_brs_hw_config(struct perf_event *event); 1327 1245 void amd_brs_reset(void); 1328 1246 1329 1247 static inline void amd_pmu_brs_add(struct perf_event *event) ··· 1359 1277 static inline void amd_brs_drain(void) {} 1360 1278 static inline void amd_brs_lopwr_init(void) {} 1361 1279 static inline void amd_brs_disable_all(void) {} 1362 - static inline int amd_brs_setup_filter(struct perf_event *event) 1280 + static inline int amd_brs_hw_config(struct perf_event *event) 1363 1281 { 1364 1282 return 0; 1365 1283 }
+22
arch/x86/events/perf_event_flags.h
··· 1 + 2 + /* 3 + * struct hw_perf_event.flags flags 4 + */ 5 + PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ 6 + PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ 7 + PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ 8 + PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ 9 + PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ 10 + PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ 11 + PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ 12 + /* 0x00080 */ 13 + PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ 14 + PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ 15 + PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ 16 + PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ 17 + PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ 18 + PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ 19 + PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ 20 + PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ 21 + PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ 22 + PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */
+251
arch/x86/events/utils.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <asm/insn.h> 3 + 4 + #include "perf_event.h" 5 + 6 + static int decode_branch_type(struct insn *insn) 7 + { 8 + int ext; 9 + 10 + if (insn_get_opcode(insn)) 11 + return X86_BR_ABORT; 12 + 13 + switch (insn->opcode.bytes[0]) { 14 + case 0xf: 15 + switch (insn->opcode.bytes[1]) { 16 + case 0x05: /* syscall */ 17 + case 0x34: /* sysenter */ 18 + return X86_BR_SYSCALL; 19 + case 0x07: /* sysret */ 20 + case 0x35: /* sysexit */ 21 + return X86_BR_SYSRET; 22 + case 0x80 ... 0x8f: /* conditional */ 23 + return X86_BR_JCC; 24 + } 25 + return X86_BR_NONE; 26 + case 0x70 ... 0x7f: /* conditional */ 27 + return X86_BR_JCC; 28 + case 0xc2: /* near ret */ 29 + case 0xc3: /* near ret */ 30 + case 0xca: /* far ret */ 31 + case 0xcb: /* far ret */ 32 + return X86_BR_RET; 33 + case 0xcf: /* iret */ 34 + return X86_BR_IRET; 35 + case 0xcc ... 0xce: /* int */ 36 + return X86_BR_INT; 37 + case 0xe8: /* call near rel */ 38 + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { 39 + /* zero length call */ 40 + return X86_BR_ZERO_CALL; 41 + } 42 + fallthrough; 43 + case 0x9a: /* call far absolute */ 44 + return X86_BR_CALL; 45 + case 0xe0 ... 0xe3: /* loop jmp */ 46 + return X86_BR_JCC; 47 + case 0xe9 ... 0xeb: /* jmp */ 48 + return X86_BR_JMP; 49 + case 0xff: /* call near absolute, call far absolute ind */ 50 + if (insn_get_modrm(insn)) 51 + return X86_BR_ABORT; 52 + 53 + ext = (insn->modrm.bytes[0] >> 3) & 0x7; 54 + switch (ext) { 55 + case 2: /* near ind call */ 56 + case 3: /* far ind call */ 57 + return X86_BR_IND_CALL; 58 + case 4: 59 + case 5: 60 + return X86_BR_IND_JMP; 61 + } 62 + return X86_BR_NONE; 63 + } 64 + 65 + return X86_BR_NONE; 66 + } 67 + 68 + /* 69 + * return the type of control flow change at address "from" 70 + * instruction is not necessarily a branch (in case of interrupt). 
71 + * 72 + * The branch type returned also includes the priv level of the 73 + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). 74 + * 75 + * If a branch type is unknown OR the instruction cannot be 76 + * decoded (e.g., text page not present), then X86_BR_NONE is 77 + * returned. 78 + * 79 + * While recording branches, some processors can report the "from" 80 + * address to be that of an instruction preceding the actual branch 81 + * when instruction fusion occurs. If fusion is expected, attempt to 82 + * find the type of the first branch instruction within the next 83 + * MAX_INSN_SIZE bytes and if found, provide the offset between the 84 + * reported "from" address and the actual branch instruction address. 85 + */ 86 + static int get_branch_type(unsigned long from, unsigned long to, int abort, 87 + bool fused, int *offset) 88 + { 89 + struct insn insn; 90 + void *addr; 91 + int bytes_read, bytes_left, insn_offset; 92 + int ret = X86_BR_NONE; 93 + int to_plm, from_plm; 94 + u8 buf[MAX_INSN_SIZE]; 95 + int is64 = 0; 96 + 97 + /* make sure we initialize offset */ 98 + if (offset) 99 + *offset = 0; 100 + 101 + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; 102 + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; 103 + 104 + /* 105 + * maybe zero if lbr did not fill up after a reset by the time 106 + * we get a PMU interrupt 107 + */ 108 + if (from == 0 || to == 0) 109 + return X86_BR_NONE; 110 + 111 + if (abort) 112 + return X86_BR_ABORT | to_plm; 113 + 114 + if (from_plm == X86_BR_USER) { 115 + /* 116 + * can happen if measuring at the user level only 117 + * and we interrupt in a kernel thread, e.g., idle. 
118 + */ 119 + if (!current->mm) 120 + return X86_BR_NONE; 121 + 122 + /* may fail if text not present */ 123 + bytes_left = copy_from_user_nmi(buf, (void __user *)from, 124 + MAX_INSN_SIZE); 125 + bytes_read = MAX_INSN_SIZE - bytes_left; 126 + if (!bytes_read) 127 + return X86_BR_NONE; 128 + 129 + addr = buf; 130 + } else { 131 + /* 132 + * The LBR logs any address in the IP, even if the IP just 133 + * faulted. This means userspace can control the from address. 134 + * Ensure we don't blindly read any address by validating it is 135 + * a known text address. 136 + */ 137 + if (kernel_text_address(from)) { 138 + addr = (void *)from; 139 + /* 140 + * Assume we can get the maximum possible size 141 + * when grabbing kernel data. This is not 142 + * _strictly_ true since we could possibly be 143 + * executing up next to a memory hole, but 144 + * it is very unlikely to be a problem. 145 + */ 146 + bytes_read = MAX_INSN_SIZE; 147 + } else { 148 + return X86_BR_NONE; 149 + } 150 + } 151 + 152 + /* 153 + * decoder needs to know the ABI especially 154 + * on 64-bit systems running 32-bit apps 155 + */ 156 + #ifdef CONFIG_X86_64 157 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); 158 + #endif 159 + insn_init(&insn, addr, bytes_read, is64); 160 + ret = decode_branch_type(&insn); 161 + insn_offset = 0; 162 + 163 + /* Check for the possibility of branch fusion */ 164 + while (fused && ret == X86_BR_NONE) { 165 + /* Check for decoding errors */ 166 + if (insn_get_length(&insn) || !insn.length) 167 + break; 168 + 169 + insn_offset += insn.length; 170 + bytes_read -= insn.length; 171 + if (bytes_read < 0) 172 + break; 173 + 174 + insn_init(&insn, addr + insn_offset, bytes_read, is64); 175 + ret = decode_branch_type(&insn); 176 + } 177 + 178 + if (offset) 179 + *offset = insn_offset; 180 + 181 + /* 182 + * interrupts, traps, faults (and thus ring transition) may 183 + * occur on any instructions. 
Thus, to classify them correctly, 184 + * we need to first look at the from and to priv levels. If they 185 + * are different and to is in the kernel, then it indicates 186 + * a ring transition. If the from instruction is not a ring 187 + * transition instr (syscall, sysenter, int), then it means 188 + * it was an irq, trap or fault. 189 + * 190 + * we have no way of detecting kernel to kernel faults. 191 + */ 192 + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL 193 + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) 194 + ret = X86_BR_IRQ; 195 + 196 + /* 197 + * branch priv level determined by target as 198 + * is done by HW when LBR_SELECT is implemented 199 + */ 200 + if (ret != X86_BR_NONE) 201 + ret |= to_plm; 202 + 203 + return ret; 204 + } 205 + 206 + int branch_type(unsigned long from, unsigned long to, int abort) 207 + { 208 + return get_branch_type(from, to, abort, false, NULL); 209 + } 210 + 211 + int branch_type_fused(unsigned long from, unsigned long to, int abort, 212 + int *offset) 213 + { 214 + return get_branch_type(from, to, abort, true, offset); 215 + } 216 + 217 + #define X86_BR_TYPE_MAP_MAX 16 218 + 219 + static int branch_map[X86_BR_TYPE_MAP_MAX] = { 220 + PERF_BR_CALL, /* X86_BR_CALL */ 221 + PERF_BR_RET, /* X86_BR_RET */ 222 + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ 223 + PERF_BR_SYSRET, /* X86_BR_SYSRET */ 224 + PERF_BR_UNKNOWN, /* X86_BR_INT */ 225 + PERF_BR_ERET, /* X86_BR_IRET */ 226 + PERF_BR_COND, /* X86_BR_JCC */ 227 + PERF_BR_UNCOND, /* X86_BR_JMP */ 228 + PERF_BR_IRQ, /* X86_BR_IRQ */ 229 + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ 230 + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ 231 + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ 232 + PERF_BR_NO_TX, /* X86_BR_NO_TX */ 233 + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ 234 + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ 235 + PERF_BR_IND, /* X86_BR_IND_JMP */ 236 + }; 237 + 238 + int common_branch_type(int type) 239 + { 240 + int i; 241 + 242 + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ 243 + 244 + 
if (type) { 245 + i = __ffs(type); 246 + if (i < X86_BR_TYPE_MAP_MAX) 247 + return branch_map[i]; 248 + } 249 + 250 + return PERF_BR_UNKNOWN; 251 + }
+16
arch/x86/include/asm/amd-ibs.h
··· 6 6 7 7 #include <asm/msr-index.h> 8 8 9 + /* IBS_OP_DATA2 DataSrc */ 10 + #define IBS_DATA_SRC_LOC_CACHE 2 11 + #define IBS_DATA_SRC_DRAM 3 12 + #define IBS_DATA_SRC_REM_CACHE 4 13 + #define IBS_DATA_SRC_IO 7 14 + 15 + /* IBS_OP_DATA2 DataSrc Extension */ 16 + #define IBS_DATA_SRC_EXT_LOC_CACHE 1 17 + #define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 18 + #define IBS_DATA_SRC_EXT_DRAM 3 19 + #define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 20 + #define IBS_DATA_SRC_EXT_PMEM 6 21 + #define IBS_DATA_SRC_EXT_IO 7 22 + #define IBS_DATA_SRC_EXT_EXT_MEM 8 23 + #define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 24 + 9 25 /* 10 26 * IBS Hardware MSRs 11 27 */
+1 -1
arch/x86/include/asm/cpufeatures.h
··· 96 96 #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ 97 97 #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ 98 98 #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ 99 - /* FREE! ( 3*32+17) */ 99 + #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ 100 100 #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ 101 101 #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ 102 102 #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+1 -4
arch/x86/include/asm/hw_breakpoint.h
··· 44 44 /* Total number of available HW breakpoint registers */ 45 45 #define HBP_NUM 4 46 46 47 - static inline int hw_breakpoint_slots(int type) 48 - { 49 - return HBP_NUM; 50 - } 47 + #define hw_breakpoint_slots(type) (HBP_NUM) 51 48 52 49 struct perf_event_attr; 53 50 struct perf_event;
+5
arch/x86/include/asm/msr-index.h
··· 590 590 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 591 591 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 592 592 593 + /* AMD Last Branch Record MSRs */ 594 + #define MSR_AMD64_LBR_SELECT 0xc000010e 595 + 593 596 /* Fam 17h MSRs */ 594 597 #define MSR_F17H_IRPERF 0xc00000e9 595 598 ··· 763 760 /* AMD Branch Sampling configuration */ 764 761 #define MSR_AMD_DBG_EXTN_CFG 0xc000010f 765 762 #define MSR_AMD_SAMP_BR_FROM 0xc0010300 763 + 764 + #define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) 766 765 767 766 #define MSR_IA32_MPERF 0x000000e7 768 767 #define MSR_IA32_APERF 0x000000e8
+2 -1
arch/x86/include/asm/perf_event.h
··· 207 207 struct { 208 208 /* Number of Core Performance Counters */ 209 209 unsigned int num_core_pmc:4; 210 - unsigned int reserved:6; 210 + /* Number of available LBR Stack Entries */ 211 + unsigned int lbr_v2_stack_sz:6; 211 212 /* Number of Data Fabric Counters */ 212 213 unsigned int num_df_pmc:6; 213 214 } split;
+1
arch/x86/kernel/cpu/scattered.c
··· 45 45 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 46 46 { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, 47 47 { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, 48 + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, 48 49 { 0, 0, 0, 0, 0 } 49 50 }; 50 51
+3 -1
drivers/perf/arm_spe_pmu.c
··· 44 44 * This allows us to perform the check, i.e, perfmon_capable(), 45 45 * in the context of the event owner, once, during the event_init(). 46 46 */ 47 - #define SPE_PMU_HW_FLAGS_CX BIT(0) 47 + #define SPE_PMU_HW_FLAGS_CX 0x00001 48 + 49 + static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); 48 50 49 51 static void set_spe_event_has_cx(struct perf_event *event) 50 52 {
+3 -1
include/linux/hw_breakpoint.h
··· 74 74 extern int register_perf_hw_breakpoint(struct perf_event *bp); 75 75 extern void unregister_hw_breakpoint(struct perf_event *bp); 76 76 extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); 77 + extern bool hw_breakpoint_is_used(void); 77 78 78 79 extern int dbg_reserve_bp_slot(struct perf_event *bp); 79 80 extern int dbg_release_bp_slot(struct perf_event *bp); 80 81 extern int reserve_bp_slot(struct perf_event *bp); 81 82 extern void release_bp_slot(struct perf_event *bp); 82 - int hw_breakpoint_weight(struct perf_event *bp); 83 83 int arch_reserve_bp_slot(struct perf_event *bp); 84 84 void arch_release_bp_slot(struct perf_event *bp); 85 85 void arch_unregister_hw_breakpoint(struct perf_event *bp); ··· 121 121 static inline void unregister_hw_breakpoint(struct perf_event *bp) { } 122 122 static inline void 123 123 unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } 124 + static inline bool hw_breakpoint_is_used(void) { return false; } 125 + 124 126 static inline int 125 127 reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } 126 128 static inline void release_bp_slot(struct perf_event *bp) { }
+6
include/linux/percpu-rwsem.h
··· 121 121 preempt_enable(); 122 122 } 123 123 124 + extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); 124 125 extern void percpu_down_write(struct percpu_rw_semaphore *); 125 126 extern void percpu_up_write(struct percpu_rw_semaphore *); 127 + 128 + static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) 129 + { 130 + return atomic_read(&sem->block); 131 + } 126 132 127 133 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, 128 134 const char *, struct lock_class_key *);
+5 -4
include/linux/perf/arm_pmu.h
··· 24 24 /* 25 25 * ARM PMU hw_event flags 26 26 */ 27 - /* Event uses a 64bit counter */ 28 - #define ARMPMU_EVT_64BIT 1 29 - /* Event uses a 47bit counter */ 30 - #define ARMPMU_EVT_47BIT 2 27 + #define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ 28 + #define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ 29 + 30 + static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); 31 + static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); 31 32 32 33 #define HW_OP_UNSUPPORTED 0xFFFF 33 34 #define C(_x) PERF_COUNT_HW_CACHE_##_x
+63 -14
include/linux/perf_event.h
··· 36 36 }; 37 37 38 38 #ifdef CONFIG_HAVE_HW_BREAKPOINT 39 + #include <linux/rhashtable-types.h> 39 40 #include <asm/hw_breakpoint.h> 40 41 #endif 41 42 ··· 61 60 #include <linux/refcount.h> 62 61 #include <linux/security.h> 63 62 #include <linux/static_call.h> 63 + #include <linux/lockdep.h> 64 64 #include <asm/local.h> 65 65 66 66 struct perf_callchain_entry { ··· 139 137 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific 140 138 * usage. 141 139 */ 142 - #define PERF_EVENT_FLAG_ARCH 0x0000ffff 140 + #define PERF_EVENT_FLAG_ARCH 0x000fffff 143 141 #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 142 + 143 + static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); 144 144 145 145 /** 146 146 * struct hw_perf_event - performance event hardware details: ··· 182 178 * creation and event initialization. 183 179 */ 184 180 struct arch_hw_breakpoint info; 185 - struct list_head bp_list; 181 + struct rhlist_head bp_list; 186 182 }; 187 183 #endif 188 184 struct { /* amd_iommu */ ··· 635 631 struct list_head list; 636 632 }; 637 633 634 + /* 635 + * event->sibling_list is modified while holding both ctx->lock and ctx->mutex 636 + * as such iteration must hold either lock. However, since ctx->lock is an IRQ 637 + * safe lock, and is only held by the CPU doing the modification, having IRQs 638 + * disabled is sufficient since it will hold-off the IPIs. 
639 + */ 640 + #ifdef CONFIG_PROVE_LOCKING 641 + #define lockdep_assert_event_ctx(event) \ 642 + WARN_ON_ONCE(__lockdep_enabled && \ 643 + (this_cpu_read(hardirqs_enabled) && \ 644 + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) 645 + #else 646 + #define lockdep_assert_event_ctx(event) 647 + #endif 648 + 638 649 #define for_each_sibling_event(sibling, event) \ 650 + lockdep_assert_event_ctx(event); \ 639 651 if ((event)->group_leader == (event)) \ 640 652 list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) 641 653 ··· 1027 1007 * Fields set by perf_sample_data_init(), group so as to 1028 1008 * minimize the cachelines touched. 1029 1009 */ 1030 - u64 addr; 1031 - struct perf_raw_record *raw; 1032 - struct perf_branch_stack *br_stack; 1010 + u64 sample_flags; 1033 1011 u64 period; 1034 - union perf_sample_weight weight; 1035 - u64 txn; 1036 - union perf_mem_data_src data_src; 1037 1012 1038 1013 /* 1039 1014 * The other fields, optionally {set,used} by 1040 1015 * perf_{prepare,output}_sample(). 
1041 1016 */ 1017 + struct perf_branch_stack *br_stack; 1018 + union perf_sample_weight weight; 1019 + union perf_mem_data_src data_src; 1020 + u64 txn; 1021 + u64 addr; 1022 + struct perf_raw_record *raw; 1023 + 1042 1024 u64 type; 1043 1025 u64 ip; 1044 1026 struct { ··· 1078 1056 u64 addr, u64 period) 1079 1057 { 1080 1058 /* remaining struct members initialized in perf_prepare_sample() */ 1081 - data->addr = addr; 1082 - data->raw = NULL; 1083 - data->br_stack = NULL; 1059 + data->sample_flags = PERF_SAMPLE_PERIOD; 1084 1060 data->period = period; 1085 - data->weight.full = 0; 1086 - data->data_src.val = PERF_MEM_NA; 1087 - data->txn = 0; 1061 + 1062 + if (addr) { 1063 + data->addr = addr; 1064 + data->sample_flags |= PERF_SAMPLE_ADDR; 1065 + } 1088 1066 } 1089 1067 1090 1068 /* ··· 1100 1078 br->abort = 0; 1101 1079 br->cycles = 0; 1102 1080 br->type = 0; 1081 + br->spec = PERF_BR_SPEC_NA; 1103 1082 br->reserved = 0; 1104 1083 } 1105 1084 ··· 1707 1684 } 1708 1685 #endif 1709 1686 1687 + #ifdef CONFIG_PERF_EVENTS 1688 + static inline bool branch_sample_no_flags(const struct perf_event *event) 1689 + { 1690 + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; 1691 + } 1692 + 1693 + static inline bool branch_sample_no_cycles(const struct perf_event *event) 1694 + { 1695 + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; 1696 + } 1697 + 1698 + static inline bool branch_sample_type(const struct perf_event *event) 1699 + { 1700 + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; 1701 + } 1702 + 1703 + static inline bool branch_sample_hw_index(const struct perf_event *event) 1704 + { 1705 + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; 1706 + } 1707 + 1708 + static inline bool branch_sample_priv(const struct perf_event *event) 1709 + { 1710 + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; 1711 + } 1712 + #endif /* CONFIG_PERF_EVENTS */ 1710 1713 #endif /* 
_LINUX_PERF_EVENT_H */
+52 -5
include/uapi/linux/perf_event.h
··· 164 164 PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, 165 165 166 166 PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ 167 - 168 - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ 169 167 }; 170 168 171 169 #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) ··· 202 204 203 205 PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ 204 206 207 + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ 208 + 205 209 PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ 206 210 }; 207 211 ··· 233 233 234 234 PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, 235 235 236 + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, 237 + 236 238 PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, 237 239 }; 238 240 ··· 255 253 PERF_BR_COND_RET = 10, /* conditional function return */ 256 254 PERF_BR_ERET = 11, /* exception return */ 257 255 PERF_BR_IRQ = 12, /* irq */ 256 + PERF_BR_SERROR = 13, /* system error */ 257 + PERF_BR_NO_TX = 14, /* not in transaction */ 258 + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ 258 259 PERF_BR_MAX, 259 260 }; 261 + 262 + /* 263 + * Common branch speculation outcome classification 264 + */ 265 + enum { 266 + PERF_BR_SPEC_NA = 0, /* Not available */ 267 + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ 268 + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ 269 + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ 270 + PERF_BR_SPEC_MAX, 271 + }; 272 + 273 + enum { 274 + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ 275 + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ 276 + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ 277 + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ 278 + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ 279 + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ 280 + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ 281 + 
PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ 282 + PERF_BR_NEW_MAX, 283 + }; 284 + 285 + enum { 286 + PERF_BR_PRIV_UNKNOWN = 0, 287 + PERF_BR_PRIV_USER = 1, 288 + PERF_BR_PRIV_KERNEL = 2, 289 + PERF_BR_PRIV_HV = 3, 290 + }; 291 + 292 + #define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 293 + #define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 294 + #define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 295 + #define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 296 + #define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 260 297 261 298 #define PERF_SAMPLE_BRANCH_PLM_ALL \ 262 299 (PERF_SAMPLE_BRANCH_USER|\ ··· 1336 1295 #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ 1337 1296 #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ 1338 1297 #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ 1339 - /* 5-0xa available */ 1298 + /* 5-0x8 available */ 1299 + #define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ 1300 + #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ 1340 1301 #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ 1341 1302 #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ 1342 1303 #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ ··· 1356 1313 #define PERF_MEM_SNOOP_SHIFT 19 1357 1314 1358 1315 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ 1359 - /* 1 free */ 1316 + #define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ 1360 1317 #define PERF_MEM_SNOOPX_SHIFT 38 1361 1318 1362 1319 /* locked instruction */ ··· 1406 1363 * abort: aborting a hardware transaction 1407 1364 * cycles: cycles from last branch (or 0 if not supported) 1408 1365 * type: branch type 1366 + * spec: branch speculation info (or 0 if not supported) 1409 1367 */ 1410 1368 struct perf_branch_entry { 1411 1369 __u64 from; ··· 1417 1373 abort:1, /* transaction abort */ 1418 1374 cycles:16, /* cycle count to last branch */ 1419 1375 type:4, /* branch type */ 1420 - reserved:40; 1376 + spec:2, /* branch speculation info */ 1377 + new_type:4, /* additional branch type */ 1378 + priv:3, /* privilege level */ 1379 + reserved:31; 1421 1380 
}; 1422 1381 1423 1382 union perf_sample_weight {
+2 -2
kernel/bpf/stackmap.c
··· 338 338 int ret; 339 339 340 340 /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ 341 - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) 341 + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 342 342 return bpf_get_stackid((unsigned long)(ctx->regs), 343 343 (unsigned long) map, flags, 0, 0); 344 344 ··· 506 506 int err = -EINVAL; 507 507 __u64 nr_kernel; 508 508 509 - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) 509 + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) 510 510 return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); 511 511 512 512 if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+1
kernel/events/Makefile
··· 2 2 obj-y := core.o ring_buffer.o callchain.o 3 3 4 4 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 5 + obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o 5 6 obj-$(CONFIG_UPROBES) += uprobes.o
+59 -29
kernel/events/core.c
··· 1468 1468 { 1469 1469 u64 now = perf_clock(); 1470 1470 1471 + lockdep_assert_held(&ctx->lock); 1472 + 1471 1473 if (adv) 1472 1474 ctx->time += now - ctx->timestamp; 1473 1475 ctx->timestamp = now; ··· 2226 2224 static inline int pmu_filter_match(struct perf_event *event) 2227 2225 { 2228 2226 struct perf_event *sibling; 2227 + unsigned long flags; 2228 + int ret = 1; 2229 2229 2230 2230 if (!__pmu_filter_match(event)) 2231 2231 return 0; 2232 2232 2233 + local_irq_save(flags); 2233 2234 for_each_sibling_event(sibling, event) { 2234 - if (!__pmu_filter_match(sibling)) 2235 - return 0; 2235 + if (!__pmu_filter_match(sibling)) { 2236 + ret = 0; 2237 + break; 2238 + } 2236 2239 } 2240 + local_irq_restore(flags); 2237 2241 2238 - return 1; 2242 + return ret; 2239 2243 } 2240 2244 2241 2245 static inline int ··· 6802 6794 6803 6795 static void __perf_event_header__init_id(struct perf_event_header *header, 6804 6796 struct perf_sample_data *data, 6805 - struct perf_event *event) 6797 + struct perf_event *event, 6798 + u64 sample_type) 6806 6799 { 6807 - u64 sample_type = event->attr.sample_type; 6808 - 6809 - data->type = sample_type; 6800 + data->type = event->attr.sample_type; 6810 6801 header->size += event->id_header_size; 6811 6802 6812 6803 if (sample_type & PERF_SAMPLE_TID) { ··· 6834 6827 struct perf_event *event) 6835 6828 { 6836 6829 if (event->attr.sample_id_all) 6837 - __perf_event_header__init_id(header, data, event); 6830 + __perf_event_header__init_id(header, data, event, event->attr.sample_type); 6838 6831 } 6839 6832 6840 6833 static void __perf_event__output_id_sample(struct perf_output_handle *handle, ··· 6983 6976 perf_output_read_one(handle, event, enabled, running); 6984 6977 } 6985 6978 6986 - static inline bool perf_sample_save_hw_index(struct perf_event *event) 6987 - { 6988 - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; 6989 - } 6990 - 6991 6979 void perf_output_sample(struct perf_output_handle *handle, 6992 6980 
struct perf_event_header *header, 6993 6981 struct perf_sample_data *data, ··· 7064 7062 } 7065 7063 7066 7064 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 7067 - if (data->br_stack) { 7065 + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { 7068 7066 size_t size; 7069 7067 7070 7068 size = data->br_stack->nr 7071 7069 * sizeof(struct perf_branch_entry); 7072 7070 7073 7071 perf_output_put(handle, data->br_stack->nr); 7074 - if (perf_sample_save_hw_index(event)) 7072 + if (branch_sample_hw_index(event)) 7075 7073 perf_output_put(handle, data->br_stack->hw_idx); 7076 7074 perf_output_copy(handle, data->br_stack->entries, size); 7077 7075 } else { ··· 7314 7312 struct pt_regs *regs) 7315 7313 { 7316 7314 u64 sample_type = event->attr.sample_type; 7315 + u64 filtered_sample_type; 7317 7316 7318 7317 header->type = PERF_RECORD_SAMPLE; 7319 7318 header->size = sizeof(*header) + event->header_size; ··· 7322 7319 header->misc = 0; 7323 7320 header->misc |= perf_misc_flags(regs); 7324 7321 7325 - __perf_event_header__init_id(header, data, event); 7322 + /* 7323 + * Clear the sample flags that have already been done by the 7324 + * PMU driver. 
7325 + */ 7326 + filtered_sample_type = sample_type & ~data->sample_flags; 7327 + __perf_event_header__init_id(header, data, event, filtered_sample_type); 7326 7328 7327 7329 if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) 7328 7330 data->ip = perf_instruction_pointer(regs); ··· 7335 7327 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 7336 7328 int size = 1; 7337 7329 7338 - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) 7330 + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) 7339 7331 data->callchain = perf_callchain(event, regs); 7340 7332 7341 7333 size += data->callchain->nr; ··· 7347 7339 struct perf_raw_record *raw = data->raw; 7348 7340 int size; 7349 7341 7350 - if (raw) { 7342 + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { 7351 7343 struct perf_raw_frag *frag = &raw->frag; 7352 7344 u32 sum = 0; 7353 7345 ··· 7363 7355 frag->pad = raw->size - sum; 7364 7356 } else { 7365 7357 size = sizeof(u64); 7358 + data->raw = NULL; 7366 7359 } 7367 7360 7368 7361 header->size += size; ··· 7371 7362 7372 7363 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 7373 7364 int size = sizeof(u64); /* nr */ 7374 - if (data->br_stack) { 7375 - if (perf_sample_save_hw_index(event)) 7365 + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { 7366 + if (branch_sample_hw_index(event)) 7376 7367 size += sizeof(u64); 7377 7368 7378 7369 size += data->br_stack->nr ··· 7421 7412 header->size += size; 7422 7413 } 7423 7414 7415 + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) 7416 + data->weight.full = 0; 7417 + 7418 + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) 7419 + data->data_src.val = PERF_MEM_NA; 7420 + 7421 + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) 7422 + data->txn = 0; 7423 + 7424 + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { 7425 + if (filtered_sample_type & PERF_SAMPLE_ADDR) 7426 + data->addr = 0; 7427 + } 7428 + 7424 7429 if (sample_type & PERF_SAMPLE_REGS_INTR) { 7425 7430 /* regs 
dump ABI info */ 7426 7431 int size = sizeof(u64); ··· 7450 7427 header->size += size; 7451 7428 } 7452 7429 7453 - if (sample_type & PERF_SAMPLE_PHYS_ADDR) 7430 + if (sample_type & PERF_SAMPLE_PHYS_ADDR && 7431 + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) 7454 7432 data->phys_addr = perf_virt_to_phys(data->addr); 7455 7433 7456 7434 #ifdef CONFIG_CGROUP_PERF ··· 10022 9998 goto out; 10023 9999 rcu_read_lock(); 10024 10000 prog = READ_ONCE(event->prog); 10025 - if (prog) 10001 + if (prog) { 10002 + if (prog->call_get_stack && 10003 + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && 10004 + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { 10005 + data->callchain = perf_callchain(event, regs); 10006 + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; 10007 + } 10008 + 10026 10009 ret = bpf_prog_run(prog, &ctx); 10010 + } 10027 10011 rcu_read_unlock(); 10028 10012 out: 10029 10013 __this_cpu_dec(bpf_prog_active); ··· 10057 10025 10058 10026 if (event->attr.precise_ip && 10059 10027 prog->call_get_stack && 10060 - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || 10028 + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || 10061 10029 event->attr.exclude_callchain_kernel || 10062 10030 event->attr.exclude_callchain_user)) { 10063 10031 /* ··· 10974 10942 { 10975 10943 struct pmu *pmu = dev_get_drvdata(dev); 10976 10944 10977 - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); 10945 + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); 10978 10946 } 10979 10947 DEVICE_ATTR_RO(nr_addr_filters); 10980 10948 ··· 10985 10953 { 10986 10954 struct pmu *pmu = dev_get_drvdata(dev); 10987 10955 10988 - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 10956 + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); 10989 10957 } 10990 10958 static DEVICE_ATTR_RO(type); 10991 10959 ··· 10996 10964 { 10997 10965 struct pmu *pmu = dev_get_drvdata(dev); 10998 10966 10999 - return snprintf(page, PAGE_SIZE-1, "%d\n", 
pmu->hrtimer_interval_ms); 10967 + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); 11000 10968 } 11001 10969 11002 10970 static DEFINE_MUTEX(mux_interval_mutex); ··· 11750 11718 event->destroy(event); 11751 11719 module_put(pmu->module); 11752 11720 err_ns: 11753 - if (event->ns) 11754 - put_pid_ns(event->ns); 11755 11721 if (event->hw.target) 11756 11722 put_task_struct(event->hw.target); 11757 - kmem_cache_free(perf_event_cache, event); 11723 + call_rcu(&event->rcu_head, free_event_rcu); 11758 11724 11759 11725 return ERR_PTR(err); 11760 11726 }
+493 -157
kernel/events/hw_breakpoint.c
··· 17 17 * This file contains the arch-independent routines. 18 18 */ 19 19 20 + #include <linux/hw_breakpoint.h> 21 + 22 + #include <linux/atomic.h> 23 + #include <linux/bug.h> 24 + #include <linux/cpu.h> 25 + #include <linux/export.h> 26 + #include <linux/init.h> 20 27 #include <linux/irqflags.h> 21 - #include <linux/kallsyms.h> 22 - #include <linux/notifier.h> 23 - #include <linux/kprobes.h> 24 28 #include <linux/kdebug.h> 25 29 #include <linux/kernel.h> 26 - #include <linux/module.h> 30 + #include <linux/mutex.h> 31 + #include <linux/notifier.h> 32 + #include <linux/percpu-rwsem.h> 27 33 #include <linux/percpu.h> 34 + #include <linux/rhashtable.h> 28 35 #include <linux/sched.h> 29 - #include <linux/init.h> 30 36 #include <linux/slab.h> 31 - #include <linux/list.h> 32 - #include <linux/cpu.h> 33 - #include <linux/smp.h> 34 - #include <linux/bug.h> 35 37 36 - #include <linux/hw_breakpoint.h> 37 38 /* 38 - * Constraints data 39 + * Datastructure to track the total uses of N slots across tasks or CPUs; 40 + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. 41 + */ 42 + struct bp_slots_histogram { 43 + #ifdef hw_breakpoint_slots 44 + atomic_t count[hw_breakpoint_slots(0)]; 45 + #else 46 + atomic_t *count; 47 + #endif 48 + }; 49 + 50 + /* 51 + * Per-CPU constraints data. 39 52 */ 40 53 struct bp_cpuinfo { 41 - /* Number of pinned cpu breakpoints in a cpu */ 42 - unsigned int cpu_pinned; 43 - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ 44 - unsigned int *tsk_pinned; 45 - /* Number of non-pinned cpu/task breakpoints in a cpu */ 46 - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ 54 + /* Number of pinned CPU breakpoints in a CPU. */ 55 + unsigned int cpu_pinned; 56 + /* Histogram of pinned task breakpoints in a CPU. 
*/ 57 + struct bp_slots_histogram tsk_pinned; 47 58 }; 48 59 49 60 static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); 50 - static int nr_slots[TYPE_MAX]; 51 61 52 62 static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) 53 63 { 54 64 return per_cpu_ptr(bp_cpuinfo + type, cpu); 55 65 } 56 66 67 + /* Number of pinned CPU breakpoints globally. */ 68 + static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; 69 + /* Number of pinned CPU-independent task breakpoints. */ 70 + static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; 71 + 57 72 /* Keep track of the breakpoints attached to tasks */ 58 - static LIST_HEAD(bp_task_head); 59 - 60 - static int constraints_initialized; 61 - 62 - /* Gather the number of total pinned and un-pinned bp in a cpuset */ 63 - struct bp_busy_slots { 64 - unsigned int pinned; 65 - unsigned int flexible; 73 + static struct rhltable task_bps_ht; 74 + static const struct rhashtable_params task_bps_ht_params = { 75 + .head_offset = offsetof(struct hw_perf_event, bp_list), 76 + .key_offset = offsetof(struct hw_perf_event, target), 77 + .key_len = sizeof_field(struct hw_perf_event, target), 78 + .automatic_shrinking = true, 66 79 }; 67 80 68 - /* Serialize accesses to the above constraints */ 69 - static DEFINE_MUTEX(nr_bp_mutex); 81 + static bool constraints_initialized __ro_after_init; 70 82 71 - __weak int hw_breakpoint_weight(struct perf_event *bp) 83 + /* 84 + * Synchronizes accesses to the per-CPU constraints; the locking rules are: 85 + * 86 + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock 87 + * (due to bp_slots_histogram::count being atomic, no update are lost). 88 + * 89 + * 2. Holding a write-lock is required for computations that require a 90 + * stable snapshot of all bp_cpuinfo::tsk_pinned. 91 + * 92 + * 3. In all other cases, non-atomic accesses require the appropriately held 93 + * lock (read-lock for read-only accesses; write-lock for reads/writes). 
94 + */ 95 + DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); 96 + 97 + /* 98 + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since 99 + * rhltable synchronizes concurrent insertions/deletions, independent tasks may 100 + * insert/delete concurrently; therefore, a mutex per task is sufficient. 101 + * 102 + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a 103 + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is 104 + * that hw_breakpoint may contend with per-task perf event list management. The 105 + * assumption is that perf usecases involving hw_breakpoints are very unlikely 106 + * to result in unnecessary contention. 107 + */ 108 + static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) 109 + { 110 + struct task_struct *tsk = bp->hw.target; 111 + 112 + return tsk ? &tsk->perf_event_mutex : NULL; 113 + } 114 + 115 + static struct mutex *bp_constraints_lock(struct perf_event *bp) 116 + { 117 + struct mutex *tsk_mtx = get_task_bps_mutex(bp); 118 + 119 + if (tsk_mtx) { 120 + /* 121 + * Fully analogous to the perf_try_init_event() nesting 122 + * argument in the comment near perf_event_ctx_lock_nested(); 123 + * this child->perf_event_mutex cannot ever deadlock against 124 + * the parent->perf_event_mutex usage from 125 + * perf_event_task_{en,dis}able(). 126 + * 127 + * Specifically, inherited events will never occur on 128 + * ->perf_event_list. 
129 + */ 130 + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); 131 + percpu_down_read(&bp_cpuinfo_sem); 132 + } else { 133 + percpu_down_write(&bp_cpuinfo_sem); 134 + } 135 + 136 + return tsk_mtx; 137 + } 138 + 139 + static void bp_constraints_unlock(struct mutex *tsk_mtx) 140 + { 141 + if (tsk_mtx) { 142 + percpu_up_read(&bp_cpuinfo_sem); 143 + mutex_unlock(tsk_mtx); 144 + } else { 145 + percpu_up_write(&bp_cpuinfo_sem); 146 + } 147 + } 148 + 149 + static bool bp_constraints_is_locked(struct perf_event *bp) 150 + { 151 + struct mutex *tsk_mtx = get_task_bps_mutex(bp); 152 + 153 + return percpu_is_write_locked(&bp_cpuinfo_sem) || 154 + (tsk_mtx ? mutex_is_locked(tsk_mtx) : 155 + percpu_is_read_locked(&bp_cpuinfo_sem)); 156 + } 157 + 158 + static inline void assert_bp_constraints_lock_held(struct perf_event *bp) 159 + { 160 + struct mutex *tsk_mtx = get_task_bps_mutex(bp); 161 + 162 + if (tsk_mtx) 163 + lockdep_assert_held(tsk_mtx); 164 + lockdep_assert_held(&bp_cpuinfo_sem); 165 + } 166 + 167 + #ifdef hw_breakpoint_slots 168 + /* 169 + * Number of breakpoint slots is constant, and the same for all types. 170 + */ 171 + static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); 172 + static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } 173 + static inline int init_breakpoint_slots(void) { return 0; } 174 + #else 175 + /* 176 + * Dynamic number of breakpoint slots. 
177 + */ 178 + static int __nr_bp_slots[TYPE_MAX] __ro_after_init; 179 + 180 + static inline int hw_breakpoint_slots_cached(int type) 181 + { 182 + return __nr_bp_slots[type]; 183 + } 184 + 185 + static __init bool 186 + bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) 187 + { 188 + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); 189 + return hist->count; 190 + } 191 + 192 + static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) 193 + { 194 + kfree(hist->count); 195 + } 196 + 197 + static __init int init_breakpoint_slots(void) 198 + { 199 + int i, cpu, err_cpu; 200 + 201 + for (i = 0; i < TYPE_MAX; i++) 202 + __nr_bp_slots[i] = hw_breakpoint_slots(i); 203 + 204 + for_each_possible_cpu(cpu) { 205 + for (i = 0; i < TYPE_MAX; i++) { 206 + struct bp_cpuinfo *info = get_bp_info(cpu, i); 207 + 208 + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) 209 + goto err; 210 + } 211 + } 212 + for (i = 0; i < TYPE_MAX; i++) { 213 + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) 214 + goto err; 215 + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) 216 + goto err; 217 + } 218 + 219 + return 0; 220 + err: 221 + for_each_possible_cpu(err_cpu) { 222 + for (i = 0; i < TYPE_MAX; i++) 223 + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); 224 + if (err_cpu == cpu) 225 + break; 226 + } 227 + for (i = 0; i < TYPE_MAX; i++) { 228 + bp_slots_histogram_free(&cpu_pinned[i]); 229 + bp_slots_histogram_free(&tsk_pinned_all[i]); 230 + } 231 + 232 + return -ENOMEM; 233 + } 234 + #endif 235 + 236 + static inline void 237 + bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) 238 + { 239 + const int old_idx = old - 1; 240 + const int new_idx = old_idx + val; 241 + 242 + if (old_idx >= 0) 243 + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); 244 + if (new_idx >= 0) 245 + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); 246 + } 247 
+ 248 + static int 249 + bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) 250 + { 251 + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { 252 + const int count = atomic_read(&hist->count[i]); 253 + 254 + /* Catch unexpected writers; we want a stable snapshot. */ 255 + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); 256 + if (count > 0) 257 + return i + 1; 258 + WARN(count < 0, "inconsistent breakpoint slots histogram"); 259 + } 260 + 261 + return 0; 262 + } 263 + 264 + static int 265 + bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, 266 + enum bp_type_idx type) 267 + { 268 + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { 269 + const int count1 = atomic_read(&hist1->count[i]); 270 + const int count2 = atomic_read(&hist2->count[i]); 271 + 272 + /* Catch unexpected writers; we want a stable snapshot. */ 273 + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); 274 + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); 275 + if (count1 + count2 > 0) 276 + return i + 1; 277 + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); 278 + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); 279 + } 280 + 281 + return 0; 282 + } 283 + 284 + #ifndef hw_breakpoint_weight 285 + static inline int hw_breakpoint_weight(struct perf_event *bp) 72 286 { 73 287 return 1; 74 288 } 289 + #endif 75 290 76 291 static inline enum bp_type_idx find_slot_idx(u64 bp_type) 77 292 { ··· 297 82 } 298 83 299 84 /* 300 - * Report the maximum number of pinned breakpoints a task 301 - * have in this cpu 85 + * Return the maximum number of pinned breakpoints a task has in this CPU. 
302 86 */ 303 87 static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) 304 88 { 305 - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; 306 - int i; 89 + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; 307 90 308 - for (i = nr_slots[type] - 1; i >= 0; i--) { 309 - if (tsk_pinned[i] > 0) 310 - return i + 1; 311 - } 312 - 313 - return 0; 91 + /* 92 + * At this point we want to have acquired the bp_cpuinfo_sem as a 93 + * writer to ensure that there are no concurrent writers in 94 + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. 95 + */ 96 + lockdep_assert_held_write(&bp_cpuinfo_sem); 97 + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); 314 98 } 315 99 316 100 /* 317 101 * Count the number of breakpoints of the same type and same task. 318 102 * The given event must be not on the list. 103 + * 104 + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, 105 + * returns a negative value. 319 106 */ 320 107 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) 321 108 { 322 - struct task_struct *tsk = bp->hw.target; 109 + struct rhlist_head *head, *pos; 323 110 struct perf_event *iter; 324 111 int count = 0; 325 112 326 - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 327 - if (iter->hw.target == tsk && 328 - find_slot_idx(iter->attr.bp_type) == type && 329 - (iter->cpu < 0 || cpu == iter->cpu)) 330 - count += hw_breakpoint_weight(iter); 113 + /* 114 + * We need a stable snapshot of the per-task breakpoint list. 
115 + */ 116 + assert_bp_constraints_lock_held(bp); 117 + 118 + rcu_read_lock(); 119 + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); 120 + if (!head) 121 + goto out; 122 + 123 + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { 124 + if (find_slot_idx(iter->attr.bp_type) != type) 125 + continue; 126 + 127 + if (iter->cpu >= 0) { 128 + if (cpu == -1) { 129 + count = -1; 130 + goto out; 131 + } else if (cpu != iter->cpu) 132 + continue; 133 + } 134 + 135 + count += hw_breakpoint_weight(iter); 331 136 } 332 137 138 + out: 139 + rcu_read_unlock(); 333 140 return count; 334 141 } 335 142 ··· 363 126 } 364 127 365 128 /* 366 - * Report the number of pinned/un-pinned breakpoints we have in 367 - * a given cpu (cpu > -1) or in all of them (cpu = -1). 129 + * Returns the max pinned breakpoint slots in a given 130 + * CPU (cpu > -1) or across all of them (cpu = -1). 368 131 */ 369 - static void 370 - fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, 371 - enum bp_type_idx type) 132 + static int 133 + max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) 372 134 { 373 135 const struct cpumask *cpumask = cpumask_of_bp(bp); 136 + int pinned_slots = 0; 374 137 int cpu; 138 + 139 + if (bp->hw.target && bp->cpu < 0) { 140 + int max_pinned = task_bp_pinned(-1, bp, type); 141 + 142 + if (max_pinned >= 0) { 143 + /* 144 + * Fast path: task_bp_pinned() is CPU-independent and 145 + * returns the same value for any CPU. 
146 + */ 147 + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); 148 + return max_pinned; 149 + } 150 + } 375 151 376 152 for_each_cpu(cpu, cpumask) { 377 153 struct bp_cpuinfo *info = get_bp_info(cpu, type); ··· 396 146 else 397 147 nr += task_bp_pinned(cpu, bp, type); 398 148 399 - if (nr > slots->pinned) 400 - slots->pinned = nr; 401 - 402 - nr = info->flexible; 403 - if (nr > slots->flexible) 404 - slots->flexible = nr; 149 + pinned_slots = max(nr, pinned_slots); 405 150 } 406 - } 407 151 408 - /* 409 - * For now, continue to consider flexible as pinned, until we can 410 - * ensure no flexible event can ever be scheduled before a pinned event 411 - * in a same cpu. 412 - */ 413 - static void 414 - fetch_this_slot(struct bp_busy_slots *slots, int weight) 415 - { 416 - slots->pinned += weight; 417 - } 418 - 419 - /* 420 - * Add a pinned breakpoint for the given task in our constraint table 421 - */ 422 - static void toggle_bp_task_slot(struct perf_event *bp, int cpu, 423 - enum bp_type_idx type, int weight) 424 - { 425 - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; 426 - int old_idx, new_idx; 427 - 428 - old_idx = task_bp_pinned(cpu, bp, type) - 1; 429 - new_idx = old_idx + weight; 430 - 431 - if (old_idx >= 0) 432 - tsk_pinned[old_idx]--; 433 - if (new_idx >= 0) 434 - tsk_pinned[new_idx]++; 152 + return pinned_slots; 435 153 } 436 154 437 155 /* 438 156 * Add/remove the given breakpoint in our constraint table 439 157 */ 440 - static void 441 - toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, 442 - int weight) 158 + static int 159 + toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) 443 160 { 444 - const struct cpumask *cpumask = cpumask_of_bp(bp); 445 - int cpu; 161 + int cpu, next_tsk_pinned; 446 162 447 163 if (!enable) 448 164 weight = -weight; 449 165 450 - /* Pinned counter cpu profiling */ 451 166 if (!bp->hw.target) { 452 - get_bp_info(bp->cpu, type)->cpu_pinned += 
weight; 453 - return; 167 + /* 168 + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the 169 + * global histogram. 170 + */ 171 + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); 172 + 173 + lockdep_assert_held_write(&bp_cpuinfo_sem); 174 + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); 175 + info->cpu_pinned += weight; 176 + return 0; 454 177 } 455 178 456 - /* Pinned counter task profiling */ 457 - for_each_cpu(cpu, cpumask) 458 - toggle_bp_task_slot(bp, cpu, type, weight); 179 + /* 180 + * If bp->hw.target, tsk_pinned is only modified, but not used 181 + * otherwise. We can permit concurrent updates as long as there are no 182 + * other uses: having acquired bp_cpuinfo_sem as a reader allows 183 + * concurrent updates here. Uses of tsk_pinned will require acquiring 184 + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. 185 + */ 186 + lockdep_assert_held_read(&bp_cpuinfo_sem); 187 + 188 + /* 189 + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global 190 + * histogram. We need to take care of 4 cases: 191 + * 192 + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only 193 + * exist other task breakpoints targeting all CPUs. In this case we 194 + * can simply update the global slots histogram. 195 + * 196 + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may 197 + * only exist other task breakpoints targeting all CPUs. 198 + * 199 + * a. On enable: remove the existing breakpoints from the global 200 + * slots histogram and use the per-CPU histogram. 201 + * 202 + * b. On disable: re-insert the existing breakpoints into the global 203 + * slots histogram and remove from per-CPU histogram. 204 + * 205 + * 3. Some other existing task breakpoints target specific CPUs. Only 206 + * update the per-CPU slots histogram. 
207 + */ 208 + 209 + if (!enable) { 210 + /* 211 + * Remove before updating histograms so we can determine if this 212 + * was the last task breakpoint for a specific CPU. 213 + */ 214 + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); 215 + 216 + if (ret) 217 + return ret; 218 + } 219 + /* 220 + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. 221 + */ 222 + next_tsk_pinned = task_bp_pinned(-1, bp, type); 223 + 224 + if (next_tsk_pinned >= 0) { 225 + if (bp->cpu < 0) { /* Case 1: fast path */ 226 + if (!enable) 227 + next_tsk_pinned += hw_breakpoint_weight(bp); 228 + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); 229 + } else if (enable) { /* Case 2.a: slow path */ 230 + /* Add existing to per-CPU histograms. */ 231 + for_each_possible_cpu(cpu) { 232 + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, 233 + 0, next_tsk_pinned); 234 + } 235 + /* Add this first CPU-pinned task breakpoint. */ 236 + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, 237 + next_tsk_pinned, weight); 238 + /* Rebalance global task pinned histogram. */ 239 + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, 240 + -next_tsk_pinned); 241 + } else { /* Case 2.b: slow path */ 242 + /* Remove this last CPU-pinned task breakpoint. */ 243 + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, 244 + next_tsk_pinned + hw_breakpoint_weight(bp), weight); 245 + /* Remove all from per-CPU histograms. */ 246 + for_each_possible_cpu(cpu) { 247 + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, 248 + next_tsk_pinned, -next_tsk_pinned); 249 + } 250 + /* Rebalance global task pinned histogram. 
*/ 251 + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); 252 + } 253 + } else { /* Case 3: slow path */ 254 + const struct cpumask *cpumask = cpumask_of_bp(bp); 255 + 256 + for_each_cpu(cpu, cpumask) { 257 + next_tsk_pinned = task_bp_pinned(cpu, bp, type); 258 + if (!enable) 259 + next_tsk_pinned += hw_breakpoint_weight(bp); 260 + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, 261 + next_tsk_pinned, weight); 262 + } 263 + } 264 + 265 + /* 266 + * Readers want a stable snapshot of the per-task breakpoint list. 267 + */ 268 + assert_bp_constraints_lock_held(bp); 459 269 460 270 if (enable) 461 - list_add_tail(&bp->hw.bp_list, &bp_task_head); 462 - else 463 - list_del(&bp->hw.bp_list); 271 + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); 272 + 273 + return 0; 464 274 } 465 275 466 276 __weak int arch_reserve_bp_slot(struct perf_event *bp) ··· 544 234 } 545 235 546 236 /* 547 - * Constraints to check before allowing this new breakpoint counter: 237 + * Constraints to check before allowing this new breakpoint counter. 238 + * 239 + * Note: Flexible breakpoints are currently unimplemented, but outlined in the 240 + * below algorithm for completeness. The implementation treats flexible as 241 + * pinned due to no guarantee that we currently always schedule flexible events 242 + * before a pinned event in a same CPU. 548 243 * 549 244 * == Non-pinned counter == (Considered as pinned for now) 550 245 * ··· 591 276 */ 592 277 static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) 593 278 { 594 - struct bp_busy_slots slots = {0}; 595 279 enum bp_type_idx type; 280 + int max_pinned_slots; 596 281 int weight; 597 282 int ret; 598 283 ··· 608 293 type = find_slot_idx(bp_type); 609 294 weight = hw_breakpoint_weight(bp); 610 295 611 - fetch_bp_busy_slots(&slots, bp, type); 612 - /* 613 - * Simulate the addition of this breakpoint to the constraints 614 - * and see the result. 
615 - */ 616 - fetch_this_slot(&slots, weight); 617 - 618 - /* Flexible counters need to keep at least one slot */ 619 - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) 296 + /* Check if this new breakpoint can be satisfied across all CPUs. */ 297 + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; 298 + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) 620 299 return -ENOSPC; 621 300 622 301 ret = arch_reserve_bp_slot(bp); 623 302 if (ret) 624 303 return ret; 625 304 626 - toggle_bp_slot(bp, true, type, weight); 627 - 628 - return 0; 305 + return toggle_bp_slot(bp, true, type, weight); 629 306 } 630 307 631 308 int reserve_bp_slot(struct perf_event *bp) 632 309 { 633 - int ret; 310 + struct mutex *mtx = bp_constraints_lock(bp); 311 + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); 634 312 635 - mutex_lock(&nr_bp_mutex); 636 - 637 - ret = __reserve_bp_slot(bp, bp->attr.bp_type); 638 - 639 - mutex_unlock(&nr_bp_mutex); 640 - 313 + bp_constraints_unlock(mtx); 641 314 return ret; 642 315 } 643 316 ··· 638 335 639 336 type = find_slot_idx(bp_type); 640 337 weight = hw_breakpoint_weight(bp); 641 - toggle_bp_slot(bp, false, type, weight); 338 + WARN_ON(toggle_bp_slot(bp, false, type, weight)); 642 339 } 643 340 644 341 void release_bp_slot(struct perf_event *bp) 645 342 { 646 - mutex_lock(&nr_bp_mutex); 343 + struct mutex *mtx = bp_constraints_lock(bp); 647 344 648 345 arch_unregister_hw_breakpoint(bp); 649 346 __release_bp_slot(bp, bp->attr.bp_type); 650 - 651 - mutex_unlock(&nr_bp_mutex); 347 + bp_constraints_unlock(mtx); 652 348 } 653 349 654 350 static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) ··· 674 372 675 373 static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) 676 374 { 677 - int ret; 375 + struct mutex *mtx = bp_constraints_lock(bp); 376 + int ret = __modify_bp_slot(bp, old_type, new_type); 678 377 679 - mutex_lock(&nr_bp_mutex); 680 - ret = __modify_bp_slot(bp, old_type, 
new_type); 681 - mutex_unlock(&nr_bp_mutex); 378 + bp_constraints_unlock(mtx); 682 379 return ret; 683 380 } 684 381 ··· 688 387 */ 689 388 int dbg_reserve_bp_slot(struct perf_event *bp) 690 389 { 691 - if (mutex_is_locked(&nr_bp_mutex)) 390 + int ret; 391 + 392 + if (bp_constraints_is_locked(bp)) 692 393 return -1; 693 394 694 - return __reserve_bp_slot(bp, bp->attr.bp_type); 395 + /* Locks aren't held; disable lockdep assert checking. */ 396 + lockdep_off(); 397 + ret = __reserve_bp_slot(bp, bp->attr.bp_type); 398 + lockdep_on(); 399 + 400 + return ret; 695 401 } 696 402 697 403 int dbg_release_bp_slot(struct perf_event *bp) 698 404 { 699 - if (mutex_is_locked(&nr_bp_mutex)) 405 + if (bp_constraints_is_locked(bp)) 700 406 return -1; 701 407 408 + /* Locks aren't held; disable lockdep assert checking. */ 409 + lockdep_off(); 702 410 __release_bp_slot(bp, bp->attr.bp_type); 411 + lockdep_on(); 703 412 704 413 return 0; 705 414 } ··· 915 604 } 916 605 EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); 917 606 607 + /** 608 + * hw_breakpoint_is_used - check if breakpoints are currently used 609 + * 610 + * Returns: true if breakpoints are used, false otherwise. 
611 + */ 612 + bool hw_breakpoint_is_used(void) 613 + { 614 + int cpu; 615 + 616 + if (!constraints_initialized) 617 + return false; 618 + 619 + for_each_possible_cpu(cpu) { 620 + for (int type = 0; type < TYPE_MAX; ++type) { 621 + struct bp_cpuinfo *info = get_bp_info(cpu, type); 622 + 623 + if (info->cpu_pinned) 624 + return true; 625 + 626 + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { 627 + if (atomic_read(&info->tsk_pinned.count[slot])) 628 + return true; 629 + } 630 + } 631 + } 632 + 633 + for (int type = 0; type < TYPE_MAX; ++type) { 634 + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { 635 + /* 636 + * Warn, because if there are CPU pinned counters, 637 + * should never get here; bp_cpuinfo::cpu_pinned should 638 + * be consistent with the global cpu_pinned histogram. 639 + */ 640 + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) 641 + return true; 642 + 643 + if (atomic_read(&tsk_pinned_all[type].count[slot])) 644 + return true; 645 + } 646 + } 647 + 648 + return false; 649 + } 650 + 918 651 static struct notifier_block hw_breakpoint_exceptions_nb = { 919 652 .notifier_call = hw_breakpoint_exceptions_notify, 920 653 /* we need to be notified first */ ··· 1033 678 1034 679 int __init init_hw_breakpoint(void) 1035 680 { 1036 - int cpu, err_cpu; 1037 - int i; 681 + int ret; 1038 682 1039 - for (i = 0; i < TYPE_MAX; i++) 1040 - nr_slots[i] = hw_breakpoint_slots(i); 683 + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); 684 + if (ret) 685 + return ret; 1041 686 1042 - for_each_possible_cpu(cpu) { 1043 - for (i = 0; i < TYPE_MAX; i++) { 1044 - struct bp_cpuinfo *info = get_bp_info(cpu, i); 687 + ret = init_breakpoint_slots(); 688 + if (ret) 689 + return ret; 1045 690 1046 - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), 1047 - GFP_KERNEL); 1048 - if (!info->tsk_pinned) 1049 - goto err_alloc; 1050 - } 1051 - } 1052 - 1053 - constraints_initialized = 1; 691 + constraints_initialized = true; 1054 
692 1055 693 perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); 1056 694 1057 695 return register_die_notifier(&hw_breakpoint_exceptions_nb); 1058 - 1059 - err_alloc: 1060 - for_each_possible_cpu(err_cpu) { 1061 - for (i = 0; i < TYPE_MAX; i++) 1062 - kfree(get_bp_info(err_cpu, i)->tsk_pinned); 1063 - if (err_cpu == cpu) 1064 - break; 1065 - } 1066 - 1067 - return -ENOMEM; 1068 696 } 1069 - 1070 -
+333
kernel/events/hw_breakpoint_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * KUnit test for hw_breakpoint constraints accounting logic. 4 + * 5 + * Copyright (C) 2022, Google LLC. 6 + */ 7 + 8 + #include <kunit/test.h> 9 + #include <linux/cpumask.h> 10 + #include <linux/hw_breakpoint.h> 11 + #include <linux/kthread.h> 12 + #include <linux/perf_event.h> 13 + #include <asm/hw_breakpoint.h> 14 + 15 + #define TEST_REQUIRES_BP_SLOTS(test, slots) \ 16 + do { \ 17 + if ((slots) > get_test_bp_slots()) { \ 18 + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ 19 + get_test_bp_slots()); \ 20 + } \ 21 + } while (0) 22 + 23 + #define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) 24 + 25 + #define MAX_TEST_BREAKPOINTS 512 26 + 27 + static char break_vars[MAX_TEST_BREAKPOINTS]; 28 + static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; 29 + static struct task_struct *__other_task; 30 + 31 + static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) 32 + { 33 + struct perf_event_attr attr = {}; 34 + 35 + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) 36 + return NULL; 37 + 38 + hw_breakpoint_init(&attr); 39 + attr.bp_addr = (unsigned long)&break_vars[idx]; 40 + attr.bp_len = HW_BREAKPOINT_LEN_1; 41 + attr.bp_type = HW_BREAKPOINT_RW; 42 + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); 43 + } 44 + 45 + static void unregister_test_bp(struct perf_event **bp) 46 + { 47 + if (WARN_ON(IS_ERR(*bp))) 48 + return; 49 + if (WARN_ON(!*bp)) 50 + return; 51 + unregister_hw_breakpoint(*bp); 52 + *bp = NULL; 53 + } 54 + 55 + static int get_test_bp_slots(void) 56 + { 57 + static int slots; 58 + 59 + if (!slots) 60 + slots = hw_breakpoint_slots(TYPE_DATA); 61 + 62 + return slots; 63 + } 64 + 65 + static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) 66 + { 67 + struct perf_event *bp = register_test_bp(cpu, tsk, *id); 68 + 69 + KUNIT_ASSERT_NOT_NULL(test, bp); 70 + 
KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); 71 + KUNIT_ASSERT_NULL(test, test_bps[*id]); 72 + test_bps[(*id)++] = bp; 73 + } 74 + 75 + /* 76 + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. 77 + * 78 + * Returns true if this can be called again, continuing at @id. 79 + */ 80 + static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) 81 + { 82 + for (int i = 0; i < get_test_bp_slots() - skip; ++i) 83 + fill_one_bp_slot(test, id, cpu, tsk); 84 + 85 + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; 86 + } 87 + 88 + static int dummy_kthread(void *arg) 89 + { 90 + return 0; 91 + } 92 + 93 + static struct task_struct *get_other_task(struct kunit *test) 94 + { 95 + struct task_struct *tsk; 96 + 97 + if (__other_task) 98 + return __other_task; 99 + 100 + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); 101 + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); 102 + __other_task = tsk; 103 + return __other_task; 104 + } 105 + 106 + static int get_test_cpu(int num) 107 + { 108 + int cpu; 109 + 110 + WARN_ON(num < 0); 111 + 112 + for_each_online_cpu(cpu) { 113 + if (num-- <= 0) 114 + break; 115 + } 116 + 117 + return cpu; 118 + } 119 + 120 + /* ===== Test cases ===== */ 121 + 122 + static void test_one_cpu(struct kunit *test) 123 + { 124 + int idx = 0; 125 + 126 + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); 127 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 128 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 129 + } 130 + 131 + static void test_many_cpus(struct kunit *test) 132 + { 133 + int idx = 0; 134 + int cpu; 135 + 136 + /* Test that CPUs are independent. 
*/ 137 + for_each_online_cpu(cpu) { 138 + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); 139 + 140 + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); 141 + if (!do_continue) 142 + break; 143 + } 144 + } 145 + 146 + static void test_one_task_on_all_cpus(struct kunit *test) 147 + { 148 + int idx = 0; 149 + 150 + fill_bp_slots(test, &idx, -1, current, 0); 151 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 152 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 153 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 154 + /* Remove one and adding back CPU-target should work. */ 155 + unregister_test_bp(&test_bps[0]); 156 + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); 157 + } 158 + 159 + static void test_two_tasks_on_all_cpus(struct kunit *test) 160 + { 161 + int idx = 0; 162 + 163 + /* Test that tasks are independent. */ 164 + fill_bp_slots(test, &idx, -1, current, 0); 165 + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); 166 + 167 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 168 + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); 169 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 170 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); 171 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 172 + /* Remove one from first task and adding back CPU-target should not work. 
*/ 173 + unregister_test_bp(&test_bps[0]); 174 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 175 + } 176 + 177 + static void test_one_task_on_one_cpu(struct kunit *test) 178 + { 179 + int idx = 0; 180 + 181 + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); 182 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 183 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 184 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 185 + /* 186 + * Remove one and adding back CPU-target should work; this case is 187 + * special vs. above because the task's constraints are CPU-dependent. 188 + */ 189 + unregister_test_bp(&test_bps[0]); 190 + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); 191 + } 192 + 193 + static void test_one_task_mixed(struct kunit *test) 194 + { 195 + int idx = 0; 196 + 197 + TEST_REQUIRES_BP_SLOTS(test, 3); 198 + 199 + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); 200 + fill_bp_slots(test, &idx, -1, current, 1); 201 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 202 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 203 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 204 + 205 + /* Transition from CPU-dependent pinned count to CPU-independent. 
*/ 206 + unregister_test_bp(&test_bps[0]); 207 + unregister_test_bp(&test_bps[1]); 208 + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); 209 + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); 210 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 211 + } 212 + 213 + static void test_two_tasks_on_one_cpu(struct kunit *test) 214 + { 215 + int idx = 0; 216 + 217 + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); 218 + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); 219 + 220 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 221 + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); 222 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 223 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); 224 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 225 + /* Can still create breakpoints on some other CPU. */ 226 + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); 227 + } 228 + 229 + static void test_two_tasks_on_one_all_cpus(struct kunit *test) 230 + { 231 + int idx = 0; 232 + 233 + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); 234 + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); 235 + 236 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 237 + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); 238 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 239 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); 240 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 241 + /* Cannot create breakpoints on some other CPU either. 
*/ 242 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); 243 + } 244 + 245 + static void test_task_on_all_and_one_cpu(struct kunit *test) 246 + { 247 + int tsk_on_cpu_idx, cpu_idx; 248 + int idx = 0; 249 + 250 + TEST_REQUIRES_BP_SLOTS(test, 3); 251 + 252 + fill_bp_slots(test, &idx, -1, current, 2); 253 + /* Transitioning from only all CPU breakpoints to mixed. */ 254 + tsk_on_cpu_idx = idx; 255 + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); 256 + fill_one_bp_slot(test, &idx, -1, current); 257 + 258 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 259 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 260 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 261 + 262 + /* We should still be able to use up another CPU's slots. */ 263 + cpu_idx = idx; 264 + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); 265 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); 266 + 267 + /* Transitioning back to task target on all CPUs. */ 268 + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); 269 + /* Still have a CPU target breakpoint in get_test_cpu(1). */ 270 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 271 + /* Remove it and try again. 
*/ 272 + unregister_test_bp(&test_bps[cpu_idx]); 273 + fill_one_bp_slot(test, &idx, -1, current); 274 + 275 + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); 276 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); 277 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); 278 + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); 279 + } 280 + 281 + static struct kunit_case hw_breakpoint_test_cases[] = { 282 + KUNIT_CASE(test_one_cpu), 283 + KUNIT_CASE(test_many_cpus), 284 + KUNIT_CASE(test_one_task_on_all_cpus), 285 + KUNIT_CASE(test_two_tasks_on_all_cpus), 286 + KUNIT_CASE(test_one_task_on_one_cpu), 287 + KUNIT_CASE(test_one_task_mixed), 288 + KUNIT_CASE(test_two_tasks_on_one_cpu), 289 + KUNIT_CASE(test_two_tasks_on_one_all_cpus), 290 + KUNIT_CASE(test_task_on_all_and_one_cpu), 291 + {}, 292 + }; 293 + 294 + static int test_init(struct kunit *test) 295 + { 296 + /* Most test cases want 2 distinct CPUs. */ 297 + if (num_online_cpus() < 2) 298 + return -EINVAL; 299 + 300 + /* Want the system to not use breakpoints elsewhere. */ 301 + if (hw_breakpoint_is_used()) 302 + return -EBUSY; 303 + 304 + return 0; 305 + } 306 + 307 + static void test_exit(struct kunit *test) 308 + { 309 + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { 310 + if (test_bps[i]) 311 + unregister_test_bp(&test_bps[i]); 312 + } 313 + 314 + if (__other_task) { 315 + kthread_stop(__other_task); 316 + __other_task = NULL; 317 + } 318 + 319 + /* Verify that internal state agrees that no breakpoints are in use. */ 320 + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); 321 + } 322 + 323 + static struct kunit_suite hw_breakpoint_test_suite = { 324 + .name = "hw_breakpoint", 325 + .test_cases = hw_breakpoint_test_cases, 326 + .init = test_init, 327 + .exit = test_exit, 328 + }; 329 + 330 + kunit_test_suites(&hw_breakpoint_test_suite); 331 + 332 + MODULE_LICENSE("GPL"); 333 + MODULE_AUTHOR("Marco Elver <elver@google.com>");
+6
kernel/locking/percpu-rwsem.c
··· 192 192 __sum; \ 193 193 }) 194 194 195 + bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) 196 + { 197 + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); 198 + } 199 + EXPORT_SYMBOL_GPL(percpu_is_read_locked); 200 + 195 201 /* 196 202 * Return true if the modular sum of the sem->read_count per-CPU variable is 197 203 * zero. If this sum is zero, then it is stable due to the fact that if any
+3
kernel/trace/bpf_trace.c
··· 1706 1706 if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) 1707 1707 return -EINVAL; 1708 1708 1709 + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) 1710 + return -ENOENT; 1711 + 1709 1712 if (unlikely(!br_stack)) 1710 1713 return -ENOENT; 1711 1714
+10
lib/Kconfig.debug
··· 2556 2556 by the str*() and mem*() family of functions. For testing runtime 2557 2557 traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests. 2558 2558 2559 + config HW_BREAKPOINT_KUNIT_TEST 2560 + bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS 2561 + depends on HAVE_HW_BREAKPOINT 2562 + depends on KUNIT=y 2563 + default KUNIT_ALL_TESTS 2564 + help 2565 + Tests for hw_breakpoint constraints accounting. 2566 + 2567 + If unsure, say N. 2568 + 2559 2569 config TEST_UDELAY 2560 2570 tristate "udelay test driver" 2561 2571 help