Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+1

arch/x86/Kconfig

··· 298 298 select HAVE_SYSCALL_TRACEPOINTS 299 299 select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL 300 300 select HAVE_UNSTABLE_SCHED_CLOCK 301 + select HAVE_UNWIND_USER_FP if X86_64 301 302 select HAVE_USER_RETURN_NOTIFIER 302 303 select HAVE_GENERIC_VDSO 303 304 select VDSO_GETRANDOM if X86_64

+1 -2

arch/x86/boot/compressed/sev-handle-vc.c

··· 29 29 bool insn_has_rep_prefix(struct insn *insn) 30 30 { 31 31 insn_byte_t p; 32 - int i; 33 32 34 33 insn_get_prefixes(insn); 35 34 36 - for_each_insn_prefix(insn, i, p) { 35 + for_each_insn_prefix(insn, p) { 37 36 if (p == 0xf2 || p == 0xf3) 38 37 return true; 39 38 }

+6 -1

arch/x86/events/amd/core.c

··· 763 763 if (!test_bit(idx, cpuc->active_mask)) 764 764 continue; 765 765 766 - amd_pmu_enable_event(cpuc->events[idx]); 766 + /* 767 + * FIXME: cpuc->events[idx] can become NULL in a subtle race 768 + * condition with NMI->throttle->x86_pmu_stop(). 769 + */ 770 + if (cpuc->events[idx]) 771 + amd_pmu_enable_event(cpuc->events[idx]); 767 772 } 768 773 } 769 774

+19 -47

arch/x86/events/core.c

··· 554 554 return m == b; 555 555 } 556 556 557 - int x86_pmu_max_precise(void) 557 + int x86_pmu_max_precise(struct pmu *pmu) 558 558 { 559 559 int precise = 0; 560 560 561 - /* Support for constant skid */ 562 561 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 563 - precise++; 562 + /* arch PEBS */ 563 + if (x86_pmu.arch_pebs) { 564 + precise = 2; 565 + if (hybrid(pmu, arch_pebs_cap).pdists) 566 + precise++; 564 567 568 + return precise; 569 + } 570 + 571 + /* legacy PEBS - support for constant skid */ 572 + precise++; 565 573 /* Support for IP fixup */ 566 574 if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) 567 575 precise++; ··· 577 569 if (x86_pmu.pebs_prec_dist) 578 570 precise++; 579 571 } 572 + 580 573 return precise; 581 574 } 582 575 583 576 int x86_pmu_hw_config(struct perf_event *event) 584 577 { 585 578 if (event->attr.precise_ip) { 586 - int precise = x86_pmu_max_precise(); 579 + int precise = x86_pmu_max_precise(event->pmu); 587 580 588 581 if (event->attr.precise_ip > precise) 589 582 return -EOPNOTSUPP; ··· 1353 1344 hwc->state |= PERF_HES_ARCH; 1354 1345 1355 1346 x86_pmu_stop(event, PERF_EF_UPDATE); 1347 + cpuc->events[hwc->idx] = NULL; 1356 1348 } 1357 1349 1358 1350 /* ··· 1375 1365 * if cpuc->enabled = 0, then no wrmsr as 1376 1366 * per x86_pmu_enable_event() 1377 1367 */ 1368 + cpuc->events[hwc->idx] = event; 1378 1369 x86_pmu_start(event, PERF_EF_RELOAD); 1379 1370 } 1380 1371 cpuc->n_added = 0; ··· 1542 1531 1543 1532 event->hw.state = 0; 1544 1533 1545 - cpuc->events[idx] = event; 1546 1534 __set_bit(idx, cpuc->active_mask); 1547 1535 static_call(x86_pmu_enable)(event); 1548 1536 perf_event_update_userpage(event); ··· 1620 1610 if (test_bit(hwc->idx, cpuc->active_mask)) { 1621 1611 static_call(x86_pmu_disable)(event); 1622 1612 __clear_bit(hwc->idx, cpuc->active_mask); 1623 - cpuc->events[hwc->idx] = NULL; 1624 1613 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); 1625 1614 hwc->state |= PERF_HES_STOPPED; 1626 1615 } ··· 1657 
1648 * Not a TXN, therefore cleanup properly. 1658 1649 */ 1659 1650 x86_pmu_stop(event, PERF_EF_UPDATE); 1651 + cpuc->events[event->hw.idx] = NULL; 1660 1652 1661 1653 for (i = 0; i < cpuc->n_events; i++) { 1662 1654 if (event == cpuc->event_list[i]) ··· 2639 2629 struct device_attribute *attr, 2640 2630 char *buf) 2641 2631 { 2642 - return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); 2632 + struct pmu *pmu = dev_get_drvdata(cdev); 2633 + 2634 + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); 2643 2635 } 2644 2636 2645 2637 static DEVICE_ATTR_RO(max_precise); ··· 2856 2844 2857 2845 return get_desc_base(desc); 2858 2846 } 2859 - 2860 - #ifdef CONFIG_UPROBES 2861 - /* 2862 - * Heuristic-based check if uprobe is installed at the function entry. 2863 - * 2864 - * Under assumption of user code being compiled with frame pointers, 2865 - * `push %rbp/%ebp` is a good indicator that we indeed are. 2866 - * 2867 - * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. 2868 - * If we get this wrong, captured stack trace might have one extra bogus 2869 - * entry, but the rest of stack trace will still be meaningful. 2870 - */ 2871 - static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2872 - { 2873 - struct arch_uprobe *auprobe; 2874 - 2875 - if (!current->utask) 2876 - return false; 2877 - 2878 - auprobe = current->utask->auprobe; 2879 - if (!auprobe) 2880 - return false; 2881 - 2882 - /* push %rbp/%ebp */ 2883 - if (auprobe->insn[0] == 0x55) 2884 - return true; 2885 - 2886 - /* endbr64 (64-bit only) */ 2887 - if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) 2888 - return true; 2889 - 2890 - return false; 2891 - } 2892 - 2893 - #else 2894 - static bool is_uprobe_at_func_entry(struct pt_regs *regs) 2895 - { 2896 - return false; 2897 - } 2898 - #endif /* CONFIG_UPROBES */ 2899 2847 2900 2848 #ifdef CONFIG_IA32_EMULATION 2901 2849

+414 -30

arch/x86/events/intel/core.c

··· 2563 2563 cpuc->fixed_ctrl_val &= ~mask; 2564 2564 } 2565 2565 2566 + static inline void __intel_pmu_update_event_ext(int idx, u64 ext) 2567 + { 2568 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2569 + u32 msr; 2570 + 2571 + if (idx < INTEL_PMC_IDX_FIXED) { 2572 + msr = MSR_IA32_PMC_V6_GP0_CFG_C + 2573 + x86_pmu.addr_offset(idx, false); 2574 + } else { 2575 + msr = MSR_IA32_PMC_V6_FX0_CFG_C + 2576 + x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); 2577 + } 2578 + 2579 + cpuc->cfg_c_val[idx] = ext; 2580 + wrmsrq(msr, ext); 2581 + } 2582 + 2583 + static void intel_pmu_disable_event_ext(struct perf_event *event) 2584 + { 2585 + /* 2586 + * Only clear CFG_C MSR for PEBS counter group events, 2587 + * it avoids the HW counter's value to be added into 2588 + * other PEBS records incorrectly after PEBS counter 2589 + * group events are disabled. 2590 + * 2591 + * For other events, it's unnecessary to clear CFG_C MSRs 2592 + * since CFG_C doesn't take effect if counter is in 2593 + * disabled state. That helps to reduce the WRMSR overhead 2594 + * in context switches. 2595 + */ 2596 + if (!is_pebs_counter_event_group(event)) 2597 + return; 2598 + 2599 + __intel_pmu_update_event_ext(event->hw.idx, 0); 2600 + } 2601 + 2602 + DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext); 2603 + 2566 2604 static void intel_pmu_disable_event(struct perf_event *event) 2567 2605 { 2568 2606 struct hw_perf_event *hwc = &event->hw; ··· 2609 2571 switch (idx) { 2610 2572 case 0 ... INTEL_PMC_IDX_FIXED - 1: 2611 2573 intel_clear_masks(event, idx); 2574 + static_call_cond(intel_pmu_disable_event_ext)(event); 2612 2575 x86_pmu_disable_event(event); 2613 2576 break; 2614 2577 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: 2578 + static_call_cond(intel_pmu_disable_event_ext)(event); 2579 + fallthrough; 2615 2580 case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: 2616 2581 intel_pmu_disable_fixed(event); 2617 2582 break; ··· 2981 2940 2982 2941 DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); 2983 2942 2943 + static void intel_pmu_enable_event_ext(struct perf_event *event) 2944 + { 2945 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2946 + struct hw_perf_event *hwc = &event->hw; 2947 + union arch_pebs_index old, new; 2948 + struct arch_pebs_cap cap; 2949 + u64 ext = 0; 2950 + 2951 + cap = hybrid(cpuc->pmu, arch_pebs_cap); 2952 + 2953 + if (event->attr.precise_ip) { 2954 + u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event); 2955 + 2956 + ext |= ARCH_PEBS_EN; 2957 + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) 2958 + ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD; 2959 + 2960 + if (pebs_data_cfg && cap.caps) { 2961 + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) 2962 + ext |= ARCH_PEBS_AUX & cap.caps; 2963 + 2964 + if (pebs_data_cfg & PEBS_DATACFG_GP) 2965 + ext |= ARCH_PEBS_GPR & cap.caps; 2966 + 2967 + if (pebs_data_cfg & PEBS_DATACFG_XMMS) 2968 + ext |= ARCH_PEBS_VECR_XMM & cap.caps; 2969 + 2970 + if (pebs_data_cfg & PEBS_DATACFG_LBRS) 2971 + ext |= ARCH_PEBS_LBR & cap.caps; 2972 + 2973 + if (pebs_data_cfg & 2974 + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT)) 2975 + ext |= ARCH_PEBS_CNTR_GP & cap.caps; 2976 + 2977 + if (pebs_data_cfg & 2978 + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT)) 2979 + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps; 2980 + 2981 + if (pebs_data_cfg & PEBS_DATACFG_METRICS) 2982 + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps; 2983 + } 2984 + 2985 + if (cpuc->n_pebs == cpuc->n_large_pebs) 2986 + new.thresh = ARCH_PEBS_THRESH_MULTI; 2987 + else 2988 + new.thresh = ARCH_PEBS_THRESH_SINGLE; 2989 + 2990 + rdmsrq(MSR_IA32_PEBS_INDEX, old.whole); 2991 + if (new.thresh != old.thresh || !old.en) { 2992 + if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) { 2993 + /* 2994 + * Large PEBS was enabled. 
2995 + * Drain PEBS buffer before applying the single PEBS. 2996 + */ 2997 + intel_pmu_drain_pebs_buffer(); 2998 + } else { 2999 + new.wr = 0; 3000 + new.full = 0; 3001 + new.en = 1; 3002 + wrmsrq(MSR_IA32_PEBS_INDEX, new.whole); 3003 + } 3004 + } 3005 + } 3006 + 3007 + if (is_pebs_counter_event_group(event)) 3008 + ext |= ARCH_PEBS_CNTR_ALLOW; 3009 + 3010 + if (cpuc->cfg_c_val[hwc->idx] != ext) 3011 + __intel_pmu_update_event_ext(hwc->idx, ext); 3012 + } 3013 + 3014 + DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); 3015 + 2984 3016 static void intel_pmu_enable_event(struct perf_event *event) 2985 3017 { 2986 3018 u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; ··· 3069 2955 enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; 3070 2956 intel_set_masks(event, idx); 3071 2957 static_call_cond(intel_pmu_enable_acr_event)(event); 2958 + static_call_cond(intel_pmu_enable_event_ext)(event); 3072 2959 __x86_pmu_enable_event(hwc, enable_mask); 3073 2960 break; 3074 2961 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: 3075 2962 static_call_cond(intel_pmu_enable_acr_event)(event); 2963 + static_call_cond(intel_pmu_enable_event_ext)(event); 3076 2964 fallthrough; 3077 2965 case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: 3078 2966 intel_pmu_enable_fixed(event); ··· 3332 3216 } 3333 3217 3334 3218 /* 3219 + * Arch PEBS sets bit 54 in the global status register 3220 + */ 3221 + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT, 3222 + (unsigned long *)&status)) { 3223 + handled++; 3224 + static_call(x86_pmu_drain_pebs)(regs, &data); 3225 + 3226 + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && 3227 + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) 3228 + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; 3229 + } 3230 + 3231 + /* 3335 3232 * Intel PT 3336 3233 */ 3337 3234 if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { ··· 3398 3269 * The PEBS buffer has to be drained before handling the A-PMI 3399 3270 */ 3400 3271 if (is_pebs_counter_event_group(event)) 3401 - x86_pmu.drain_pebs(regs, &data); 3272 + static_call(x86_pmu_drain_pebs)(regs, &data); 3402 3273 3403 3274 last_period = event->hw.last_period; 3404 3275 ··· 4158 4029 if (!event->attr.exclude_kernel) 4159 4030 flags &= ~PERF_SAMPLE_REGS_USER; 4160 4031 if (event->attr.sample_regs_user & ~PEBS_GP_REGS) 4161 - flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); 4032 + flags &= ~PERF_SAMPLE_REGS_USER; 4033 + if (event->attr.sample_regs_intr & ~PEBS_GP_REGS) 4034 + flags &= ~PERF_SAMPLE_REGS_INTR; 4162 4035 return flags; 4163 4036 } 4164 4037 ··· 4335 4204 return false; 4336 4205 } 4337 4206 4207 + static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu) 4208 + { 4209 + u64 caps; 4210 + 4211 + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline) 4212 + return true; 4213 + 4214 + caps = hybrid(pmu, arch_pebs_cap).caps; 4215 + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK)) 4216 + return true; 4217 + 4218 + return false; 4219 + } 4220 + 4338 4221 static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, 4339 4222 u64 *cause_mask, int *num) 4340 4223 { ··· 4382 4237 } 4383 
4238 4384 4239 if (event->attr.precise_ip) { 4240 + struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap); 4241 + 4385 4242 if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) 4386 4243 return -EINVAL; 4387 4244 ··· 4397 4250 } 4398 4251 if (x86_pmu.pebs_aliases) 4399 4252 x86_pmu.pebs_aliases(event); 4253 + 4254 + if (x86_pmu.arch_pebs) { 4255 + u64 cntr_mask = hybrid(event->pmu, intel_ctrl) & 4256 + ~GLOBAL_CTRL_EN_PERF_METRICS; 4257 + u64 pebs_mask = event->attr.precise_ip >= 3 ? 4258 + pebs_cap.pdists : pebs_cap.counters; 4259 + if (cntr_mask != pebs_mask) 4260 + event->hw.dyn_constraint &= pebs_mask; 4261 + } 4400 4262 } 4401 4263 4402 4264 if (needs_branch_stack(event)) { ··· 4497 4341 } 4498 4342 4499 4343 if ((event->attr.sample_type & PERF_SAMPLE_READ) && 4500 - (x86_pmu.intel_cap.pebs_format >= 6) && 4501 - x86_pmu.intel_cap.pebs_baseline && 4344 + intel_pmu_has_pebs_counter_group(event->pmu) && 4502 4345 is_sampling_event(event) && 4503 4346 event->attr.precise_ip) 4504 4347 event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; ··· 5367 5212 5368 5213 static int intel_pmu_cpu_prepare(int cpu) 5369 5214 { 5370 - return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); 5215 + int ret; 5216 + 5217 + ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); 5218 + if (ret) 5219 + return ret; 5220 + 5221 + return alloc_arch_pebs_buf_on_cpu(cpu); 5371 5222 } 5372 5223 5373 5224 static void flip_smm_bit(void *data) ··· 5418 5257 u64 fixed_cntr_mask, 5419 5258 u64 intel_ctrl); 5420 5259 5260 + enum dyn_constr_type { 5261 + DYN_CONSTR_NONE, 5262 + DYN_CONSTR_BR_CNTR, 5263 + DYN_CONSTR_ACR_CNTR, 5264 + DYN_CONSTR_ACR_CAUSE, 5265 + DYN_CONSTR_PEBS, 5266 + DYN_CONSTR_PDIST, 5267 + 5268 + DYN_CONSTR_MAX, 5269 + }; 5270 + 5271 + static const char * const dyn_constr_type_name[] = { 5272 + [DYN_CONSTR_NONE] = "a normal event", 5273 + [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", 5274 + [DYN_CONSTR_ACR_CNTR] = 
"an auto-counter reload event", 5275 + [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", 5276 + [DYN_CONSTR_PEBS] = "a PEBS event", 5277 + [DYN_CONSTR_PDIST] = "a PEBS PDIST event", 5278 + }; 5279 + 5280 + static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, 5281 + enum dyn_constr_type type, u64 mask) 5282 + { 5283 + struct event_constraint *c1, *c2; 5284 + int new_weight, check_weight; 5285 + u64 new_mask, check_mask; 5286 + 5287 + for_each_event_constraint(c1, constr) { 5288 + new_mask = c1->idxmsk64 & mask; 5289 + new_weight = hweight64(new_mask); 5290 + 5291 + /* ignore topdown perf metrics event */ 5292 + if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) 5293 + continue; 5294 + 5295 + if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) { 5296 + pr_info("The event 0x%llx is not supported as %s.\n", 5297 + c1->code, dyn_constr_type_name[type]); 5298 + } 5299 + 5300 + if (new_weight <= 1) 5301 + continue; 5302 + 5303 + for_each_event_constraint(c2, c1 + 1) { 5304 + bool check_fail = false; 5305 + 5306 + check_mask = c2->idxmsk64 & mask; 5307 + check_weight = hweight64(check_mask); 5308 + 5309 + if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN || 5310 + !check_weight) 5311 + continue; 5312 + 5313 + /* The same constraints or no overlap */ 5314 + if (new_mask == check_mask || 5315 + (new_mask ^ check_mask) == (new_mask | check_mask)) 5316 + continue; 5317 + 5318 + /* 5319 + * A scheduler issue may be triggered in the following cases. 5320 + * - Two overlap constraints have the same weight. 5321 + * E.g., A constraints: 0x3, B constraints: 0x6 5322 + * event counter failure case 5323 + * B PMC[2:1] 1 5324 + * A PMC[1:0] 0 5325 + * A PMC[1:0] FAIL 5326 + * - Two overlap constraints have different weight. 5327 + * The constraint has a low weight, but has high last bit. 
5328 + * E.g., A constraints: 0x7, B constraints: 0xC 5329 + * event counter failure case 5330 + * B PMC[3:2] 2 5331 + * A PMC[2:0] 0 5332 + * A PMC[2:0] 1 5333 + * A PMC[2:0] FAIL 5334 + */ 5335 + if (new_weight == check_weight) { 5336 + check_fail = true; 5337 + } else if (new_weight < check_weight) { 5338 + if ((new_mask | check_mask) != check_mask && 5339 + fls64(new_mask) > fls64(check_mask)) 5340 + check_fail = true; 5341 + } else { 5342 + if ((new_mask | check_mask) != new_mask && 5343 + fls64(new_mask) < fls64(check_mask)) 5344 + check_fail = true; 5345 + } 5346 + 5347 + if (check_fail) { 5348 + pr_info("The two events 0x%llx and 0x%llx may not be " 5349 + "fully scheduled under some circumstances as " 5350 + "%s.\n", 5351 + c1->code, c2->code, dyn_constr_type_name[type]); 5352 + } 5353 + } 5354 + } 5355 + } 5356 + 5357 + static void intel_pmu_check_dyn_constr(struct pmu *pmu, 5358 + struct event_constraint *constr, 5359 + u64 cntr_mask) 5360 + { 5361 + enum dyn_constr_type i; 5362 + u64 mask; 5363 + 5364 + for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) { 5365 + mask = 0; 5366 + switch (i) { 5367 + case DYN_CONSTR_NONE: 5368 + mask = cntr_mask; 5369 + break; 5370 + case DYN_CONSTR_BR_CNTR: 5371 + if (x86_pmu.flags & PMU_FL_BR_CNTR) 5372 + mask = x86_pmu.lbr_counters; 5373 + break; 5374 + case DYN_CONSTR_ACR_CNTR: 5375 + mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); 5376 + break; 5377 + case DYN_CONSTR_ACR_CAUSE: 5378 + if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64)) 5379 + continue; 5380 + mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); 5381 + break; 5382 + case DYN_CONSTR_PEBS: 5383 + if (x86_pmu.arch_pebs) 5384 + mask = hybrid(pmu, arch_pebs_cap).counters; 5385 + break; 5386 + case DYN_CONSTR_PDIST: 5387 + if (x86_pmu.arch_pebs) 5388 + mask = hybrid(pmu, arch_pebs_cap).pdists; 5389 + break; 5390 + default: 5391 + pr_warn("Unsupported dynamic constraint type %d\n", 
i); 5392 + } 5393 + 5394 + if (mask) 5395 + __intel_pmu_check_dyn_constr(constr, i, mask); 5396 + } 5397 + } 5398 + 5399 + static void intel_pmu_check_event_constraints_all(struct pmu *pmu) 5400 + { 5401 + struct event_constraint *event_constraints = hybrid(pmu, event_constraints); 5402 + struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints); 5403 + u64 cntr_mask = hybrid(pmu, cntr_mask64); 5404 + u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64); 5405 + u64 intel_ctrl = hybrid(pmu, intel_ctrl); 5406 + 5407 + intel_pmu_check_event_constraints(event_constraints, cntr_mask, 5408 + fixed_cntr_mask, intel_ctrl); 5409 + 5410 + if (event_constraints) 5411 + intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask); 5412 + 5413 + if (pebs_constraints) 5414 + intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask); 5415 + } 5416 + 5421 5417 static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); 5422 5418 5423 5419 static inline bool intel_pmu_broken_perf_cap(void) ··· 5587 5269 return false; 5588 5270 } 5589 5271 5272 + static inline void __intel_update_pmu_caps(struct pmu *pmu) 5273 + { 5274 + struct pmu *dest_pmu = pmu ? 
pmu : x86_get_pmu(smp_processor_id()); 5275 + 5276 + if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM) 5277 + dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; 5278 + } 5279 + 5280 + static inline void __intel_update_large_pebs_flags(struct pmu *pmu) 5281 + { 5282 + u64 caps = hybrid(pmu, arch_pebs_cap).caps; 5283 + 5284 + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; 5285 + if (caps & ARCH_PEBS_LBR) 5286 + x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; 5287 + if (caps & ARCH_PEBS_CNTR_MASK) 5288 + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; 5289 + 5290 + if (!(caps & ARCH_PEBS_AUX)) 5291 + x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; 5292 + if (!(caps & ARCH_PEBS_GPR)) { 5293 + x86_pmu.large_pebs_flags &= 5294 + ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER); 5295 + } 5296 + } 5297 + 5298 + #define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) 5299 + 5590 5300 static void update_pmu_cap(struct pmu *pmu) 5591 5301 { 5592 - unsigned int cntr, fixed_cntr, ecx, edx; 5593 - union cpuid35_eax eax; 5594 - union cpuid35_ebx ebx; 5302 + unsigned int eax, ebx, ecx, edx; 5303 + union cpuid35_eax eax_0; 5304 + union cpuid35_ebx ebx_0; 5305 + u64 cntrs_mask = 0; 5306 + u64 pebs_mask = 0; 5307 + u64 pdists_mask = 0; 5595 5308 5596 - cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); 5309 + cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx); 5597 5310 5598 - if (ebx.split.umask2) 5311 + if (ebx_0.split.umask2) 5599 5312 hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; 5600 - if (ebx.split.eq) 5313 + if (ebx_0.split.eq) 5601 5314 hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; 5602 5315 5603 - if (eax.split.cntr_subleaf) { 5316 + if (eax_0.split.cntr_subleaf) { 5604 5317 cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, 5605 - &cntr, &fixed_cntr, &ecx, &edx); 5606 - hybrid(pmu, cntr_mask64) = cntr; 5607 - hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; 5318 + &eax, 
&ebx, &ecx, &edx); 5319 + hybrid(pmu, cntr_mask64) = eax; 5320 + hybrid(pmu, fixed_cntr_mask64) = ebx; 5321 + cntrs_mask = counter_mask(eax, ebx); 5608 5322 } 5609 5323 5610 - if (eax.split.acr_subleaf) { 5324 + if (eax_0.split.acr_subleaf) { 5611 5325 cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, 5612 - &cntr, &fixed_cntr, &ecx, &edx); 5326 + &eax, &ebx, &ecx, &edx); 5613 5327 /* The mask of the counters which can be reloaded */ 5614 - hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); 5615 - 5328 + hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx); 5616 5329 /* The mask of the counters which can cause a reload of reloadable counters */ 5617 - hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); 5330 + hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx); 5331 + } 5332 + 5333 + /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */ 5334 + if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) { 5335 + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF, 5336 + &eax, &ebx, &ecx, &edx); 5337 + hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32; 5338 + 5339 + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF, 5340 + &eax, &ebx, &ecx, &edx); 5341 + pebs_mask = counter_mask(eax, ecx); 5342 + pdists_mask = counter_mask(ebx, edx); 5343 + hybrid(pmu, arch_pebs_cap).counters = pebs_mask; 5344 + hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; 5345 + 5346 + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) { 5347 + x86_pmu.arch_pebs = 0; 5348 + } else { 5349 + __intel_update_pmu_caps(pmu); 5350 + __intel_update_large_pebs_flags(pmu); 5351 + } 5352 + } else { 5353 + WARN_ON(x86_pmu.arch_pebs == 1); 5354 + x86_pmu.arch_pebs = 0; 5618 5355 } 5619 5356 5620 5357 if (!intel_pmu_broken_perf_cap()) { ··· 5692 5319 else 5693 5320 pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; 5694 5321 5695 - intel_pmu_check_event_constraints(pmu->event_constraints, 5696 - 
pmu->cntr_mask64, 5697 - pmu->fixed_cntr_mask64, 5698 - pmu->intel_ctrl); 5322 + intel_pmu_check_event_constraints_all(&pmu->pmu); 5699 5323 5700 5324 intel_pmu_check_extra_regs(pmu->extra_regs); 5701 5325 } ··· 5788 5418 return; 5789 5419 5790 5420 init_debug_store_on_cpu(cpu); 5421 + init_arch_pebs_on_cpu(cpu); 5791 5422 /* 5792 5423 * Deal with CPUs that don't clear their LBRs on power-up, and that may 5793 5424 * even boot with LBRs enabled. ··· 5826 5455 x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; 5827 5456 } 5828 5457 } 5458 + 5459 + __intel_update_pmu_caps(cpuc->pmu); 5829 5460 5830 5461 if (!cpuc->shared_regs) 5831 5462 return; ··· 5888 5515 static void intel_pmu_cpu_dying(int cpu) 5889 5516 { 5890 5517 fini_debug_store_on_cpu(cpu); 5518 + fini_arch_pebs_on_cpu(cpu); 5891 5519 } 5892 5520 5893 5521 void intel_cpuc_finish(struct cpu_hw_events *cpuc) ··· 5909 5535 { 5910 5536 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 5911 5537 5538 + release_arch_pebs_buf_on_cpu(cpu); 5912 5539 intel_cpuc_finish(cpuc); 5913 5540 5914 5541 if (is_hybrid() && cpuc->pmu) ··· 6625 6250 static umode_t 6626 6251 pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) 6627 6252 { 6628 - return x86_pmu.ds_pebs ? attr->mode : 0; 6253 + return intel_pmu_has_pebs() ? attr->mode : 0; 6629 6254 } 6630 6255 6631 6256 static umode_t ··· 7315 6940 * Many features on and after V6 require dynamic constraint, 7316 6941 * e.g., Arch PEBS, ACR. 
7317 6942 */ 7318 - if (version >= 6) 6943 + if (version >= 6) { 7319 6944 x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; 6945 + x86_pmu.late_setup = intel_pmu_late_setup; 6946 + } 6947 + 7320 6948 /* 7321 6949 * Install the hw-cache-events table: 7322 6950 */ ··· 8105 7727 if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) 8106 7728 update_pmu_cap(NULL); 8107 7729 7730 + if (x86_pmu.arch_pebs) { 7731 + static_call_update(intel_pmu_disable_event_ext, 7732 + intel_pmu_disable_event_ext); 7733 + static_call_update(intel_pmu_enable_event_ext, 7734 + intel_pmu_enable_event_ext); 7735 + pr_cont("Architectural PEBS, "); 7736 + } 7737 + 8108 7738 intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, 8109 7739 &x86_pmu.fixed_cntr_mask64, 8110 7740 &x86_pmu.intel_ctrl); ··· 8121 7735 if (x86_pmu.intel_cap.anythread_deprecated) 8122 7736 x86_pmu.format_attrs = intel_arch_formats_attr; 8123 7737 8124 - intel_pmu_check_event_constraints(x86_pmu.event_constraints, 8125 - x86_pmu.cntr_mask64, 8126 - x86_pmu.fixed_cntr_mask64, 8127 - x86_pmu.intel_ctrl); 7738 + intel_pmu_check_event_constraints_all(NULL); 7739 + 8128 7740 /* 8129 7741 * Access LBR MSR may cause #GP under certain circumstances. 8130 7742 * Check all LBR MSR here.

+10 -8

arch/x86/events/intel/cstate.c

··· 41 41 * MSR_CORE_C1_RES: CORE C1 Residency Counter 42 42 * perf code: 0x00 43 43 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL 44 - * MTL,SRF,GRR,ARL,LNL 44 + * MTL,SRF,GRR,ARL,LNL,PTL 45 45 * Scope: Core (each processor core has a MSR) 46 46 * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter 47 47 * perf code: 0x01 ··· 53 53 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, 54 54 * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, 55 55 * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, 56 - * GRR,ARL,LNL 56 + * GRR,ARL,LNL,PTL 57 57 * Scope: Core 58 58 * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter 59 59 * perf code: 0x03 60 60 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, 61 - * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL 61 + * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, 62 + * PTL 62 63 * Scope: Core 63 64 * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. 64 65 * perf code: 0x00 65 66 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, 66 67 * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, 67 - * RPL,SPR,MTL,ARL,LNL,SRF 68 + * RPL,SPR,MTL,ARL,LNL,SRF,PTL 68 69 * Scope: Package (physical package) 69 70 * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 70 71 * perf code: 0x01 71 72 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, 72 73 * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, 73 - * ADL,RPL,MTL,ARL,LNL 74 + * ADL,RPL,MTL,ARL 74 75 * Scope: Package (physical package) 75 76 * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. 76 77 * perf code: 0x02 77 78 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, 78 79 * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, 79 80 * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, 80 - * ARL,LNL 81 + * ARL,LNL,PTL 81 82 * Scope: Package (physical package) 82 83 * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. 83 84 * perf code: 0x03 ··· 97 96 * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
98 97 * perf code: 0x06 99 98 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, 100 - * TNT,RKL,ADL,RPL,MTL,ARL,LNL 99 + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL 101 100 * Scope: Package (physical package) 102 101 * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. 103 102 * perf code: 0x00 ··· 523 522 BIT(PERF_CSTATE_CORE_C7_RES), 524 523 525 524 .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | 526 - BIT(PERF_CSTATE_PKG_C3_RES) | 527 525 BIT(PERF_CSTATE_PKG_C6_RES) | 528 526 BIT(PERF_CSTATE_PKG_C10_RES), 529 527 }; ··· 628 628 X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates), 629 629 X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates), 630 630 X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates), 631 + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates), 631 632 632 633 X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates), 633 634 X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates), ··· 653 652 X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates), 654 653 X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), 655 654 X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), 655 + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), 656 656 { }, 657 657 }; 658 658 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);

+496 -107

arch/x86/events/intel/ds.c

··· 626 626 int max, node = cpu_to_node(cpu); 627 627 void *buffer, *insn_buff, *cea; 628 628 629 - if (!x86_pmu.ds_pebs) 629 + if (!intel_pmu_has_pebs()) 630 630 return 0; 631 631 632 632 buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); 633 633 if (unlikely(!buffer)) 634 634 return -ENOMEM; 635 + 636 + if (x86_pmu.arch_pebs) { 637 + hwev->pebs_vaddr = buffer; 638 + return 0; 639 + } 635 640 636 641 /* 637 642 * HSW+ already provides us the eventing ip; no need to allocate this ··· 650 645 } 651 646 per_cpu(insn_buffer, cpu) = insn_buff; 652 647 } 653 - hwev->ds_pebs_vaddr = buffer; 648 + hwev->pebs_vaddr = buffer; 654 649 /* Update the cpu entry area mapping */ 655 650 cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; 656 651 ds->pebs_buffer_base = (unsigned long) cea; ··· 666 661 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 667 662 void *cea; 668 663 669 - if (!x86_pmu.ds_pebs) 664 + if (!intel_pmu_has_pebs()) 670 665 return; 671 666 672 - kfree(per_cpu(insn_buffer, cpu)); 673 - per_cpu(insn_buffer, cpu) = NULL; 667 + if (x86_pmu.ds_pebs) { 668 + kfree(per_cpu(insn_buffer, cpu)); 669 + per_cpu(insn_buffer, cpu) = NULL; 674 670 675 - /* Clear the fixmap */ 676 - cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; 677 - ds_clear_cea(cea, x86_pmu.pebs_buffer_size); 678 - dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); 679 - hwev->ds_pebs_vaddr = NULL; 671 + /* Clear the fixmap */ 672 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; 673 + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); 674 + } 675 + 676 + dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size); 677 + hwev->pebs_vaddr = NULL; 680 678 } 681 679 682 680 static int alloc_bts_buffer(int cpu) ··· 830 822 init_debug_store_on_cpu(cpu); 831 823 } 832 824 } 825 + } 826 + 827 + inline int alloc_arch_pebs_buf_on_cpu(int cpu) 828 + { 829 + if (!x86_pmu.arch_pebs) 830 + return 0; 831 + 832 + return alloc_pebs_buffer(cpu); 833 + } 834 + 835 + inline 
void release_arch_pebs_buf_on_cpu(int cpu) 836 + { 837 + if (!x86_pmu.arch_pebs) 838 + return; 839 + 840 + release_pebs_buffer(cpu); 841 + } 842 + 843 + void init_arch_pebs_on_cpu(int cpu) 844 + { 845 + struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu); 846 + u64 arch_pebs_base; 847 + 848 + if (!x86_pmu.arch_pebs) 849 + return; 850 + 851 + if (!cpuc->pebs_vaddr) { 852 + WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu); 853 + x86_pmu.pebs_active = 0; 854 + return; 855 + } 856 + 857 + /* 858 + * 4KB-aligned pointer of the output buffer 859 + * (__alloc_pages_node() return page aligned address) 860 + * Buffer Size = 4KB * 2^SIZE 861 + * contiguous physical buffer (__alloc_pages_node() with order) 862 + */ 863 + arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT; 864 + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base, 865 + (u32)(arch_pebs_base >> 32)); 866 + x86_pmu.pebs_active = 1; 867 + } 868 + 869 + inline void fini_arch_pebs_on_cpu(int cpu) 870 + { 871 + if (!x86_pmu.arch_pebs) 872 + return; 873 + 874 + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0); 833 875 } 834 876 835 877 /* ··· 1529 1471 } 1530 1472 } 1531 1473 1474 + u64 intel_get_arch_pebs_data_config(struct perf_event *event) 1475 + { 1476 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1477 + u64 pebs_data_cfg = 0; 1478 + u64 cntr_mask; 1479 + 1480 + if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) 1481 + return 0; 1482 + 1483 + pebs_data_cfg |= pebs_update_adaptive_cfg(event); 1484 + 1485 + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) | 1486 + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) | 1487 + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS; 1488 + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask; 1489 + 1490 + return pebs_data_cfg; 1491 + } 1492 + 1532 1493 void intel_pmu_pebs_add(struct perf_event *event) 1533 1494 { 1534 1495 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); ··· 1609 1532 
intel_pmu_drain_pebs_buffer(); 1610 1533 } 1611 1534 1535 + static void __intel_pmu_pebs_enable(struct perf_event *event) 1536 + { 1537 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1538 + struct hw_perf_event *hwc = &event->hw; 1539 + 1540 + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 1541 + cpuc->pebs_enabled |= 1ULL << hwc->idx; 1542 + } 1543 + 1612 1544 void intel_pmu_pebs_enable(struct perf_event *event) 1613 1545 { 1614 1546 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); ··· 1626 1540 struct debug_store *ds = cpuc->ds; 1627 1541 unsigned int idx = hwc->idx; 1628 1542 1629 - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 1630 - 1631 - cpuc->pebs_enabled |= 1ULL << hwc->idx; 1543 + __intel_pmu_pebs_enable(event); 1632 1544 1633 1545 if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) 1634 1546 cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); ··· 1688 1604 pebs_update_state(needed_cb, cpuc, event, false); 1689 1605 } 1690 1606 1691 - void intel_pmu_pebs_disable(struct perf_event *event) 1607 + static void __intel_pmu_pebs_disable(struct perf_event *event) 1692 1608 { 1693 1609 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1694 1610 struct hw_perf_event *hwc = &event->hw; 1695 1611 1696 1612 intel_pmu_drain_large_pebs(cpuc); 1697 - 1698 1613 cpuc->pebs_enabled &= ~(1ULL << hwc->idx); 1614 + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; 1615 + } 1616 + 1617 + void intel_pmu_pebs_disable(struct perf_event *event) 1618 + { 1619 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1620 + struct hw_perf_event *hwc = &event->hw; 1621 + 1622 + __intel_pmu_pebs_disable(event); 1699 1623 1700 1624 if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && 1701 1625 (x86_pmu.version < 5)) ··· 1715 1623 1716 1624 if (cpuc->enabled) 1717 1625 wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); 1718 - 1719 - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; 1720 1626 } 1721 1627 1722 1628 void intel_pmu_pebs_enable_all(void) ··· 
2150 2060 2151 2061 #define PEBS_LATENCY_MASK 0xffff 2152 2062 2063 + static inline void __setup_perf_sample_data(struct perf_event *event, 2064 + struct pt_regs *iregs, 2065 + struct perf_sample_data *data) 2066 + { 2067 + perf_sample_data_init(data, 0, event->hw.last_period); 2068 + 2069 + /* 2070 + * We must however always use iregs for the unwinder to stay sane; the 2071 + * record BP,SP,IP can point into thin air when the record is from a 2072 + * previous PMI context or an (I)RET happened between the record and 2073 + * PMI. 2074 + */ 2075 + perf_sample_save_callchain(data, event, iregs); 2076 + } 2077 + 2078 + static inline void __setup_pebs_basic_group(struct perf_event *event, 2079 + struct pt_regs *regs, 2080 + struct perf_sample_data *data, 2081 + u64 sample_type, u64 ip, 2082 + u64 tsc, u16 retire) 2083 + { 2084 + /* The ip in basic is EventingIP */ 2085 + set_linear_ip(regs, ip); 2086 + regs->flags = PERF_EFLAGS_EXACT; 2087 + setup_pebs_time(event, data, tsc); 2088 + 2089 + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) 2090 + data->weight.var3_w = retire; 2091 + } 2092 + 2093 + static inline void __setup_pebs_gpr_group(struct perf_event *event, 2094 + struct pt_regs *regs, 2095 + struct pebs_gprs *gprs, 2096 + u64 sample_type) 2097 + { 2098 + if (event->attr.precise_ip < 2) { 2099 + set_linear_ip(regs, gprs->ip); 2100 + regs->flags &= ~PERF_EFLAGS_EXACT; 2101 + } 2102 + 2103 + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) 2104 + adaptive_pebs_save_regs(regs, gprs); 2105 + } 2106 + 2107 + static inline void __setup_pebs_meminfo_group(struct perf_event *event, 2108 + struct perf_sample_data *data, 2109 + u64 sample_type, u64 latency, 2110 + u16 instr_latency, u64 address, 2111 + u64 aux, u64 tsx_tuning, u64 ax) 2112 + { 2113 + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 2114 + u64 tsx_latency = intel_get_tsx_weight(tsx_tuning); 2115 + 2116 + data->weight.var2_w = instr_latency; 2117 + 2118 + /* 2119 + * Although meminfo::latency is 
defined as a u64, 2120 + * only the lower 32 bits include the valid data 2121 + * in practice on Ice Lake and earlier platforms. 2122 + */ 2123 + if (sample_type & PERF_SAMPLE_WEIGHT) 2124 + data->weight.full = latency ?: tsx_latency; 2125 + else 2126 + data->weight.var1_dw = (u32)latency ?: tsx_latency; 2127 + 2128 + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 2129 + } 2130 + 2131 + if (sample_type & PERF_SAMPLE_DATA_SRC) { 2132 + data->data_src.val = get_data_src(event, aux); 2133 + data->sample_flags |= PERF_SAMPLE_DATA_SRC; 2134 + } 2135 + 2136 + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { 2137 + data->addr = address; 2138 + data->sample_flags |= PERF_SAMPLE_ADDR; 2139 + } 2140 + 2141 + if (sample_type & PERF_SAMPLE_TRANSACTION) { 2142 + data->txn = intel_get_tsx_transaction(tsx_tuning, ax); 2143 + data->sample_flags |= PERF_SAMPLE_TRANSACTION; 2144 + } 2145 + } 2146 + 2153 2147 /* 2154 2148 * With adaptive PEBS the layout depends on what fields are configured. 2155 2149 */ ··· 2243 2069 struct pt_regs *regs) 2244 2070 { 2245 2071 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2072 + u64 sample_type = event->attr.sample_type; 2246 2073 struct pebs_basic *basic = __pebs; 2247 2074 void *next_record = basic + 1; 2248 - u64 sample_type, format_group; 2249 2075 struct pebs_meminfo *meminfo = NULL; 2250 2076 struct pebs_gprs *gprs = NULL; 2251 2077 struct x86_perf_regs *perf_regs; 2078 + u64 format_group; 2079 + u16 retire; 2252 2080 2253 2081 if (basic == NULL) 2254 2082 return; ··· 2258 2082 perf_regs = container_of(regs, struct x86_perf_regs, regs); 2259 2083 perf_regs->xmm_regs = NULL; 2260 2084 2261 - sample_type = event->attr.sample_type; 2262 2085 format_group = basic->format_group; 2263 - perf_sample_data_init(data, 0, event->hw.last_period); 2264 2086 2265 - setup_pebs_time(event, data, basic->tsc); 2266 - 2267 - /* 2268 - * We must however always use iregs for the unwinder to stay sane; the 2269 - * record BP,SP,IP can point into thin air 
when the record is from a 2270 - * previous PMI context or an (I)RET happened between the record and 2271 - * PMI. 2272 - */ 2273 - perf_sample_save_callchain(data, event, iregs); 2087 + __setup_perf_sample_data(event, iregs, data); 2274 2088 2275 2089 *regs = *iregs; 2276 - /* The ip in basic is EventingIP */ 2277 - set_linear_ip(regs, basic->ip); 2278 - regs->flags = PERF_EFLAGS_EXACT; 2279 2090 2280 - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { 2281 - if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) 2282 - data->weight.var3_w = basic->retire_latency; 2283 - else 2284 - data->weight.var3_w = 0; 2285 - } 2091 + /* basic group */ 2092 + retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ? 2093 + basic->retire_latency : 0; 2094 + __setup_pebs_basic_group(event, regs, data, sample_type, 2095 + basic->ip, basic->tsc, retire); 2286 2096 2287 2097 /* 2288 2098 * The record for MEMINFO is in front of GP ··· 2284 2122 gprs = next_record; 2285 2123 next_record = gprs + 1; 2286 2124 2287 - if (event->attr.precise_ip < 2) { 2288 - set_linear_ip(regs, gprs->ip); 2289 - regs->flags &= ~PERF_EFLAGS_EXACT; 2290 - } 2291 - 2292 - if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) 2293 - adaptive_pebs_save_regs(regs, gprs); 2125 + __setup_pebs_gpr_group(event, regs, gprs, sample_type); 2294 2126 } 2295 2127 2296 2128 if (format_group & PEBS_DATACFG_MEMINFO) { 2297 - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { 2298 - u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? 2299 - meminfo->cache_latency : meminfo->mem_latency; 2129 + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? 2130 + meminfo->cache_latency : meminfo->mem_latency; 2131 + u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? 2132 + meminfo->instr_latency : 0; 2133 + u64 ax = gprs ? 
gprs->ax : 0; 2300 2134 2301 - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) 2302 - data->weight.var2_w = meminfo->instr_latency; 2303 - 2304 - /* 2305 - * Although meminfo::latency is defined as a u64, 2306 - * only the lower 32 bits include the valid data 2307 - * in practice on Ice Lake and earlier platforms. 2308 - */ 2309 - if (sample_type & PERF_SAMPLE_WEIGHT) { 2310 - data->weight.full = latency ?: 2311 - intel_get_tsx_weight(meminfo->tsx_tuning); 2312 - } else { 2313 - data->weight.var1_dw = (u32)latency ?: 2314 - intel_get_tsx_weight(meminfo->tsx_tuning); 2315 - } 2316 - 2317 - data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 2318 - } 2319 - 2320 - if (sample_type & PERF_SAMPLE_DATA_SRC) { 2321 - data->data_src.val = get_data_src(event, meminfo->aux); 2322 - data->sample_flags |= PERF_SAMPLE_DATA_SRC; 2323 - } 2324 - 2325 - if (sample_type & PERF_SAMPLE_ADDR_TYPE) { 2326 - data->addr = meminfo->address; 2327 - data->sample_flags |= PERF_SAMPLE_ADDR; 2328 - } 2329 - 2330 - if (sample_type & PERF_SAMPLE_TRANSACTION) { 2331 - data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, 2332 - gprs ? gprs->ax : 0); 2333 - data->sample_flags |= PERF_SAMPLE_TRANSACTION; 2334 - } 2135 + __setup_pebs_meminfo_group(event, data, sample_type, latency, 2136 + instr_latency, meminfo->address, 2137 + meminfo->aux, meminfo->tsx_tuning, 2138 + ax); 2335 2139 } 2336 2140 2337 2141 if (format_group & PEBS_DATACFG_XMMS) { ··· 2346 2218 basic->format_size, 2347 2219 (u64)(next_record - __pebs), 2348 2220 format_group); 2221 + } 2222 + 2223 + static inline bool arch_pebs_record_continued(struct arch_pebs_header *header) 2224 + { 2225 + /* Continue bit or null PEBS record indicates fragment follows. 
*/ 2226 + return header->cont || !(header->format & GENMASK_ULL(63, 16)); 2227 + } 2228 + 2229 + static void setup_arch_pebs_sample_data(struct perf_event *event, 2230 + struct pt_regs *iregs, 2231 + void *__pebs, 2232 + struct perf_sample_data *data, 2233 + struct pt_regs *regs) 2234 + { 2235 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2236 + u64 sample_type = event->attr.sample_type; 2237 + struct arch_pebs_header *header = NULL; 2238 + struct arch_pebs_aux *meminfo = NULL; 2239 + struct arch_pebs_gprs *gprs = NULL; 2240 + struct x86_perf_regs *perf_regs; 2241 + void *next_record; 2242 + void *at = __pebs; 2243 + 2244 + if (at == NULL) 2245 + return; 2246 + 2247 + perf_regs = container_of(regs, struct x86_perf_regs, regs); 2248 + perf_regs->xmm_regs = NULL; 2249 + 2250 + __setup_perf_sample_data(event, iregs, data); 2251 + 2252 + *regs = *iregs; 2253 + 2254 + again: 2255 + header = at; 2256 + next_record = at + sizeof(struct arch_pebs_header); 2257 + if (header->basic) { 2258 + struct arch_pebs_basic *basic = next_record; 2259 + u16 retire = 0; 2260 + 2261 + next_record = basic + 1; 2262 + 2263 + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) 2264 + retire = basic->valid ? basic->retire : 0; 2265 + __setup_pebs_basic_group(event, regs, data, sample_type, 2266 + basic->ip, basic->tsc, retire); 2267 + } 2268 + 2269 + /* 2270 + * The record for MEMINFO is in front of GP 2271 + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. 2272 + * Save the pointer here but process later. 2273 + */ 2274 + if (header->aux) { 2275 + meminfo = next_record; 2276 + next_record = meminfo + 1; 2277 + } 2278 + 2279 + if (header->gpr) { 2280 + gprs = next_record; 2281 + next_record = gprs + 1; 2282 + 2283 + __setup_pebs_gpr_group(event, regs, 2284 + (struct pebs_gprs *)gprs, 2285 + sample_type); 2286 + } 2287 + 2288 + if (header->aux) { 2289 + u64 ax = gprs ? 
gprs->ax : 0; 2290 + 2291 + __setup_pebs_meminfo_group(event, data, sample_type, 2292 + meminfo->cache_latency, 2293 + meminfo->instr_latency, 2294 + meminfo->address, meminfo->aux, 2295 + meminfo->tsx_tuning, ax); 2296 + } 2297 + 2298 + if (header->xmm) { 2299 + struct pebs_xmm *xmm; 2300 + 2301 + next_record += sizeof(struct arch_pebs_xer_header); 2302 + 2303 + xmm = next_record; 2304 + perf_regs->xmm_regs = xmm->xmm; 2305 + next_record = xmm + 1; 2306 + } 2307 + 2308 + if (header->lbr) { 2309 + struct arch_pebs_lbr_header *lbr_header = next_record; 2310 + struct lbr_entry *lbr; 2311 + int num_lbr; 2312 + 2313 + next_record = lbr_header + 1; 2314 + lbr = next_record; 2315 + 2316 + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ? 2317 + lbr_header->depth : 2318 + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES; 2319 + next_record += num_lbr * sizeof(struct lbr_entry); 2320 + 2321 + if (has_branch_stack(event)) { 2322 + intel_pmu_store_pebs_lbrs(lbr); 2323 + intel_pmu_lbr_save_brstack(data, cpuc, event); 2324 + } 2325 + } 2326 + 2327 + if (header->cntr) { 2328 + struct arch_pebs_cntr_header *cntr = next_record; 2329 + unsigned int nr; 2330 + 2331 + next_record += sizeof(struct arch_pebs_cntr_header); 2332 + 2333 + if (is_pebs_counter_event_group(event)) { 2334 + __setup_pebs_counter_group(cpuc, event, 2335 + (struct pebs_cntr_header *)cntr, next_record); 2336 + data->sample_flags |= PERF_SAMPLE_READ; 2337 + } 2338 + 2339 + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); 2340 + if (cntr->metrics == INTEL_CNTR_METRICS) 2341 + nr += 2; 2342 + next_record += nr * sizeof(u64); 2343 + } 2344 + 2345 + /* Parse followed fragments if there are. 
*/ 2346 + if (arch_pebs_record_continued(header)) { 2347 + at = at + header->size; 2348 + goto again; 2349 + } 2349 2350 } 2350 2351 2351 2352 static inline void * ··· 2859 2602 } 2860 2603 } 2861 2604 2605 + static __always_inline void 2606 + __intel_pmu_handle_pebs_record(struct pt_regs *iregs, 2607 + struct pt_regs *regs, 2608 + struct perf_sample_data *data, 2609 + void *at, u64 pebs_status, 2610 + short *counts, void **last, 2611 + setup_fn setup_sample) 2612 + { 2613 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2614 + struct perf_event *event; 2615 + int bit; 2616 + 2617 + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { 2618 + event = cpuc->events[bit]; 2619 + 2620 + if (WARN_ON_ONCE(!event) || 2621 + WARN_ON_ONCE(!event->attr.precise_ip)) 2622 + continue; 2623 + 2624 + if (counts[bit]++) { 2625 + __intel_pmu_pebs_event(event, iregs, regs, data, 2626 + last[bit], setup_sample); 2627 + } 2628 + 2629 + last[bit] = at; 2630 + } 2631 + } 2632 + 2633 + static __always_inline void 2634 + __intel_pmu_handle_last_pebs_record(struct pt_regs *iregs, 2635 + struct pt_regs *regs, 2636 + struct perf_sample_data *data, 2637 + u64 mask, short *counts, void **last, 2638 + setup_fn setup_sample) 2639 + { 2640 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2641 + struct perf_event *event; 2642 + int bit; 2643 + 2644 + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { 2645 + if (!counts[bit]) 2646 + continue; 2647 + 2648 + event = cpuc->events[bit]; 2649 + 2650 + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], 2651 + counts[bit], setup_sample); 2652 + } 2653 + 2654 + } 2655 + 2862 2656 static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) 2863 2657 { 2864 2658 short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; ··· 2919 2611 struct x86_perf_regs perf_regs; 2920 2612 struct pt_regs *regs = &perf_regs.regs; 2921 2613 struct pebs_basic *basic; 2922 
- struct perf_event *event; 2923 2614 void *base, *at, *top; 2924 - int bit; 2925 2615 u64 mask; 2926 2616 2927 2617 if (!x86_pmu.pebs_active) ··· 2932 2626 2933 2627 mask = hybrid(cpuc->pmu, pebs_events_mask) | 2934 2628 (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); 2629 + mask &= cpuc->pebs_enabled; 2935 2630 2936 2631 if (unlikely(base >= top)) { 2937 2632 intel_pmu_pebs_event_update_no_drain(cpuc, mask); ··· 2950 2643 if (basic->format_size != cpuc->pebs_record_size) 2951 2644 continue; 2952 2645 2953 - pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; 2954 - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { 2955 - event = cpuc->events[bit]; 2956 - 2957 - if (WARN_ON_ONCE(!event) || 2958 - WARN_ON_ONCE(!event->attr.precise_ip)) 2959 - continue; 2960 - 2961 - if (counts[bit]++) { 2962 - __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], 2963 - setup_pebs_adaptive_sample_data); 2964 - } 2965 - last[bit] = at; 2966 - } 2646 + pebs_status = mask & basic->applicable_counters; 2647 + __intel_pmu_handle_pebs_record(iregs, regs, data, at, 2648 + pebs_status, counts, last, 2649 + setup_pebs_adaptive_sample_data); 2967 2650 } 2968 2651 2969 - for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { 2970 - if (!counts[bit]) 2652 + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last, 2653 + setup_pebs_adaptive_sample_data); 2654 + } 2655 + 2656 + static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, 2657 + struct perf_sample_data *data) 2658 + { 2659 + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; 2660 + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; 2661 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2662 + union arch_pebs_index index; 2663 + struct x86_perf_regs perf_regs; 2664 + struct pt_regs *regs = &perf_regs.regs; 2665 + void *base, *at, *top; 2666 + u64 mask; 2667 + 2668 + rdmsrq(MSR_IA32_PEBS_INDEX, index.whole); 2669 + 
2670 + if (unlikely(!index.wr)) { 2671 + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); 2672 + return; 2673 + } 2674 + 2675 + base = cpuc->pebs_vaddr; 2676 + top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT); 2677 + 2678 + index.wr = 0; 2679 + index.full = 0; 2680 + index.en = 1; 2681 + if (cpuc->n_pebs == cpuc->n_large_pebs) 2682 + index.thresh = ARCH_PEBS_THRESH_MULTI; 2683 + else 2684 + index.thresh = ARCH_PEBS_THRESH_SINGLE; 2685 + wrmsrq(MSR_IA32_PEBS_INDEX, index.whole); 2686 + 2687 + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; 2688 + 2689 + if (!iregs) 2690 + iregs = &dummy_iregs; 2691 + 2692 + /* Process all but the last event for each counter. */ 2693 + for (at = base; at < top;) { 2694 + struct arch_pebs_header *header; 2695 + struct arch_pebs_basic *basic; 2696 + u64 pebs_status; 2697 + 2698 + header = at; 2699 + 2700 + if (WARN_ON_ONCE(!header->size)) 2701 + break; 2702 + 2703 + /* 1st fragment or single record must have basic group */ 2704 + if (!header->basic) { 2705 + at += header->size; 2971 2706 continue; 2707 + } 2972 2708 2973 - event = cpuc->events[bit]; 2709 + basic = at + sizeof(struct arch_pebs_header); 2710 + pebs_status = mask & basic->applicable_counters; 2711 + __intel_pmu_handle_pebs_record(iregs, regs, data, at, 2712 + pebs_status, counts, last, 2713 + setup_arch_pebs_sample_data); 2974 2714 2975 - __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], 2976 - counts[bit], setup_pebs_adaptive_sample_data); 2715 + /* Skip non-last fragments */ 2716 + while (arch_pebs_record_continued(header)) { 2717 + if (!header->size) 2718 + break; 2719 + at += header->size; 2720 + header = at; 2721 + } 2722 + 2723 + /* Skip last fragment or the single record */ 2724 + at += header->size; 2977 2725 } 2726 + 2727 + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, 2728 + counts, last, 2729 + setup_arch_pebs_sample_data); 2730 + } 2731 + 2732 + static void __init 
intel_arch_pebs_init(void) 2733 + { 2734 + /* 2735 + * Current hybrid platforms always both support arch-PEBS or not 2736 + * on all kinds of cores. So directly set x86_pmu.arch_pebs flag 2737 + * if boot cpu supports arch-PEBS. 2738 + */ 2739 + x86_pmu.arch_pebs = 1; 2740 + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; 2741 + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; 2742 + x86_pmu.pebs_capable = ~0ULL; 2743 + x86_pmu.flags |= PMU_FL_PEBS_ALL; 2744 + 2745 + x86_pmu.pebs_enable = __intel_pmu_pebs_enable; 2746 + x86_pmu.pebs_disable = __intel_pmu_pebs_disable; 2978 2747 } 2979 2748 2980 2749 /* 2981 2750 * PEBS probe and setup 2982 2751 */ 2983 2752 2984 - void __init intel_pebs_init(void) 2753 + static void __init intel_ds_pebs_init(void) 2985 2754 { 2986 2755 /* 2987 2756 * No support for 32bit formats ··· 3119 2736 break; 3120 2737 3121 2738 case 6: 3122 - if (x86_pmu.intel_cap.pebs_baseline) { 2739 + if (x86_pmu.intel_cap.pebs_baseline) 3123 2740 x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; 3124 - x86_pmu.late_setup = intel_pmu_late_setup; 3125 - } 3126 2741 fallthrough; 3127 2742 case 5: 3128 2743 x86_pmu.pebs_ept = 1; ··· 3168 2787 x86_pmu.ds_pebs = 0; 3169 2788 } 3170 2789 } 2790 + } 2791 + 2792 + void __init intel_pebs_init(void) 2793 + { 2794 + if (x86_pmu.intel_cap.pebs_format == 0xf) 2795 + intel_arch_pebs_init(); 2796 + else 2797 + intel_ds_pebs_init(); 3171 2798 } 3172 2799 3173 2800 void perf_restore_debug_store(void)

+36 -5

arch/x86/events/perf_event.h

··· 283 283 * Intel DebugStore bits 284 284 */ 285 285 struct debug_store *ds; 286 - void *ds_pebs_vaddr; 287 286 void *ds_bts_vaddr; 287 + /* DS based PEBS or arch-PEBS buffer address */ 288 + void *pebs_vaddr; 288 289 u64 pebs_enabled; 289 290 int n_pebs; 290 291 int n_large_pebs; ··· 304 303 /* Intel ACR configuration */ 305 304 u64 acr_cfg_b[X86_PMC_IDX_MAX]; 306 305 u64 acr_cfg_c[X86_PMC_IDX_MAX]; 306 + /* Cached CFG_C values */ 307 + u64 cfg_c_val[X86_PMC_IDX_MAX]; 307 308 308 309 /* 309 310 * Intel LBR bits ··· 711 708 hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, 712 709 }; 713 710 711 + struct arch_pebs_cap { 712 + u64 caps; 713 + u64 counters; 714 + u64 pdists; 715 + }; 716 + 714 717 struct x86_hybrid_pmu { 715 718 struct pmu pmu; 716 719 const char *name; ··· 760 751 unsigned int late_ack :1, 761 752 mid_ack :1, 762 753 enabled_ack :1; 754 + 755 + struct arch_pebs_cap arch_pebs_cap; 763 756 764 757 u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; 765 758 }; ··· 917 906 union perf_capabilities intel_cap; 918 907 919 908 /* 920 - * Intel DebugStore bits 909 + * Intel DebugStore and PEBS bits 921 910 */ 922 911 unsigned int bts :1, 923 912 bts_active :1, ··· 928 917 pebs_no_tlb :1, 929 918 pebs_no_isolation :1, 930 919 pebs_block :1, 931 - pebs_ept :1; 920 + pebs_ept :1, 921 + arch_pebs :1; 932 922 int pebs_record_size; 933 923 int pebs_buffer_size; 934 924 u64 pebs_events_mask; ··· 940 928 unsigned long large_pebs_flags; 941 929 u64 rtm_abort_event; 942 930 u64 pebs_capable; 931 + 932 + /* 933 + * Intel Architectural PEBS 934 + */ 935 + struct arch_pebs_cap arch_pebs_cap; 943 936 944 937 /* 945 938 * Intel LBR ··· 1141 1124 .pmu_type = _pmu, \ 1142 1125 } 1143 1126 1144 - int is_x86_event(struct perf_event *event); 1145 1127 struct pmu *x86_get_pmu(unsigned int cpu); 1146 1128 extern struct x86_pmu x86_pmu __read_mostly; 1147 1129 ··· 1233 1217 1234 1218 void x86_release_hardware(void); 1235 1219 1236 - int x86_pmu_max_precise(void); 1220 + int 
x86_pmu_max_precise(struct pmu *pmu); 1237 1221 1238 1222 void hw_perf_lbr_event_destroy(struct perf_event *event); 1239 1223 ··· 1620 1604 1621 1605 int intel_pmu_init(void); 1622 1606 1607 + int alloc_arch_pebs_buf_on_cpu(int cpu); 1608 + 1609 + void release_arch_pebs_buf_on_cpu(int cpu); 1610 + 1611 + void init_arch_pebs_on_cpu(int cpu); 1612 + 1613 + void fini_arch_pebs_on_cpu(int cpu); 1614 + 1623 1615 void init_debug_store_on_cpu(int cpu); 1624 1616 1625 1617 void fini_debug_store_on_cpu(int cpu); ··· 1784 1760 1785 1761 void intel_pmu_pebs_data_source_lnl(void); 1786 1762 1763 + u64 intel_get_arch_pebs_data_config(struct perf_event *event); 1764 + 1787 1765 int intel_pmu_setup_lbr_filter(struct perf_event *event); 1788 1766 1789 1767 void intel_pt_interrupt(void); ··· 1816 1790 { 1817 1791 static_assert(MAX_PEBS_EVENTS == 32); 1818 1792 return fls((u32)hybrid(pmu, pebs_events_mask)); 1793 + } 1794 + 1795 + static inline bool intel_pmu_has_pebs(void) 1796 + { 1797 + return x86_pmu.ds_pebs || x86_pmu.arch_pebs; 1819 1798 } 1820 1799 1821 1800 #else /* CONFIG_CPU_SUP_INTEL */

+2

arch/x86/include/asm/insn-eval.h

··· 44 44 45 45 enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); 46 46 47 + bool insn_is_nop(struct insn *insn); 48 + 47 49 #endif /* _ASM_X86_INSN_EVAL_H */

+2 -3

arch/x86/include/asm/insn.h

··· 312 312 /** 313 313 * for_each_insn_prefix() -- Iterate prefixes in the instruction 314 314 * @insn: Pointer to struct insn. 315 - * @idx: Index storage. 316 315 * @prefix: Prefix byte. 317 316 * 318 317 * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix ··· 320 321 * Since prefixes.nbytes can be bigger than 4 if some prefixes 321 322 * are repeated, it cannot be used for looping over the prefixes. 322 323 */ 323 - #define for_each_insn_prefix(insn, idx, prefix) \ 324 - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) 324 + #define for_each_insn_prefix(insn, prefix) \ 325 + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) 325 326 326 327 #define POP_SS_OPCODE 0x1f 327 328 #define MOV_SREG_OPCODE 0x8e

+9 -1

arch/x86/include/asm/intel_ds.h

··· 4 4 #include <linux/percpu-defs.h> 5 5 6 6 #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) 7 - #define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) 7 + #define PEBS_BUFFER_SHIFT 4 8 + #define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) 9 + 10 + /* 11 + * The largest PEBS record can consume a whole page; ensure that 12 + * at least one more record can be written after the PMI is triggered. 13 + */ 14 + #define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT) 15 + #define ARCH_PEBS_THRESH_SINGLE 1 8 16 9 17 /* The maximal number of PEBS events: */ 10 18 #define MAX_PEBS_EVENTS_FMT4 8

+20

arch/x86/include/asm/msr-index.h

··· 327 327 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ 328 328 PERF_CAP_PEBS_TIMING_INFO) 329 329 330 + /* Arch PEBS */ 331 + #define MSR_IA32_PEBS_BASE 0x000003f4 332 + #define MSR_IA32_PEBS_INDEX 0x000003f5 333 + #define ARCH_PEBS_OFFSET_MASK 0x7fffff 334 + #define ARCH_PEBS_INDEX_WR_SHIFT 4 335 + 336 + #define ARCH_PEBS_RELOAD 0xffffffff 337 + #define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) 338 + #define ARCH_PEBS_CNTR_GP BIT_ULL(36) 339 + #define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) 340 + #define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) 341 + #define ARCH_PEBS_LBR_SHIFT 40 342 + #define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) 343 + #define ARCH_PEBS_VECR_XMM BIT_ULL(49) 344 + #define ARCH_PEBS_GPR BIT_ULL(61) 345 + #define ARCH_PEBS_AUX BIT_ULL(62) 346 + #define ARCH_PEBS_EN BIT_ULL(63) 347 + #define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ 348 + ARCH_PEBS_CNTR_METRICS) 349 + 330 350 #define MSR_IA32_RTIT_CTL 0x00000570 331 351 #define RTIT_CTL_TRACEEN BIT(0) 332 352 #define RTIT_CTL_CYCLEACC BIT(1)

+112 -4

arch/x86/include/asm/perf_event.h

··· 141 141 #define ARCH_PERFMON_EVENTS_COUNT 7 142 142 143 143 #define PEBS_DATACFG_MEMINFO BIT_ULL(0) 144 - #define PEBS_DATACFG_GP BIT_ULL(1) 144 + #define PEBS_DATACFG_GP BIT_ULL(1) 145 145 #define PEBS_DATACFG_XMMS BIT_ULL(2) 146 146 #define PEBS_DATACFG_LBRS BIT_ULL(3) 147 - #define PEBS_DATACFG_LBR_SHIFT 24 148 147 #define PEBS_DATACFG_CNTR BIT_ULL(4) 148 + #define PEBS_DATACFG_METRICS BIT_ULL(5) 149 + #define PEBS_DATACFG_LBR_SHIFT 24 149 150 #define PEBS_DATACFG_CNTR_SHIFT 32 150 151 #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) 151 152 #define PEBS_DATACFG_FIX_SHIFT 48 152 153 #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) 153 - #define PEBS_DATACFG_METRICS BIT_ULL(5) 154 154 155 155 /* Steal the highest bit of pebs_data_cfg for SW usage */ 156 156 #define PEBS_UPDATE_DS_SW BIT_ULL(63) ··· 200 200 #define ARCH_PERFMON_EXT_LEAF 0x00000023 201 201 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 202 202 #define ARCH_PERFMON_ACR_LEAF 0x2 203 + #define ARCH_PERFMON_PEBS_CAP_LEAF 0x4 204 + #define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5 203 205 204 206 union cpuid35_eax { 205 207 struct { ··· 212 210 unsigned int acr_subleaf:1; 213 211 /* Events Sub-Leaf */ 214 212 unsigned int events_subleaf:1; 215 - unsigned int reserved:28; 213 + /* arch-PEBS Sub-Leaves */ 214 + unsigned int pebs_caps_subleaf:1; 215 + unsigned int pebs_cnts_subleaf:1; 216 + unsigned int reserved:26; 216 217 } split; 217 218 unsigned int full; 218 219 }; ··· 437 432 #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) 438 433 #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 439 434 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) 435 + #define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54 436 + #define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT) 440 437 #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 441 438 442 439 #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) ··· 508 501 }; 509 502 510 503 #define INTEL_CNTR_METRICS 0x3 504 
+ 505 + /* 506 + * Arch PEBS 507 + */ 508 + union arch_pebs_index { 509 + struct { 510 + u64 rsvd:4, 511 + wr:23, 512 + rsvd2:4, 513 + full:1, 514 + en:1, 515 + rsvd3:3, 516 + thresh:23, 517 + rsvd4:5; 518 + }; 519 + u64 whole; 520 + }; 521 + 522 + struct arch_pebs_header { 523 + union { 524 + u64 format; 525 + struct { 526 + u64 size:16, /* Record size */ 527 + rsvd:14, 528 + mode:1, /* 64BIT_MODE */ 529 + cont:1, 530 + rsvd2:3, 531 + cntr:5, 532 + lbr:2, 533 + rsvd3:7, 534 + xmm:1, 535 + ymmh:1, 536 + rsvd4:2, 537 + opmask:1, 538 + zmmh:1, 539 + h16zmm:1, 540 + rsvd5:5, 541 + gpr:1, 542 + aux:1, 543 + basic:1; 544 + }; 545 + }; 546 + u64 rsvd6; 547 + }; 548 + 549 + struct arch_pebs_basic { 550 + u64 ip; 551 + u64 applicable_counters; 552 + u64 tsc; 553 + u64 retire :16, /* Retire Latency */ 554 + valid :1, 555 + rsvd :47; 556 + u64 rsvd2; 557 + u64 rsvd3; 558 + }; 559 + 560 + struct arch_pebs_aux { 561 + u64 address; 562 + u64 rsvd; 563 + u64 rsvd2; 564 + u64 rsvd3; 565 + u64 rsvd4; 566 + u64 aux; 567 + u64 instr_latency :16, 568 + pad2 :16, 569 + cache_latency :16, 570 + pad3 :16; 571 + u64 tsx_tuning; 572 + }; 573 + 574 + struct arch_pebs_gprs { 575 + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; 576 + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp; 577 + u64 rsvd; 578 + }; 579 + 580 + struct arch_pebs_xer_header { 581 + u64 xstate; 582 + u64 rsvd; 583 + }; 584 + 585 + #define ARCH_PEBS_LBR_NAN 0x0 586 + #define ARCH_PEBS_LBR_NUM_8 0x1 587 + #define ARCH_PEBS_LBR_NUM_16 0x2 588 + #define ARCH_PEBS_LBR_NUM_VAR 0x3 589 + #define ARCH_PEBS_BASE_LBR_ENTRIES 8 590 + struct arch_pebs_lbr_header { 591 + u64 rsvd; 592 + u64 ctl; 593 + u64 depth; 594 + u64 ler_from; 595 + u64 ler_to; 596 + u64 ler_info; 597 + }; 598 + 599 + struct arch_pebs_cntr_header { 600 + u32 cntr; 601 + u32 fixed; 602 + u32 metrics; 603 + u32 reserved; 604 + }; 511 605 512 606 /* 513 607 * AMD Extended Performance Monitoring and Debug cpuid feature detection

+41

arch/x86/include/asm/unwind_user.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _ASM_X86_UNWIND_USER_H 3 + #define _ASM_X86_UNWIND_USER_H 4 + 5 + #ifdef CONFIG_HAVE_UNWIND_USER_FP 6 + 7 + #include <asm/ptrace.h> 8 + #include <asm/uprobes.h> 9 + 10 + #define ARCH_INIT_USER_FP_FRAME(ws) \ 11 + .cfa_off = 2*(ws), \ 12 + .ra_off = -1*(ws), \ 13 + .fp_off = -2*(ws), \ 14 + .use_fp = true, 15 + 16 + #define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ 17 + .cfa_off = 1*(ws), \ 18 + .ra_off = -1*(ws), \ 19 + .fp_off = 0, \ 20 + .use_fp = false, 21 + 22 + static inline int unwind_user_word_size(struct pt_regs *regs) 23 + { 24 + /* We can't unwind VM86 stacks */ 25 + if (regs->flags & X86_VM_MASK) 26 + return 0; 27 + #ifdef CONFIG_X86_64 28 + if (!user_64bit_mode(regs)) 29 + return sizeof(int); 30 + #endif 31 + return sizeof(long); 32 + } 33 + 34 + static inline bool unwind_user_at_function_start(struct pt_regs *regs) 35 + { 36 + return is_uprobe_at_func_entry(regs); 37 + } 38 + 39 + #endif /* CONFIG_HAVE_UNWIND_USER_FP */ 40 + 41 + #endif /* _ASM_X86_UNWIND_USER_H */

+9

arch/x86/include/asm/uprobes.h

··· 62 62 unsigned int saved_tf; 63 63 }; 64 64 65 + #ifdef CONFIG_UPROBES 66 + extern bool is_uprobe_at_func_entry(struct pt_regs *regs); 67 + #else 68 + static bool is_uprobe_at_func_entry(struct pt_regs *regs) 69 + { 70 + return false; 71 + } 72 + #endif /* CONFIG_UPROBES */ 73 + 65 74 #endif /* _ASM_UPROBES_H */

+1 -19

arch/x86/kernel/alternative.c

··· 9 9 10 10 #include <asm/text-patching.h> 11 11 #include <asm/insn.h> 12 + #include <asm/insn-eval.h> 12 13 #include <asm/ibt.h> 13 14 #include <asm/set_memory.h> 14 15 #include <asm/nmi.h> ··· 344 343 345 344 for (;buf < target; buf++) 346 345 *buf = INT3_INSN_OPCODE; 347 - } 348 - 349 - /* 350 - * Matches NOP and NOPL, not any of the other possible NOPs. 351 - */ 352 - static bool insn_is_nop(struct insn *insn) 353 - { 354 - /* Anything NOP, but no REP NOP */ 355 - if (insn->opcode.bytes[0] == 0x90 && 356 - (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) 357 - return true; 358 - 359 - /* NOPL */ 360 - if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) 361 - return true; 362 - 363 - /* TODO: more nops */ 364 - 365 - return false; 366 346 } 367 347 368 348 /*

+1 -2

arch/x86/kernel/kprobes/core.c

··· 141 141 { 142 142 kprobe_opcode_t opcode; 143 143 insn_byte_t prefix; 144 - int i; 145 144 146 145 if (search_exception_tables((unsigned long)addr)) 147 146 return false; /* Page fault may occur on this address. */ ··· 153 154 if (insn->opcode.nbytes != 1) 154 155 return false; 155 156 156 - for_each_insn_prefix(insn, i, prefix) { 157 + for_each_insn_prefix(insn, prefix) { 157 158 insn_attr_t attr; 158 159 159 160 attr = inat_get_opcode_attribute(prefix);

+37 -33

arch/x86/kernel/uprobes.c

··· 17 17 #include <linux/kdebug.h> 18 18 #include <asm/processor.h> 19 19 #include <asm/insn.h> 20 + #include <asm/insn-eval.h> 20 21 #include <asm/mmu_context.h> 21 22 #include <asm/nops.h> 22 23 ··· 259 258 static bool is_prefix_bad(struct insn *insn) 260 259 { 261 260 insn_byte_t p; 262 - int i; 263 261 264 - for_each_insn_prefix(insn, i, p) { 262 + for_each_insn_prefix(insn, p) { 265 263 insn_attr_t attr; 266 264 267 265 attr = inat_get_opcode_attribute(p); ··· 1158 1158 mmap_write_unlock(mm); 1159 1159 } 1160 1160 1161 - static bool insn_is_nop(struct insn *insn) 1162 - { 1163 - return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; 1164 - } 1165 - 1166 - static bool insn_is_nopl(struct insn *insn) 1167 - { 1168 - if (insn->opcode.nbytes != 2) 1169 - return false; 1170 - 1171 - if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) 1172 - return false; 1173 - 1174 - if (!insn->modrm.nbytes) 1175 - return false; 1176 - 1177 - if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) 1178 - return false; 1179 - 1180 - /* 0f 1f /0 - NOPL */ 1181 - return true; 1182 - } 1183 - 1184 1161 static bool can_optimize(struct insn *insn, unsigned long vaddr) 1185 1162 { 1186 1163 if (!insn->x86_64 || insn->length != 5) 1187 1164 return false; 1188 1165 1189 - if (!insn_is_nop(insn) && !insn_is_nopl(insn)) 1166 + if (!insn_is_nop(insn)) 1190 1167 return false; 1191 1168 1192 1169 /* We can't do cross page atomic writes yet. 
*/ ··· 1403 1426 { 1404 1427 u8 opc1 = OPCODE1(insn); 1405 1428 insn_byte_t p; 1406 - int i; 1407 1429 1408 - /* x86_nops[insn->length]; same as jmp with .offs = 0 */ 1409 - if (insn->length <= ASM_NOP_MAX && 1410 - !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) 1430 + if (insn_is_nop(insn)) 1411 1431 goto setup; 1412 1432 1413 1433 switch (opc1) { 1414 1434 case 0xeb: /* jmp 8 */ 1415 1435 case 0xe9: /* jmp 32 */ 1416 1436 break; 1417 - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ 1418 - goto setup; 1419 1437 1420 1438 case 0xe8: /* call relative */ 1421 1439 branch_clear_offset(auprobe, insn); ··· 1435 1463 * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. 1436 1464 * No one uses these insns, reject any branch insns with such prefix. 1437 1465 */ 1438 - for_each_insn_prefix(insn, i, p) { 1466 + for_each_insn_prefix(insn, p) { 1439 1467 if (p == 0x66) 1440 1468 return -ENOTSUPP; 1441 1469 } ··· 1790 1818 return regs->sp < ret->stack; 1791 1819 else 1792 1820 return regs->sp <= ret->stack; 1821 + } 1822 + 1823 + /* 1824 + * Heuristic-based check if uprobe is installed at the function entry. 1825 + * 1826 + * Under assumption of user code being compiled with frame pointers, 1827 + * `push %rbp/%ebp` is a good indicator that we indeed are. 1828 + * 1829 + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. 1830 + * If we get this wrong, captured stack trace might have one extra bogus 1831 + * entry, but the rest of stack trace will still be meaningful. 
1832 + */ 1833 + bool is_uprobe_at_func_entry(struct pt_regs *regs) 1834 + { 1835 + struct arch_uprobe *auprobe; 1836 + 1837 + if (!current->utask) 1838 + return false; 1839 + 1840 + auprobe = current->utask->auprobe; 1841 + if (!auprobe) 1842 + return false; 1843 + 1844 + /* push %rbp/%ebp */ 1845 + if (auprobe->insn[0] == 0x55) 1846 + return true; 1847 + 1848 + /* endbr64 (64-bit only) */ 1849 + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) 1850 + return true; 1851 + 1852 + return false; 1793 1853 }

+147 -4

arch/x86/lib/insn-eval.c

··· 63 63 bool insn_has_rep_prefix(struct insn *insn) 64 64 { 65 65 insn_byte_t p; 66 - int i; 67 66 68 67 insn_get_prefixes(insn); 69 68 70 - for_each_insn_prefix(insn, i, p) { 69 + for_each_insn_prefix(insn, p) { 71 70 if (p == 0xf2 || p == 0xf3) 72 71 return true; 73 72 } ··· 91 92 static int get_seg_reg_override_idx(struct insn *insn) 92 93 { 93 94 int idx = INAT_SEG_REG_DEFAULT; 94 - int num_overrides = 0, i; 95 + int num_overrides = 0; 95 96 insn_byte_t p; 96 97 97 98 insn_get_prefixes(insn); 98 99 99 100 /* Look for any segment override prefixes. */ 100 - for_each_insn_prefix(insn, i, p) { 101 + for_each_insn_prefix(insn, p) { 101 102 insn_attr_t attr; 102 103 103 104 attr = inat_get_opcode_attribute(p); ··· 1674 1675 } 1675 1676 1676 1677 return type; 1678 + } 1679 + 1680 + /* 1681 + * Recognise typical NOP patterns for both 32bit and 64bit. 1682 + * 1683 + * Notably: 1684 + * - NOP, but not: REP NOP aka PAUSE 1685 + * - NOPL 1686 + * - MOV %reg, %reg 1687 + * - LEA 0(%reg),%reg 1688 + * - JMP +0 1689 + * 1690 + * Must not have false-positives; instructions identified as a NOP might be 1691 + * emulated as a NOP (uprobe) or Run Length Encoded in a larger NOP 1692 + * (alternatives). 1693 + * 1694 + * False-negatives are fine; need not be exhaustive. 
1695 + */ 1696 + bool insn_is_nop(struct insn *insn) 1697 + { 1698 + u8 b3 = 0, x3 = 0, r3 = 0; 1699 + u8 b4 = 0, x4 = 0, r4 = 0, m = 0; 1700 + u8 modrm, modrm_mod, modrm_reg, modrm_rm; 1701 + u8 sib = 0, sib_scale, sib_index, sib_base; 1702 + u8 nrex, rex; 1703 + u8 p, rep = 0; 1704 + 1705 + if ((nrex = insn->rex_prefix.nbytes)) { 1706 + rex = insn->rex_prefix.bytes[nrex-1]; 1707 + 1708 + r3 = !!X86_REX_R(rex); 1709 + x3 = !!X86_REX_X(rex); 1710 + b3 = !!X86_REX_B(rex); 1711 + if (nrex > 1) { 1712 + r4 = !!X86_REX2_R(rex); 1713 + x4 = !!X86_REX2_X(rex); 1714 + b4 = !!X86_REX2_B(rex); 1715 + m = !!X86_REX2_M(rex); 1716 + } 1717 + 1718 + } else if (insn->vex_prefix.nbytes) { 1719 + /* 1720 + * Ignore VEX encoded NOPs 1721 + */ 1722 + return false; 1723 + } 1724 + 1725 + if (insn->modrm.nbytes) { 1726 + modrm = insn->modrm.bytes[0]; 1727 + modrm_mod = X86_MODRM_MOD(modrm); 1728 + modrm_reg = X86_MODRM_REG(modrm) + 8*r3 + 16*r4; 1729 + modrm_rm = X86_MODRM_RM(modrm) + 8*b3 + 16*b4; 1730 + modrm = 1; 1731 + } 1732 + 1733 + if (insn->sib.nbytes) { 1734 + sib = insn->sib.bytes[0]; 1735 + sib_scale = X86_SIB_SCALE(sib); 1736 + sib_index = X86_SIB_INDEX(sib) + 8*x3 + 16*x4; 1737 + sib_base = X86_SIB_BASE(sib) + 8*b3 + 16*b4; 1738 + sib = 1; 1739 + 1740 + modrm_rm = sib_base; 1741 + } 1742 + 1743 + for_each_insn_prefix(insn, p) { 1744 + if (p == 0xf3) /* REPE */ 1745 + rep = 1; 1746 + } 1747 + 1748 + /* 1749 + * Opcode map munging: 1750 + * 1751 + * REX2: 0 - single byte opcode 1752 + * 1 - 0f second byte opcode 1753 + */ 1754 + switch (m) { 1755 + case 0: break; 1756 + case 1: insn->opcode.value <<= 8; 1757 + insn->opcode.value |= 0x0f; 1758 + break; 1759 + default: 1760 + return false; 1761 + } 1762 + 1763 + switch (insn->opcode.bytes[0]) { 1764 + case 0x0f: /* 2nd byte */ 1765 + break; 1766 + 1767 + case 0x89: /* MOV */ 1768 + if (modrm_mod != 3) /* register-direct */ 1769 + return false; 1770 + 1771 + /* native size */ 1772 + if (insn->opnd_bytes != 4 * (1 + 
insn->x86_64)) 1773 + return false; 1774 + 1775 + return modrm_reg == modrm_rm; /* MOV %reg, %reg */ 1776 + 1777 + case 0x8d: /* LEA */ 1778 + if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */ 1779 + return false; 1780 + 1781 + /* native size */ 1782 + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) 1783 + return false; 1784 + 1785 + if (insn->displacement.value != 0) 1786 + return false; 1787 + 1788 + if (sib && (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */ 1789 + return false; 1790 + 1791 + for_each_insn_prefix(insn, p) { 1792 + if (p != 0x3e) /* DS */ 1793 + return false; 1794 + } 1795 + 1796 + return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */ 1797 + 1798 + case 0x90: /* NOP */ 1799 + if (b3 || b4) /* XCHG %r{8,16,24},%rax */ 1800 + return false; 1801 + 1802 + if (rep) /* REP NOP := PAUSE */ 1803 + return false; 1804 + 1805 + return true; 1806 + 1807 + case 0xe9: /* JMP.d32 */ 1808 + case 0xeb: /* JMP.d8 */ 1809 + return insn->immediate.value == 0; /* JMP +0 */ 1810 + 1811 + default: 1812 + return false; 1813 + } 1814 + 1815 + switch (insn->opcode.bytes[1]) { 1816 + case 0x1f: 1817 + return modrm_reg == 0; /* 0f 1f /0 -- NOPL */ 1818 + 1819 + default: 1820 + return false; 1821 + } 1677 1822 }

+1 -1

include/linux/irq-entry-common.h

··· 253 253 static __always_inline void exit_to_user_mode(void) 254 254 { 255 255 instrumentation_begin(); 256 + unwind_reset_info(); 256 257 trace_hardirqs_on_prepare(); 257 258 lockdep_hardirqs_on_prepare(); 258 259 instrumentation_end(); 259 260 260 - unwind_reset_info(); 261 261 user_enter_irqoff(); 262 262 arch_exit_to_user_mode(); 263 263 lockdep_hardirqs_on(CALLER_ADDR0);

+1 -1

include/linux/perf_event.h

··· 1720 1720 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); 1721 1721 extern struct perf_callchain_entry * 1722 1722 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 1723 - u32 max_stack, bool crosstask, bool add_mark); 1723 + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); 1724 1724 extern int get_callchain_buffers(int max_stack); 1725 1725 extern void put_callchain_buffers(void); 1726 1726 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);

+24 -26

include/linux/unwind_deferred.h

··· 6 6 #include <linux/unwind_user.h> 7 7 #include <linux/unwind_deferred_types.h> 8 8 9 - struct unwind_work; 10 - 11 - typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie); 12 - 13 - struct unwind_work { 14 - struct list_head list; 15 - unwind_callback_t func; 16 - int bit; 17 - }; 18 - 19 9 #ifdef CONFIG_UNWIND_USER 20 10 21 11 enum { ··· 34 44 static __always_inline void unwind_reset_info(void) 35 45 { 36 46 struct unwind_task_info *info = &current->unwind_info; 37 - unsigned long bits; 47 + unsigned long bits = atomic_long_read(&info->unwind_mask); 38 48 39 49 /* Was there any unwinding? */ 40 - if (unlikely(info->unwind_mask)) { 41 - bits = info->unwind_mask; 42 - do { 43 - /* Is a task_work going to run again before going back */ 44 - if (bits & UNWIND_PENDING) 45 - return; 46 - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); 47 - current->unwind_info.id.id = 0; 50 + if (likely(!bits)) 51 + return; 48 52 49 - if (unlikely(info->cache)) { 50 - info->cache->nr_entries = 0; 51 - info->cache->unwind_completed = 0; 52 - } 53 + do { 54 + /* Is a task_work going to run again before going back */ 55 + if (bits & UNWIND_PENDING) 56 + return; 57 + } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL)); 58 + current->unwind_info.id.id = 0; 59 + 60 + if (unlikely(info->cache)) { 61 + info->cache->nr_entries = 0; 62 + info->cache->unwind_completed = 0; 53 63 } 54 64 } 55 65 ··· 58 68 static inline void unwind_task_init(struct task_struct *task) {} 59 69 static inline void unwind_task_free(struct task_struct *task) {} 60 70 61 - static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; } 62 - static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; } 63 - static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; } 71 + static inline int unwind_user_faultable(struct 
unwind_stacktrace *trace) 72 + { return -ENOSYS; } 73 + 74 + static inline int 75 + unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) 76 + { return -ENOSYS; } 77 + 78 + static inline int 79 + unwind_deferred_request(struct unwind_work *work, u64 *timestamp) 80 + { return -ENOSYS; } 81 + 64 82 static inline void unwind_deferred_cancel(struct unwind_work *work) {} 65 83 66 84 static inline void unwind_deferred_task_exit(struct task_struct *task) {}

+17 -1

include/linux/unwind_deferred_types.h

··· 2 2 #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H 3 3 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H 4 4 5 + #include <linux/types.h> 6 + #include <linux/atomic.h> 7 + 5 8 struct unwind_cache { 6 9 unsigned long unwind_completed; 7 10 unsigned int nr_entries; ··· 33 30 }; 34 31 35 32 struct unwind_task_info { 36 - unsigned long unwind_mask; 33 + atomic_long_t unwind_mask; 37 34 struct unwind_cache *cache; 38 35 struct callback_head work; 39 36 union unwind_task_id id; 37 + }; 38 + 39 + struct unwind_work; 40 + struct unwind_stacktrace; 41 + 42 + typedef void (*unwind_callback_t)(struct unwind_work *work, 43 + struct unwind_stacktrace *trace, 44 + u64 cookie); 45 + 46 + struct unwind_work { 47 + struct list_head list; 48 + unwind_callback_t func; 49 + int bit; 40 50 }; 41 51 42 52 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */

+2

include/linux/unwind_user_types.h

··· 36 36 unsigned long ip; 37 37 unsigned long sp; 38 38 unsigned long fp; 39 + unsigned int ws; 39 40 enum unwind_user_type current_type; 40 41 unsigned int available_types; 42 + bool topmost; 41 43 bool done; 42 44 }; 43 45

+20 -1

include/uapi/linux/perf_event.h

··· 463 463 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 464 remove_on_exec : 1, /* event is removed from task on exec */ 465 465 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 - __reserved_1 : 26; 466 + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ 467 + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ 468 + __reserved_1 : 24; 467 469 468 470 union { 469 471 __u32 wakeup_events; /* wake up every n events */ ··· 1241 1239 */ 1242 1240 PERF_RECORD_AUX_OUTPUT_HW_ID = 21, 1243 1241 1242 + /* 1243 + * This user callchain capture was deferred until shortly before 1244 + * returning to user space. Previous samples would have kernel 1245 + * callchains only and they need to be stitched with this to make full 1246 + * callchains. 1247 + * 1248 + * struct { 1249 + * struct perf_event_header header; 1250 + * u64 cookie; 1251 + * u64 nr; 1252 + * u64 ips[nr]; 1253 + * struct sample_id sample_id; 1254 + * }; 1255 + */ 1256 + PERF_RECORD_CALLCHAIN_DEFERRED = 22, 1257 + 1244 1258 PERF_RECORD_MAX, /* non-ABI */ 1245 1259 }; 1246 1260 ··· 1287 1269 PERF_CONTEXT_HV = (__u64)-32, 1288 1270 PERF_CONTEXT_KERNEL = (__u64)-128, 1289 1271 PERF_CONTEXT_USER = (__u64)-512, 1272 + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, 1290 1273 1291 1274 PERF_CONTEXT_GUEST = (__u64)-2048, 1292 1275 PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,

+2 -2

kernel/bpf/stackmap.c

··· 315 315 max_depth = sysctl_perf_event_max_stack; 316 316 317 317 trace = get_perf_callchain(regs, kernel, user, max_depth, 318 - false, false); 318 + false, false, 0); 319 319 320 320 if (unlikely(!trace)) 321 321 /* couldn't fetch the stack trace */ ··· 452 452 trace = get_callchain_entry_for_task(task, max_depth); 453 453 else 454 454 trace = get_perf_callchain(regs, kernel, user, max_depth, 455 - crosstask, false); 455 + crosstask, false, 0); 456 456 457 457 if (unlikely(!trace) || trace->nr < skip) { 458 458 if (may_fault)

+13 -1

kernel/events/callchain.c

··· 218 218 219 219 struct perf_callchain_entry * 220 220 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, 221 - u32 max_stack, bool crosstask, bool add_mark) 221 + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) 222 222 { 223 223 struct perf_callchain_entry *entry; 224 224 struct perf_callchain_entry_ctx ctx; ··· 249 249 if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) 250 250 goto exit_put; 251 251 regs = task_pt_regs(current); 252 + } 253 + 254 + if (defer_cookie) { 255 + /* 256 + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED 257 + * which can be stitched to this one, and add 258 + * the cookie after it (it will be cut off when the 259 + * user stack is copied to the callchain). 260 + */ 261 + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); 262 + perf_callchain_store_context(&ctx, defer_cookie); 263 + goto exit_put; 252 264 } 253 265 254 266 if (add_mark)

+76 -2

kernel/events/core.c

··· 56 56 #include <linux/buildid.h> 57 57 #include <linux/task_work.h> 58 58 #include <linux/percpu-rwsem.h> 59 + #include <linux/unwind_deferred.h> 59 60 60 61 #include "internal.h" 61 62 ··· 8201 8200 8202 8201 static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; 8203 8202 8203 + static struct unwind_work perf_unwind_work; 8204 + 8204 8205 struct perf_callchain_entry * 8205 8206 perf_callchain(struct perf_event *event, struct pt_regs *regs) 8206 8207 { ··· 8211 8208 !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); 8212 8209 /* Disallow cross-task user callchains. */ 8213 8210 bool crosstask = event->ctx->task && event->ctx->task != current; 8211 + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && 8212 + event->attr.defer_callchain; 8214 8213 const u32 max_stack = event->attr.sample_max_stack; 8215 8214 struct perf_callchain_entry *callchain; 8215 + u64 defer_cookie; 8216 8216 8217 8217 if (!current->mm) 8218 8218 user = false; ··· 8223 8217 if (!kernel && !user) 8224 8218 return &__empty_callchain; 8225 8219 8226 - callchain = get_perf_callchain(regs, kernel, user, 8227 - max_stack, crosstask, true); 8220 + if (!(user && defer_user && !crosstask && 8221 + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) 8222 + defer_cookie = 0; 8223 + 8224 + callchain = get_perf_callchain(regs, kernel, user, max_stack, 8225 + crosstask, true, defer_cookie); 8226 + 8228 8227 return callchain ?: &__empty_callchain; 8229 8228 } 8230 8229 ··· 10012 10001 10013 10002 memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); 10014 10003 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); 10004 + } 10005 + 10006 + struct perf_callchain_deferred_event { 10007 + struct unwind_stacktrace *trace; 10008 + struct { 10009 + struct perf_event_header header; 10010 + u64 cookie; 10011 + u64 nr; 10012 + u64 ips[]; 10013 + } event; 10014 + }; 10015 + 10016 + static void perf_callchain_deferred_output(struct perf_event *event, void *data) 10017 + { 
10018 + struct perf_callchain_deferred_event *deferred_event = data; 10019 + struct perf_output_handle handle; 10020 + struct perf_sample_data sample; 10021 + int ret, size = deferred_event->event.header.size; 10022 + 10023 + if (!event->attr.defer_output) 10024 + return; 10025 + 10026 + /* XXX do we really need sample_id_all for this ??? */ 10027 + perf_event_header__init_id(&deferred_event->event.header, &sample, event); 10028 + 10029 + ret = perf_output_begin(&handle, &sample, event, 10030 + deferred_event->event.header.size); 10031 + if (ret) 10032 + goto out; 10033 + 10034 + perf_output_put(&handle, deferred_event->event); 10035 + for (int i = 0; i < deferred_event->trace->nr; i++) { 10036 + u64 entry = deferred_event->trace->entries[i]; 10037 + perf_output_put(&handle, entry); 10038 + } 10039 + perf_event__output_id_sample(event, &handle, &sample); 10040 + 10041 + perf_output_end(&handle); 10042 + out: 10043 + deferred_event->event.header.size = size; 10044 + } 10045 + 10046 + static void perf_unwind_deferred_callback(struct unwind_work *work, 10047 + struct unwind_stacktrace *trace, u64 cookie) 10048 + { 10049 + struct perf_callchain_deferred_event deferred_event = { 10050 + .trace = trace, 10051 + .event = { 10052 + .header = { 10053 + .type = PERF_RECORD_CALLCHAIN_DEFERRED, 10054 + .misc = PERF_RECORD_MISC_USER, 10055 + .size = sizeof(deferred_event.event) + 10056 + (trace->nr * sizeof(u64)), 10057 + }, 10058 + .cookie = cookie, 10059 + .nr = trace->nr, 10060 + }, 10061 + }; 10062 + 10063 + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); 10015 10064 } 10016 10065 10017 10066 struct perf_text_poke_event { ··· 14879 14808 int ret; 14880 14809 14881 14810 idr_init(&pmu_idr); 14811 + 14812 + unwind_deferred_init(&perf_unwind_work, 14813 + perf_unwind_deferred_callback); 14882 14814 14883 14815 perf_event_init_all_cpus(); 14884 14816 init_srcu_struct(&pmus_srcu);

+6 -1

kernel/exit.c

··· 940 940 941 941 tsk->exit_code = code; 942 942 taskstats_exit(tsk, group_dead); 943 - unwind_deferred_task_exit(tsk); 944 943 trace_sched_process_exit(tsk, group_dead); 945 944 946 945 /* ··· 950 951 * gets woken up by child-exit notifications. 951 952 */ 952 953 perf_event_exit_task(tsk); 954 + /* 955 + * PF_EXITING (above) ensures unwind_deferred_request() will no 956 + * longer add new unwinds. While exit_mm() (below) will destroy the 957 + * abaility to do unwinds. So flush any pending unwinds here. 958 + */ 959 + unwind_deferred_task_exit(tsk); 953 960 954 961 exit_mm(); 955 962

+7 -1

kernel/task_work.c

··· 9 9 #ifdef CONFIG_IRQ_WORK 10 10 static void task_work_set_notify_irq(struct irq_work *entry) 11 11 { 12 - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 12 + /* 13 + * no-op IPI 14 + * 15 + * TWA_NMI_CURRENT will already have set the TIF flag, all 16 + * this interrupt does it tickle the return-to-user path. 17 + */ 13 18 } 14 19 static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = 15 20 IRQ_WORK_INIT_HARD(task_work_set_notify_irq); ··· 91 86 break; 92 87 #ifdef CONFIG_IRQ_WORK 93 88 case TWA_NMI_CURRENT: 89 + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 94 90 irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); 95 91 break; 96 92 #endif

+24 -20

kernel/unwind/deferred.c

··· 53 53 54 54 static inline bool unwind_pending(struct unwind_task_info *info) 55 55 { 56 - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); 56 + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; 57 57 } 58 58 59 59 /* ··· 78 78 static u64 get_cookie(struct unwind_task_info *info) 79 79 { 80 80 u32 cnt = 1; 81 + 82 + lockdep_assert_irqs_disabled(); 81 83 82 84 if (info->id.cpu) 83 85 return info->id.id; ··· 128 126 129 127 cache = info->cache; 130 128 trace->entries = cache->entries; 131 - 132 - if (cache->nr_entries) { 133 - /* 134 - * The user stack has already been previously unwound in this 135 - * entry context. Skip the unwind and use the cache. 136 - */ 137 - trace->nr = cache->nr_entries; 129 + trace->nr = cache->nr_entries; 130 + /* 131 + * The user stack has already been previously unwound in this 132 + * entry context. Skip the unwind and use the cache. 133 + */ 134 + if (trace->nr) 138 135 return 0; 139 - } 140 136 141 - trace->nr = 0; 142 137 unwind_user(trace, UNWIND_MAX_ENTRIES); 143 138 144 139 cache->nr_entries = trace->nr; 145 140 146 141 /* Clear nr_entries on way back to user space */ 147 - set_bit(UNWIND_USED_BIT, &info->unwind_mask); 142 + atomic_long_or(UNWIND_USED, &info->unwind_mask); 148 143 149 144 return 0; 150 145 } ··· 159 160 160 161 /* Clear pending bit but make sure to have the current bits */ 161 162 bits = atomic_long_fetch_andnot(UNWIND_PENDING, 162 - (atomic_long_t *)&info->unwind_mask); 163 + &info->unwind_mask); 163 164 /* 164 165 * From here on out, the callback must always be called, even if it's 165 166 * just an empty trace. 
··· 230 231 int unwind_deferred_request(struct unwind_work *work, u64 *cookie) 231 232 { 232 233 struct unwind_task_info *info = &current->unwind_info; 234 + int twa_mode = TWA_RESUME; 233 235 unsigned long old, bits; 234 236 unsigned long bit; 235 237 int ret; ··· 246 246 * Trigger a warning to make it obvious that an architecture 247 247 * is using this in NMI when it should not be. 248 248 */ 249 - if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) 250 - return -EINVAL; 249 + if (in_nmi()) { 250 + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) 251 + return -EINVAL; 252 + twa_mode = TWA_NMI_CURRENT; 253 + } 251 254 252 255 /* Do not allow cancelled works to request again */ 253 256 bit = READ_ONCE(work->bit); ··· 264 261 265 262 *cookie = get_cookie(info); 266 263 267 - old = READ_ONCE(info->unwind_mask); 264 + old = atomic_long_read(&info->unwind_mask); 268 265 269 266 /* Is this already queued or executed */ 270 267 if (old & bit) ··· 277 274 * to have a callback. 278 275 */ 279 276 bits = UNWIND_PENDING | bit; 280 - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); 277 + old = atomic_long_fetch_or(bits, &info->unwind_mask); 281 278 if (old & bits) { 282 279 /* 283 280 * If the work's bit was set, whatever set it had better ··· 288 285 } 289 286 290 287 /* The work has been claimed, now schedule it. 
*/ 291 - ret = task_work_add(current, &info->work, TWA_RESUME); 288 + ret = task_work_add(current, &info->work, twa_mode); 292 289 293 290 if (WARN_ON_ONCE(ret)) 294 - WRITE_ONCE(info->unwind_mask, 0); 291 + atomic_long_set(&info->unwind_mask, 0); 295 292 296 293 return ret; 297 294 } ··· 323 320 guard(rcu)(); 324 321 /* Clear this bit from all threads */ 325 322 for_each_process_thread(g, t) { 326 - clear_bit(bit, &t->unwind_info.unwind_mask); 323 + atomic_long_andnot(BIT(bit), 324 + &t->unwind_info.unwind_mask); 327 325 if (t->unwind_info.cache) 328 326 clear_bit(bit, &t->unwind_info.cache->unwind_completed); 329 327 } ··· 354 350 355 351 memset(info, 0, sizeof(*info)); 356 352 init_task_work(&info->work, unwind_deferred_task_work); 357 - info->unwind_mask = 0; 353 + atomic_long_set(&info->unwind_mask, 0); 358 354 } 359 355 360 356 void unwind_task_free(struct task_struct *task)

+48 -11

kernel/unwind/user.c

··· 8 8 #include <linux/unwind_user.h> 9 9 #include <linux/uaccess.h> 10 10 11 - static const struct unwind_user_frame fp_frame = { 12 - ARCH_INIT_USER_FP_FRAME 13 - }; 14 - 15 11 #define for_each_user_frame(state) \ 16 12 for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) 17 13 18 - static int unwind_user_next_fp(struct unwind_user_state *state) 14 + static inline int 15 + get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) 19 16 { 20 - const struct unwind_user_frame *frame = &fp_frame; 17 + unsigned long __user *addr = (void __user *)base + off; 18 + #ifdef CONFIG_COMPAT 19 + if (ws == sizeof(int)) { 20 + unsigned int data; 21 + int ret = get_user(data, (unsigned int __user *)addr); 22 + *word = data; 23 + return ret; 24 + } 25 + #endif 26 + return get_user(*word, addr); 27 + } 28 + 29 + static int unwind_user_next_common(struct unwind_user_state *state, 30 + const struct unwind_user_frame *frame) 31 + { 21 32 unsigned long cfa, fp, ra; 22 - unsigned int shift; 23 33 24 34 if (frame->use_fp) { 25 35 if (state->fp < state->sp) ··· 47 37 return -EINVAL; 48 38 49 39 /* Make sure that the address is word aligned */ 50 - shift = sizeof(long) == 4 ? 
2 : 3; 51 - if (cfa & ((1 << shift) - 1)) 40 + if (cfa & (state->ws - 1)) 52 41 return -EINVAL; 53 42 54 43 /* Find the Return Address (RA) */ 55 - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) 44 + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) 56 45 return -EINVAL; 57 46 58 - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) 47 + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) 59 48 return -EINVAL; 60 49 61 50 state->ip = ra; 62 51 state->sp = cfa; 63 52 if (frame->fp_off) 64 53 state->fp = fp; 54 + state->topmost = false; 65 55 return 0; 56 + } 57 + 58 + static int unwind_user_next_fp(struct unwind_user_state *state) 59 + { 60 + #ifdef CONFIG_HAVE_UNWIND_USER_FP 61 + struct pt_regs *regs = task_pt_regs(current); 62 + 63 + if (state->topmost && unwind_user_at_function_start(regs)) { 64 + const struct unwind_user_frame fp_entry_frame = { 65 + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) 66 + }; 67 + return unwind_user_next_common(state, &fp_entry_frame); 68 + } 69 + 70 + const struct unwind_user_frame fp_frame = { 71 + ARCH_INIT_USER_FP_FRAME(state->ws) 72 + }; 73 + return unwind_user_next_common(state, &fp_frame); 74 + #else 75 + return -EINVAL; 76 + #endif 66 77 } 67 78 68 79 static int unwind_user_next(struct unwind_user_state *state) ··· 133 102 state->ip = instruction_pointer(regs); 134 103 state->sp = user_stack_pointer(regs); 135 104 state->fp = frame_pointer(regs); 105 + state->ws = unwind_user_word_size(regs); 106 + if (!state->ws) { 107 + state->done = true; 108 + return -EINVAL; 109 + } 110 + state->topmost = true; 136 111 137 112 return 0; 138 113 }

+2 -3

tools/arch/x86/include/asm/insn.h

··· 312 312 /** 313 313 * for_each_insn_prefix() -- Iterate prefixes in the instruction 314 314 * @insn: Pointer to struct insn. 315 - * @idx: Index storage. 316 315 * @prefix: Prefix byte. 317 316 * 318 317 * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix ··· 320 321 * Since prefixes.nbytes can be bigger than 4 if some prefixes 321 322 * are repeated, it cannot be used for looping over the prefixes. 322 323 */ 323 - #define for_each_insn_prefix(insn, idx, prefix) \ 324 - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) 324 + #define for_each_insn_prefix(insn, prefix) \ 325 + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) 325 326 326 327 #define POP_SS_OPCODE 0x1f 327 328 #define MOV_SREG_OPCODE 0x8e

+20 -1

tools/include/uapi/linux/perf_event.h

··· 463 463 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 464 remove_on_exec : 1, /* event is removed from task on exec */ 465 465 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 - __reserved_1 : 26; 466 + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ 467 + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ 468 + __reserved_1 : 24; 467 469 468 470 union { 469 471 __u32 wakeup_events; /* wake up every n events */ ··· 1241 1239 */ 1242 1240 PERF_RECORD_AUX_OUTPUT_HW_ID = 21, 1243 1241 1242 + /* 1243 + * This user callchain capture was deferred until shortly before 1244 + * returning to user space. Previous samples would have kernel 1245 + * callchains only and they need to be stitched with this to make full 1246 + * callchains. 1247 + * 1248 + * struct { 1249 + * struct perf_event_header header; 1250 + * u64 cookie; 1251 + * u64 nr; 1252 + * u64 ips[nr]; 1253 + * struct sample_id sample_id; 1254 + * }; 1255 + */ 1256 + PERF_RECORD_CALLCHAIN_DEFERRED = 22, 1257 + 1244 1258 PERF_RECORD_MAX, /* non-ABI */ 1245 1259 }; 1246 1260 ··· 1287 1269 PERF_CONTEXT_HV = (__u64)-32, 1288 1270 PERF_CONTEXT_KERNEL = (__u64)-128, 1289 1271 PERF_CONTEXT_USER = (__u64)-512, 1272 + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, 1290 1273 1291 1274 PERF_CONTEXT_GUEST = (__u64)-2048, 1292 1275 PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,

Configure Feed

Configure Feed