Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Mostly tooling fixes, plus two uncore-PMU fixes, an uprobes fix, a
perf-cgroups fix and an AUX events fix"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/intel/uncore: Add enable_box for client MSR uncore
perf/x86/intel/uncore: Fix uncore num_counters
uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions
perf/core: Set cgroup in CPU contexts for new cgroup events
perf/core: Fix sideband list-iteration vs. event ordering NULL pointer dereference crash
perf probe ppc64le: Fix probe location when using DWARF
perf probe: Add function to post process kernel trace events
tools: Sync cpufeatures headers with the kernel
tools: Sync tools/include/uapi/linux/bpf.h with the kernel
tools: Sync cpufeatures.h and vmx.h with the kernel
perf probe: Support signedness casting
perf stat: Avoid skew when reading events
perf probe: Fix module name matching
perf probe: Adjust map->reloc offset when finding kernel symbol from map
perf hists: Trim libtraceevent trace_seq buffers
perf script: Add 'bpf-output' field to usage message

Linus Torvalds 10 years ago ad83242a 1f8083c6

+298 -93

19 changed files

expand all collapse all

arch

x86

events

intel

uncore_snb.c

uncore_snbep.c

kernel

uprobes.c

include

linux

perf_event.h

kernel

events

core.c

tools

arch

x86

include

asm

cpufeatures.h

disabled-features.h

required-features.h

uapi

asm

vmx.h

include

uapi

linux

bpf.h

perf

Documentation

perf-probe.txt

perf-script.txt

arch

powerpc

util

sym-handling.c

builtin-script.c

builtin-stat.c

util

probe-event.c

probe-event.h

probe-finder.c

sort.c

+14

arch/x86/events/intel/uncore_snb.c

reviewed

··· 100 100 } 101 101 } 102 102 103 103 + static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) 104 104 + { 105 105 + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 106 106 + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); 107 107 + } 108 108 + 103 109 static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) 104 110 { 105 111 if (box->pmu->pmu_idx == 0) ··· 133 127 134 128 static struct intel_uncore_ops snb_uncore_msr_ops = { 135 129 .init_box = snb_uncore_msr_init_box, 130 130 + .enable_box = snb_uncore_msr_enable_box, 136 131 .exit_box = snb_uncore_msr_exit_box, 137 132 .disable_event = snb_uncore_msr_disable_event, 138 133 .enable_event = snb_uncore_msr_enable_event, ··· 199 192 } 200 193 } 201 194 195 195 + static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) 196 196 + { 197 197 + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, 198 198 + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); 199 199 + } 200 200 + 202 201 static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) 203 202 { 204 203 if (box->pmu->pmu_idx == 0) ··· 213 200 214 201 static struct intel_uncore_ops skl_uncore_msr_ops = { 215 202 .init_box = skl_uncore_msr_init_box, 203 203 + .enable_box = skl_uncore_msr_enable_box, 216 204 .exit_box = skl_uncore_msr_exit_box, 217 205 .disable_event = snb_uncore_msr_disable_event, 218 206 .enable_event = snb_uncore_msr_enable_event,

+5 -5

arch/x86/events/intel/uncore_snbep.c

reviewed

··· 2626 2626 2627 2627 static struct intel_uncore_type hswep_uncore_ha = { 2628 2628 .name = "ha", 2629 2629 - .num_counters = 5, 2629 2629 + .num_counters = 4, 2630 2630 .num_boxes = 2, 2631 2631 .perf_ctr_bits = 48, 2632 2632 SNBEP_UNCORE_PCI_COMMON_INIT(), ··· 2645 2645 2646 2646 static struct intel_uncore_type hswep_uncore_imc = { 2647 2647 .name = "imc", 2648 2648 - .num_counters = 5, 2648 2648 + .num_counters = 4, 2649 2649 .num_boxes = 8, 2650 2650 .perf_ctr_bits = 48, 2651 2651 .fixed_ctr_bits = 48, ··· 2691 2691 2692 2692 static struct intel_uncore_type hswep_uncore_qpi = { 2693 2693 .name = "qpi", 2694 2694 - .num_counters = 5, 2694 2694 + .num_counters = 4, 2695 2695 .num_boxes = 3, 2696 2696 .perf_ctr_bits = 48, 2697 2697 .perf_ctr = SNBEP_PCI_PMON_CTR0, ··· 2773 2773 2774 2774 static struct intel_uncore_type hswep_uncore_r3qpi = { 2775 2775 .name = "r3qpi", 2776 2776 - .num_counters = 4, 2776 2776 + .num_counters = 3, 2777 2777 .num_boxes = 3, 2778 2778 .perf_ctr_bits = 44, 2779 2779 .constraints = hswep_uncore_r3qpi_constraints, ··· 2972 2972 2973 2973 static struct intel_uncore_type bdx_uncore_imc = { 2974 2974 .name = "imc", 2975 2975 - .num_counters = 5, 2975 2975 + .num_counters = 4, 2976 2976 .num_boxes = 8, 2977 2977 .perf_ctr_bits = 48, 2978 2978 .fixed_ctr_bits = 48,

+11 -11

arch/x86/kernel/uprobes.c

reviewed

··· 357 357 *cursor &= 0xfe; 358 358 } 359 359 /* 360 360 - * Similar treatment for VEX3 prefix. 361 361 - * TODO: add XOP/EVEX treatment when insn decoder supports them 360 360 + * Similar treatment for VEX3/EVEX prefix. 361 361 + * TODO: add XOP treatment when insn decoder supports them 362 362 */ 363 363 - if (insn->vex_prefix.nbytes == 3) { 363 363 + if (insn->vex_prefix.nbytes >= 3) { 364 364 /* 365 365 * vex2: c5 rvvvvLpp (has no b bit) 366 366 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp 367 367 * evex: 62 rxbR00mm wvvvv1pp zllBVaaa 368 368 - * (evex will need setting of both b and x since 369 369 - * in non-sib encoding evex.x is 4th bit of MODRM.rm) 370 370 - * Setting VEX3.b (setting because it has inverted meaning): 368 368 + * Setting VEX3.b (setting because it has inverted meaning). 369 369 + * Setting EVEX.x since (in non-SIB encoding) EVEX.x 370 370 + * is the 4th bit of MODRM.rm, and needs the same treatment. 371 371 + * For VEX3-encoded insns, VEX3.x value has no effect in 372 372 + * non-SIB encoding, the change is superfluous but harmless. 371 373 */ 372 374 cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; 373 373 - *cursor |= 0x20; 375 375 + *cursor |= 0x60; 374 376 } 375 377 376 378 /* ··· 417 415 418 416 reg = MODRM_REG(insn); /* Fetch modrm.reg */ 419 417 reg2 = 0xff; /* Fetch vex.vvvv */ 420 420 - if (insn->vex_prefix.nbytes == 2) 421 421 - reg2 = insn->vex_prefix.bytes[1]; 422 422 - else if (insn->vex_prefix.nbytes == 3) 418 418 + if (insn->vex_prefix.nbytes) 423 419 reg2 = insn->vex_prefix.bytes[2]; 424 420 /* 425 425 - * TODO: add XOP, EXEV vvvv reading. 421 421 + * TODO: add XOP vvvv reading. 426 422 * 427 423 * vex.vvvv field is in bits 6-3, bits are inverted. 428 424 * But in 32-bit mode, high-order bit may be ignored.

include/linux/perf_event.h

reviewed

··· 743 743 u64 parent_gen; 744 744 u64 generation; 745 745 int pin_count; 746 746 + #ifdef CONFIG_CGROUP_PERF 746 747 int nr_cgroups; /* cgroup evts */ 748 748 + #endif 747 749 void *task_ctx_data; /* pmu specific data */ 748 750 struct rcu_head rcu_head; 749 751 }; ··· 771 769 unsigned int hrtimer_active; 772 770 773 771 struct pmu *unique_pmu; 772 772 + #ifdef CONFIG_CGROUP_PERF 774 773 struct perf_cgroup *cgrp; 774 774 + #endif 775 775 }; 776 776 777 777 struct perf_output_handle {

+54 -23

kernel/events/core.c

reviewed

··· 843 843 } 844 844 } 845 845 } 846 846 + 847 847 + /* 848 848 + * Update cpuctx->cgrp so that it is set when first cgroup event is added and 849 849 + * cleared when last cgroup event is removed. 850 850 + */ 851 851 + static inline void 852 852 + list_update_cgroup_event(struct perf_event *event, 853 853 + struct perf_event_context *ctx, bool add) 854 854 + { 855 855 + struct perf_cpu_context *cpuctx; 856 856 + 857 857 + if (!is_cgroup_event(event)) 858 858 + return; 859 859 + 860 860 + if (add && ctx->nr_cgroups++) 861 861 + return; 862 862 + else if (!add && --ctx->nr_cgroups) 863 863 + return; 864 864 + /* 865 865 + * Because cgroup events are always per-cpu events, 866 866 + * this will always be called from the right CPU. 867 867 + */ 868 868 + cpuctx = __get_cpu_context(ctx); 869 869 + cpuctx->cgrp = add ? event->cgrp : NULL; 870 870 + } 871 871 + 846 872 #else /* !CONFIG_CGROUP_PERF */ 847 873 848 874 static inline bool ··· 946 920 struct perf_event_context *ctx) 947 921 { 948 922 } 923 923 + 924 924 + static inline void 925 925 + list_update_cgroup_event(struct perf_event *event, 926 926 + struct perf_event_context *ctx, bool add) 927 927 + { 928 928 + } 929 929 + 949 930 #endif 950 931 951 932 /* ··· 1425 1392 static void 1426 1393 list_add_event(struct perf_event *event, struct perf_event_context *ctx) 1427 1394 { 1395 1395 + 1428 1396 lockdep_assert_held(&ctx->lock); 1429 1397 1430 1398 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); ··· 1446 1412 list_add_tail(&event->group_entry, list); 1447 1413 } 1448 1414 1449 1449 - if (is_cgroup_event(event)) 1450 1450 - ctx->nr_cgroups++; 1415 1415 + list_update_cgroup_event(event, ctx, true); 1451 1416 1452 1417 list_add_rcu(&event->event_entry, &ctx->event_list); 1453 1418 ctx->nr_events++; ··· 1614 1581 static void 1615 1582 list_del_event(struct perf_event *event, struct perf_event_context *ctx) 1616 1583 { 1617 1617 - struct perf_cpu_context *cpuctx; 1618 1618 - 1619 1584 WARN_ON_ONCE(event->ctx 
!= ctx); 1620 1585 lockdep_assert_held(&ctx->lock); 1621 1586 ··· 1625 1594 1626 1595 event->attach_state &= ~PERF_ATTACH_CONTEXT; 1627 1596 1628 1628 - if (is_cgroup_event(event)) { 1629 1629 - ctx->nr_cgroups--; 1630 1630 - /* 1631 1631 - * Because cgroup events are always per-cpu events, this will 1632 1632 - * always be called from the right CPU. 1633 1633 - */ 1634 1634 - cpuctx = __get_cpu_context(ctx); 1635 1635 - /* 1636 1636 - * If there are no more cgroup events then clear cgrp to avoid 1637 1637 - * stale pointer in update_cgrp_time_from_cpuctx(). 1638 1638 - */ 1639 1639 - if (!ctx->nr_cgroups) 1640 1640 - cpuctx->cgrp = NULL; 1641 1641 - } 1597 1597 + list_update_cgroup_event(event, ctx, false); 1642 1598 1643 1599 ctx->nr_events--; 1644 1600 if (event->attr.inherit_stat) ··· 1734 1716 static inline int 1735 1717 event_filter_match(struct perf_event *event) 1736 1718 { 1737 1737 - return (event->cpu == -1 || event->cpu == smp_processor_id()) 1738 1738 - && perf_cgroup_match(event) && pmu_filter_match(event); 1719 1719 + return (event->cpu == -1 || event->cpu == smp_processor_id()) && 1720 1720 + perf_cgroup_match(event) && pmu_filter_match(event); 1739 1721 } 1740 1722 1741 1723 static void ··· 1755 1737 * maintained, otherwise bogus information is return 1756 1738 * via read() for time_enabled, time_running: 1757 1739 */ 1758 1758 - if (event->state == PERF_EVENT_STATE_INACTIVE 1759 1759 - && !event_filter_match(event)) { 1740 1740 + if (event->state == PERF_EVENT_STATE_INACTIVE && 1741 1741 + !event_filter_match(event)) { 1760 1742 delta = tstamp - event->tstamp_stopped; 1761 1743 event->tstamp_running += delta; 1762 1744 event->tstamp_stopped = tstamp; ··· 2254 2236 2255 2237 lockdep_assert_held(&ctx->mutex); 2256 2238 2257 2257 - event->ctx = ctx; 2258 2239 if (event->cpu != -1) 2259 2240 event->cpu = cpu; 2241 2241 + 2242 2242 + /* 2243 2243 + * Ensures that if we can observe event->ctx, both the event and ctx 2244 2244 + * will be 'complete'. 
See perf_iterate_sb_cpu(). 2245 2245 + */ 2246 2246 + smp_store_release(&event->ctx, ctx); 2260 2247 2261 2248 if (!task) { 2262 2249 cpu_function_call(cpu, __perf_install_in_context, event); ··· 5992 5969 struct perf_event *event; 5993 5970 5994 5971 list_for_each_entry_rcu(event, &pel->list, sb_list) { 5972 5972 + /* 5973 5973 + * Skip events that are not fully formed yet; ensure that 5974 5974 + * if we observe event->ctx, both event and ctx will be 5975 5975 + * complete enough. See perf_install_in_context(). 5976 5976 + */ 5977 5977 + if (!smp_load_acquire(&event->ctx)) 5978 5978 + continue; 5979 5979 + 5995 5980 if (event->state < PERF_EVENT_STATE_INACTIVE) 5996 5981 continue; 5997 5982 if (!event_filter_match(event))

+3 -6

tools/arch/x86/include/asm/cpufeatures.h

reviewed

··· 225 225 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ 226 226 #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ 227 227 #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ 228 228 - #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ 229 228 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ 230 229 #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ 231 230 #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ ··· 300 301 #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ 301 302 #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ 302 303 #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ 303 303 - #define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ 304 304 - #define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ 305 305 - 306 306 - 307 304 #ifdef CONFIG_X86_32 308 305 /* 309 306 * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional ··· 307 312 */ 308 313 #define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ 309 314 #endif 310 310 - 315 315 + #define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ 316 316 + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ 317 317 + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ 311 318 #endif /* _ASM_X86_CPUFEATURES_H */

tools/arch/x86/include/asm/disabled-features.h

reviewed

··· 56 56 #define DISABLED_MASK14 0 57 57 #define DISABLED_MASK15 0 58 58 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) 59 59 + #define DISABLED_MASK17 0 60 60 + #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) 59 61 60 62 #endif /* _ASM_X86_DISABLED_FEATURES_H */

tools/arch/x86/include/asm/required-features.h

reviewed

··· 99 99 #define REQUIRED_MASK14 0 100 100 #define REQUIRED_MASK15 0 101 101 #define REQUIRED_MASK16 0 102 102 + #define REQUIRED_MASK17 0 103 103 + #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) 102 104 103 105 #endif /* _ASM_X86_REQUIRED_FEATURES_H */

+1 -3

tools/arch/x86/include/uapi/asm/vmx.h

reviewed

··· 78 78 #define EXIT_REASON_PML_FULL 62 79 79 #define EXIT_REASON_XSAVES 63 80 80 #define EXIT_REASON_XRSTORS 64 81 81 - #define EXIT_REASON_PCOMMIT 65 82 81 83 82 #define VMX_EXIT_REASONS \ 84 83 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ ··· 126 127 { EXIT_REASON_INVVPID, "INVVPID" }, \ 127 128 { EXIT_REASON_INVPCID, "INVPCID" }, \ 128 129 { EXIT_REASON_XSAVES, "XSAVES" }, \ 129 129 - { EXIT_REASON_XRSTORS, "XRSTORS" }, \ 130 130 - { EXIT_REASON_PCOMMIT, "PCOMMIT" } 130 130 + { EXIT_REASON_XRSTORS, "XRSTORS" } 131 131 132 132 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 133 133 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4

+85 -1

tools/include/uapi/linux/bpf.h

reviewed

··· 84 84 BPF_MAP_TYPE_PERCPU_HASH, 85 85 BPF_MAP_TYPE_PERCPU_ARRAY, 86 86 BPF_MAP_TYPE_STACK_TRACE, 87 87 + BPF_MAP_TYPE_CGROUP_ARRAY, 87 88 }; 88 89 89 90 enum bpf_prog_type { ··· 94 93 BPF_PROG_TYPE_SCHED_CLS, 95 94 BPF_PROG_TYPE_SCHED_ACT, 96 95 BPF_PROG_TYPE_TRACEPOINT, 96 96 + BPF_PROG_TYPE_XDP, 97 97 }; 98 98 99 99 #define BPF_PSEUDO_MAP_FD 1 ··· 315 313 */ 316 314 BPF_FUNC_skb_get_tunnel_opt, 317 315 BPF_FUNC_skb_set_tunnel_opt, 316 316 + 317 317 + /** 318 318 + * bpf_skb_change_proto(skb, proto, flags) 319 319 + * Change protocol of the skb. Currently supported is 320 320 + * v4 -> v6, v6 -> v4 transitions. The helper will also 321 321 + * resize the skb. eBPF program is expected to fill the 322 322 + * new headers via skb_store_bytes and lX_csum_replace. 323 323 + * @skb: pointer to skb 324 324 + * @proto: new skb->protocol type 325 325 + * @flags: reserved 326 326 + * Return: 0 on success or negative error 327 327 + */ 328 328 + BPF_FUNC_skb_change_proto, 329 329 + 330 330 + /** 331 331 + * bpf_skb_change_type(skb, type) 332 332 + * Change packet type of skb. 333 333 + * @skb: pointer to skb 334 334 + * @type: new skb->pkt_type type 335 335 + * Return: 0 on success or negative error 336 336 + */ 337 337 + BPF_FUNC_skb_change_type, 338 338 + 339 339 + /** 340 340 + * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb 341 341 + * @skb: pointer to skb 342 342 + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type 343 343 + * @index: index of the cgroup in the bpf_map 344 344 + * Return: 345 345 + * == 0 skb failed the cgroup2 descendant test 346 346 + * == 1 skb succeeded the cgroup2 descendant test 347 347 + * < 0 error 348 348 + */ 349 349 + BPF_FUNC_skb_in_cgroup, 350 350 + 351 351 + /** 352 352 + * bpf_get_hash_recalc(skb) 353 353 + * Retrieve and possibly recalculate skb->hash. 
354 354 + * @skb: pointer to skb 355 355 + * Return: hash 356 356 + */ 357 357 + BPF_FUNC_get_hash_recalc, 358 358 + 359 359 + /** 360 360 + * u64 bpf_get_current_task(void) 361 361 + * Returns current task_struct 362 362 + * Return: current 363 363 + */ 364 364 + BPF_FUNC_get_current_task, 365 365 + 366 366 + /** 367 367 + * bpf_probe_write_user(void *dst, void *src, int len) 368 368 + * safely attempt to write to a location 369 369 + * @dst: destination address in userspace 370 370 + * @src: source address on stack 371 371 + * @len: number of bytes to copy 372 372 + * Return: 0 on success or negative error 373 373 + */ 374 374 + BPF_FUNC_probe_write_user, 375 375 + 318 376 __BPF_FUNC_MAX_ID, 319 377 }; 320 378 ··· 409 347 #define BPF_F_ZERO_CSUM_TX (1ULL << 1) 410 348 #define BPF_F_DONT_FRAGMENT (1ULL << 2) 411 349 412 412 - /* BPF_FUNC_perf_event_output flags. */ 350 350 + /* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ 413 351 #define BPF_F_INDEX_MASK 0xffffffffULL 414 352 #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK 353 353 + /* BPF_FUNC_perf_event_output for sk_buff input context. */ 354 354 + #define BPF_F_CTXLEN_MASK (0xfffffULL << 32) 415 355 416 356 /* user accessible mirror of in-kernel sk_buff. 417 357 * new fields can only be added to the end of this structure ··· 448 384 __u8 tunnel_ttl; 449 385 __u16 tunnel_ext; 450 386 __u32 tunnel_label; 387 387 + }; 388 388 + 389 389 + /* User return codes for XDP prog type. 390 390 + * A valid XDP program must return one of these defined values. All other 391 391 + * return codes are reserved for future use. Unknown return codes will result 392 392 + * in packet drop. 
393 393 + */ 394 394 + enum xdp_action { 395 395 + XDP_ABORTED = 0, 396 396 + XDP_DROP, 397 397 + XDP_PASS, 398 398 + XDP_TX, 399 399 + }; 400 400 + 401 401 + /* user accessible metadata for XDP packet hook 402 402 + * new fields must be added to the end of this structure 403 403 + */ 404 404 + struct xdp_md { 405 405 + __u32 data; 406 406 + __u32 data_end; 451 407 }; 452 408 453 409 #endif /* _UAPI__LINUX_BPF_H__ */

+9 -1

tools/perf/Documentation/perf-probe.txt

reviewed

··· 176 176 177 177 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.) 178 178 '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters. 179 179 - 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type. 179 179 + 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail) 180 180 181 181 On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid. 182 182 + 183 183 + TYPES 184 184 + ----- 185 185 + Basic types (u8/u16/u32/u64/s8/s16/s32/s64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe. 186 186 + String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. 
You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type. 187 187 + Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is; 188 188 + 189 189 + b<bit-width>@<bit-offset>/<container-size> 182 190 183 191 LINE SYNTAX 184 192 -----------

+2 -2

tools/perf/Documentation/perf-script.txt

reviewed

··· 116 116 --fields:: 117 117 Comma separated list of fields to print. Options are: 118 118 comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, 119 119 - srcline, period, iregs, brstack, brstacksym, flags. 120 120 - Field list can be prepended with the type, trace, sw or hw, 119 119 + srcline, period, iregs, brstack, brstacksym, flags, bpf-output, 120 120 + callindent. Field list can be prepended with the type, trace, sw or hw, 121 121 to indicate to which event type the field list applies. 122 122 e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace 123 123

+23 -4

tools/perf/arch/powerpc/util/sym-handling.c

reviewed

··· 54 54 #endif 55 55 56 56 #if defined(_CALL_ELF) && _CALL_ELF == 2 57 57 - bool arch__prefers_symtab(void) 58 58 - { 59 59 - return true; 60 60 - } 61 57 62 58 #ifdef HAVE_LIBELF_SUPPORT 63 59 void arch__sym_update(struct symbol *s, GElf_Sym *sym) ··· 96 100 tev->point.offset += lep_offset; 97 101 } 98 102 } 103 103 + 104 104 + void arch__post_process_probe_trace_events(struct perf_probe_event *pev, 105 105 + int ntevs) 106 106 + { 107 107 + struct probe_trace_event *tev; 108 108 + struct map *map; 109 109 + struct symbol *sym = NULL; 110 110 + struct rb_node *tmp; 111 111 + int i = 0; 112 112 + 113 113 + map = get_target_map(pev->target, pev->uprobes); 114 114 + if (!map || map__load(map, NULL) < 0) 115 115 + return; 116 116 + 117 117 + for (i = 0; i < ntevs; i++) { 118 118 + tev = &pev->tevs[i]; 119 119 + map__for_each_symbol(map, sym, tmp) { 120 120 + if (map->unmap_ip(map, sym->start) == tev->point.address) 121 121 + arch__fix_tev_from_maps(pev, tev, map, sym); 122 122 + } 123 123 + } 124 124 + } 125 125 + 99 126 #endif

+1 -1

tools/perf/builtin-script.c

reviewed

··· 2116 2116 "Valid types: hw,sw,trace,raw. " 2117 2117 "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," 2118 2118 "addr,symoff,period,iregs,brstack,brstacksym,flags," 2119 2119 - "callindent", parse_output_fields), 2119 2119 + "bpf-output,callindent", parse_output_fields), 2120 2120 OPT_BOOLEAN('a', "all-cpus", &system_wide, 2121 2121 "system-wide collection from all CPUs"), 2122 2122 OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",

+23 -8

tools/perf/builtin-stat.c

reviewed

··· 331 331 return 0; 332 332 } 333 333 334 334 - static void read_counters(bool close_counters) 334 334 + static void read_counters(void) 335 335 { 336 336 struct perf_evsel *counter; 337 337 ··· 341 341 342 342 if (perf_stat_process_counter(&stat_config, counter)) 343 343 pr_warning("failed to process counter %s\n", counter->name); 344 344 - 345 345 - if (close_counters) { 346 346 - perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 347 347 - thread_map__nr(evsel_list->threads)); 348 348 - } 349 344 } 350 345 } 351 346 ··· 348 353 { 349 354 struct timespec ts, rs; 350 355 351 351 - read_counters(false); 356 356 + read_counters(); 352 357 353 358 clock_gettime(CLOCK_MONOTONIC, &ts); 354 359 diff_timespec(&rs, &ts, &ref_time); ··· 373 378 */ 374 379 if (!target__none(&target) || initial_delay) 375 380 perf_evlist__enable(evsel_list); 381 381 + } 382 382 + 383 383 + static void disable_counters(void) 384 384 + { 385 385 + /* 386 386 + * If we don't have tracee (attaching to task or cpu), counters may 387 387 + * still be running. To get accurate group ratios, we must stop groups 388 388 + * from counting before reading their constituent counters. 389 389 + */ 390 390 + if (!target__none(&target)) 391 391 + perf_evlist__disable(evsel_list); 376 392 } 377 393 378 394 static volatile int workload_exec_errno; ··· 663 657 } 664 658 } 665 659 660 660 + disable_counters(); 661 661 + 666 662 t1 = rdclock(); 667 663 668 664 update_stats(&walltime_nsecs_stats, t1 - t0); 669 665 670 670 - read_counters(true); 666 666 + /* 667 667 + * Closing a group leader splits the group, and as we only disable 668 668 + * group leaders, results in remaining events becoming enabled. To 669 669 + * avoid arbitrary skew, we must read all counters before closing any 670 670 + * group leaders. 671 671 + */ 672 672 + read_counters(); 673 673 + perf_evlist__close(evsel_list); 671 674 672 675 return WEXITSTATUS(status); 673 676 }

+37 -23

tools/perf/util/probe-event.c

reviewed

··· 170 170 module = "kernel"; 171 171 172 172 for (pos = maps__first(maps); pos; pos = map__next(pos)) { 173 173 + /* short_name is "[module]" */ 173 174 if (strncmp(pos->dso->short_name + 1, module, 174 174 - pos->dso->short_name_len - 2) == 0) { 175 175 + pos->dso->short_name_len - 2) == 0 && 176 176 + module[pos->dso->short_name_len - 2] == '\0') { 175 177 return pos; 176 178 } 177 179 } 178 180 return NULL; 179 181 } 180 182 181 181 - static struct map *get_target_map(const char *target, bool user) 183 183 + struct map *get_target_map(const char *target, bool user) 182 184 { 183 185 /* Init maps of given executable or kernel */ 184 186 if (user) ··· 387 385 if (uprobes) 388 386 address = sym->start; 389 387 else 390 390 - address = map->unmap_ip(map, sym->start); 388 388 + address = map->unmap_ip(map, sym->start) - map->reloc; 391 389 break; 392 390 } 393 391 if (!address) { ··· 666 664 return ret; 667 665 } 668 666 669 669 - /* Post processing the probe events */ 670 670 - static int post_process_probe_trace_events(struct probe_trace_event *tevs, 671 671 - int ntevs, const char *module, 672 672 - bool uprobe) 667 667 + static int 668 668 + post_process_kernel_probe_trace_events(struct probe_trace_event *tevs, 669 669 + int ntevs) 673 670 { 674 671 struct ref_reloc_sym *reloc_sym; 675 672 char *tmp; 676 673 int i, skipped = 0; 677 677 - 678 678 - if (uprobe) 679 679 - return add_exec_to_probe_trace_events(tevs, ntevs, module); 680 680 - 681 681 - /* Note that currently ref_reloc_sym based probe is not for drivers */ 682 682 - if (module) 683 683 - return add_module_to_probe_trace_events(tevs, ntevs, module); 684 674 685 675 reloc_sym = kernel_get_ref_reloc_sym(); 686 676 if (!reloc_sym) { ··· 703 709 reloc_sym->unrelocated_addr; 704 710 } 705 711 return skipped; 712 712 + } 713 713 + 714 714 + void __weak 715 715 + arch__post_process_probe_trace_events(struct perf_probe_event *pev __maybe_unused, 716 716 + int ntevs __maybe_unused) 717 717 + { 718 718 + } 719 
719 + 720 720 + /* Post processing the probe events */ 721 721 + static int post_process_probe_trace_events(struct perf_probe_event *pev, 722 722 + struct probe_trace_event *tevs, 723 723 + int ntevs, const char *module, 724 724 + bool uprobe) 725 725 + { 726 726 + int ret; 727 727 + 728 728 + if (uprobe) 729 729 + ret = add_exec_to_probe_trace_events(tevs, ntevs, module); 730 730 + else if (module) 731 731 + /* Currently ref_reloc_sym based probe is not for drivers */ 732 732 + ret = add_module_to_probe_trace_events(tevs, ntevs, module); 733 733 + else 734 734 + ret = post_process_kernel_probe_trace_events(tevs, ntevs); 735 735 + 736 736 + if (ret >= 0) 737 737 + arch__post_process_probe_trace_events(pev, ntevs); 738 738 + 739 739 + return ret; 706 740 } 707 741 708 742 /* Try to find perf_probe_event with debuginfo */ ··· 771 749 772 750 if (ntevs > 0) { /* Succeeded to find trace events */ 773 751 pr_debug("Found %d probe_trace_events.\n", ntevs); 774 774 - ret = post_process_probe_trace_events(*tevs, ntevs, 752 752 + ret = post_process_probe_trace_events(pev, *tevs, ntevs, 775 753 pev->target, pev->uprobes); 776 754 if (ret < 0 || ret == ntevs) { 777 755 clear_probe_trace_events(*tevs, ntevs); ··· 2958 2936 return err; 2959 2937 } 2960 2938 2961 2961 - bool __weak arch__prefers_symtab(void) { return false; } 2962 2962 - 2963 2939 /* Concatinate two arrays */ 2964 2940 static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b) 2965 2941 { ··· 3177 3157 ret = find_probe_trace_events_from_cache(pev, tevs); 3178 3158 if (ret > 0 || pev->sdt) /* SDT can be found only in the cache */ 3179 3159 return ret == 0 ? 
-ENOENT : ret; /* Found in probe cache */ 3180 3180 - 3181 3181 - if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) { 3182 3182 - ret = find_probe_trace_events_from_map(pev, tevs); 3183 3183 - if (ret > 0) 3184 3184 - return ret; /* Found in symbol table */ 3185 3185 - } 3186 3160 3187 3161 /* Convert perf_probe_event with debuginfo */ 3188 3162 ret = try_to_find_probe_trace_events(pev, tevs);

+5 -1

tools/perf/util/probe-event.h

reviewed

··· 158 158 int show_available_vars(struct perf_probe_event *pevs, int npevs, 159 159 struct strfilter *filter); 160 160 int show_available_funcs(const char *module, struct strfilter *filter, bool user); 161 161 - bool arch__prefers_symtab(void); 162 161 void arch__fix_tev_from_maps(struct perf_probe_event *pev, 163 162 struct probe_trace_event *tev, struct map *map, 164 163 struct symbol *sym); ··· 171 172 172 173 int copy_to_probe_trace_arg(struct probe_trace_arg *tvar, 173 174 struct perf_probe_arg *pvar); 175 175 + 176 176 + struct map *get_target_map(const char *target, bool user); 177 177 + 178 178 + void arch__post_process_probe_trace_events(struct perf_probe_event *pev, 179 179 + int ntevs); 174 180 175 181 #endif /*_PROBE_EVENT_H */

+12 -3

tools/perf/util/probe-finder.c

reviewed

··· 297 297 char sbuf[STRERR_BUFSIZE]; 298 298 int bsize, boffs, total; 299 299 int ret; 300 300 + char sign; 300 301 301 302 /* TODO: check all types */ 302 302 - if (cast && strcmp(cast, "string") != 0) { 303 303 + if (cast && strcmp(cast, "string") != 0 && 304 304 + strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) { 303 305 /* Non string type is OK */ 306 306 + /* and respect signedness cast */ 304 307 tvar->type = strdup(cast); 305 308 return (tvar->type == NULL) ? -ENOMEM : 0; 306 309 } ··· 364 361 return (tvar->type == NULL) ? -ENOMEM : 0; 365 362 } 366 363 364 364 + if (cast && (strcmp(cast, "u") == 0)) 365 365 + sign = 'u'; 366 366 + else if (cast && (strcmp(cast, "s") == 0)) 367 367 + sign = 's'; 368 368 + else 369 369 + sign = die_is_signed_type(&type) ? 's' : 'u'; 370 370 + 367 371 ret = dwarf_bytesize(&type); 368 372 if (ret <= 0) 369 373 /* No size ... try to use default type */ ··· 383 373 dwarf_diename(&type), MAX_BASIC_TYPE_BITS); 384 374 ret = MAX_BASIC_TYPE_BITS; 385 375 } 386 386 - ret = snprintf(buf, 16, "%c%d", 387 387 - die_is_signed_type(&type) ? 's' : 'u', ret); 376 376 + ret = snprintf(buf, 16, "%c%d", sign, ret); 388 377 389 378 formatted: 390 379 if (ret < 0 || ret >= 16) {

+5 -1

tools/perf/util/sort.c

reviewed

··· 588 588 } else { 589 589 pevent_event_info(&seq, evsel->tp_format, &rec); 590 590 } 591 591 - return seq.buffer; 591 591 + /* 592 592 + * Trim the buffer, it starts at 4KB and we're not going to 593 593 + * add anything more to this buffer. 594 594 + */ 595 595 + return realloc(seq.buffer, seq.len + 1); 592 596 } 593 597 594 598 static int64_t