Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"Misc race fixes uncovered by fuzzing efforts, a Sparse fix, two PMU
driver fixes, plus miscellaneous tooling fixes"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86: Reject non sampling events with precise_ip
perf/x86/intel: Account interrupts for PEBS errors
perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race
perf/core: Fix sys_perf_event_open() vs. hotplug
perf/x86/intel: Use ULL constant to prevent undefined shift behaviour
perf/x86/intel/uncore: Fix hardcoded socket 0 assumption in the Haswell init code
perf/x86: Set pmu->module in Intel PMU modules
perf probe: Fix to probe on gcc generated symbols for offline kernel
perf probe: Fix --funcs to show correct symbols for offline module
perf symbols: Robustify reading of build-id from sysfs
perf tools: Install tools/lib/traceevent plugins with install-bin
tools lib traceevent: Fix prev/next_prio for deadline tasks
perf record: Fix --switch-output documentation and comment
perf record: Make __record_options static
tools lib subcmd: Add OPT_STRING_OPTARG_SET option
perf probe: Fix to get correct modname from elf header
samples/bpf trace_output_user: Remove duplicate sys/ioctl.h include
samples/bpf sock_example: Avoid getting ethhdr from two includes
perf sched timehist: Show total scheduling time

Linus Torvalds 9 years ago 79078c53 255e6140

+257 -92

20 changed files

expand all collapse all

arch

x86

events

core.c

intel

core.c

cstate.c

ds.c

rapl.c

uncore.c

uncore_snbep.c

include

linux

perf_event.h

kernel

events

core.c

samples

bpf

sock_example.h

trace_output_user.c

tools

lib

subcmd

parse-options.c

parse-options.h

traceevent

plugin_sched_switch.c

perf

Documentation

perf-record.txt

Makefile.perf

builtin-record.c

builtin-sched.c

util

probe-event.c

symbol-elf.c

arch/x86/events/core.c

reviewed

··· 505 505 506 506 if (event->attr.precise_ip > precise) 507 507 return -EOPNOTSUPP; 508 508 + 509 509 + /* There's no sense in having PEBS for non sampling events: */ 510 510 + if (!is_sampling_event(event)) 511 511 + return -EINVAL; 508 512 } 509 513 /* 510 514 * check that PEBS LBR correction does not conflict with

+1 -1

arch/x86/events/intel/core.c

reviewed

··· 3987 3987 x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); 3988 3988 x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; 3989 3989 } 3990 3990 - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 3990 3990 + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; 3991 3991 3992 3992 if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { 3993 3993 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",

arch/x86/events/intel/cstate.c

reviewed

··· 434 434 .stop = cstate_pmu_event_stop, 435 435 .read = cstate_pmu_event_update, 436 436 .capabilities = PERF_PMU_CAP_NO_INTERRUPT, 437 437 + .module = THIS_MODULE, 437 438 }; 438 439 439 440 static struct pmu cstate_pkg_pmu = { ··· 448 447 .stop = cstate_pmu_event_stop, 449 448 .read = cstate_pmu_event_update, 450 449 .capabilities = PERF_PMU_CAP_NO_INTERRUPT, 450 450 + .module = THIS_MODULE, 451 451 }; 452 452 453 453 static const struct cstate_model nhm_cstates __initconst = {

+5 -1

arch/x86/events/intel/ds.c

reviewed

··· 1389 1389 continue; 1390 1390 1391 1391 /* log dropped samples number */ 1392 1392 - if (error[bit]) 1392 1392 + if (error[bit]) { 1393 1393 perf_log_lost_samples(event, error[bit]); 1394 1394 + 1395 1395 + if (perf_event_account_interrupt(event)) 1396 1396 + x86_pmu_stop(event, 0); 1397 1397 + } 1394 1398 1395 1399 if (counts[bit]) { 1396 1400 __intel_pmu_pebs_event(event, iregs, base,

arch/x86/events/intel/rapl.c

reviewed

··· 697 697 rapl_pmus->pmu.start = rapl_pmu_event_start; 698 698 rapl_pmus->pmu.stop = rapl_pmu_event_stop; 699 699 rapl_pmus->pmu.read = rapl_pmu_event_read; 700 700 + rapl_pmus->pmu.module = THIS_MODULE; 700 701 return 0; 701 702 } 702 703

arch/x86/events/intel/uncore.c

reviewed

··· 733 733 .start = uncore_pmu_event_start, 734 734 .stop = uncore_pmu_event_stop, 735 735 .read = uncore_pmu_event_read, 736 736 + .module = THIS_MODULE, 736 737 }; 737 738 } else { 738 739 pmu->pmu = *pmu->type->pmu;

+1 -1

arch/x86/events/intel/uncore_snbep.c

reviewed

··· 2686 2686 2687 2687 void hswep_uncore_cpu_init(void) 2688 2688 { 2689 2689 - int pkg = topology_phys_to_logical_pkg(0); 2689 2689 + int pkg = boot_cpu_data.logical_proc_id; 2690 2690 2691 2691 if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) 2692 2692 hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;

include/linux/perf_event.h

reviewed

··· 1259 1259 extern void perf_event_disable_local(struct perf_event *event); 1260 1260 extern void perf_event_disable_inatomic(struct perf_event *event); 1261 1261 extern void perf_event_task_tick(void); 1262 1262 + extern int perf_event_account_interrupt(struct perf_event *event); 1262 1263 #else /* !CONFIG_PERF_EVENTS: */ 1263 1264 static inline void * 1264 1265 perf_aux_output_begin(struct perf_output_handle *handle,

+133 -42

kernel/events/core.c

reviewed

··· 2249 2249 struct perf_event_context *ctx = event->ctx; 2250 2250 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2251 2251 struct perf_event_context *task_ctx = cpuctx->task_ctx; 2252 2252 - bool activate = true; 2252 2252 + bool reprogram = true; 2253 2253 int ret = 0; 2254 2254 2255 2255 raw_spin_lock(&cpuctx->ctx.lock); ··· 2257 2257 raw_spin_lock(&ctx->lock); 2258 2258 task_ctx = ctx; 2259 2259 2260 2260 - /* If we're on the wrong CPU, try again */ 2261 2261 - if (task_cpu(ctx->task) != smp_processor_id()) { 2260 2260 + reprogram = (ctx->task == current); 2261 2261 + 2262 2262 + /* 2263 2263 + * If the task is running, it must be running on this CPU, 2264 2264 + * otherwise we cannot reprogram things. 2265 2265 + * 2266 2266 + * If its not running, we don't care, ctx->lock will 2267 2267 + * serialize against it becoming runnable. 2268 2268 + */ 2269 2269 + if (task_curr(ctx->task) && !reprogram) { 2262 2270 ret = -ESRCH; 2263 2271 goto unlock; 2264 2272 } 2265 2273 2266 2266 - /* 2267 2267 - * If we're on the right CPU, see if the task we target is 2268 2268 - * current, if not we don't have to activate the ctx, a future 2269 2269 - * context switch will do that for us. 2270 2270 - */ 2271 2271 - if (ctx->task != current) 2272 2272 - activate = false; 2273 2273 - else 2274 2274 - WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2275 2275 - 2274 2274 + WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); 2276 2275 } else if (task_ctx) { 2277 2276 raw_spin_lock(&task_ctx->lock); 2278 2277 } 2279 2278 2280 2280 - if (activate) { 2279 2279 + if (reprogram) { 2281 2280 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2282 2281 add_event_to_ctx(event, ctx); 2283 2282 ctx_resched(cpuctx, task_ctx); ··· 2327 2328 /* 2328 2329 * Installing events is tricky because we cannot rely on ctx->is_active 2329 2330 * to be set in case this is the nr_events 0 -> 1 transition. 
2331 2331 + * 2332 2332 + * Instead we use task_curr(), which tells us if the task is running. 2333 2333 + * However, since we use task_curr() outside of rq::lock, we can race 2334 2334 + * against the actual state. This means the result can be wrong. 2335 2335 + * 2336 2336 + * If we get a false positive, we retry, this is harmless. 2337 2337 + * 2338 2338 + * If we get a false negative, things are complicated. If we are after 2339 2339 + * perf_event_context_sched_in() ctx::lock will serialize us, and the 2340 2340 + * value must be correct. If we're before, it doesn't matter since 2341 2341 + * perf_event_context_sched_in() will program the counter. 2342 2342 + * 2343 2343 + * However, this hinges on the remote context switch having observed 2344 2344 + * our task->perf_event_ctxp[] store, such that it will in fact take 2345 2345 + * ctx::lock in perf_event_context_sched_in(). 2346 2346 + * 2347 2347 + * We do this by task_function_call(), if the IPI fails to hit the task 2348 2348 + * we know any future context switch of task must see the 2349 2349 + * perf_event_ctpx[] store. 2330 2350 */ 2331 2331 - again: 2351 2351 + 2332 2352 /* 2333 2333 - * Cannot use task_function_call() because we need to run on the task's 2334 2334 - * CPU regardless of whether its current or not. 2353 2353 + * This smp_mb() orders the task->perf_event_ctxp[] store with the 2354 2354 + * task_cpu() load, such that if the IPI then does not find the task 2355 2355 + * running, a future context switch of that task must observe the 2356 2356 + * store. 
2335 2357 */ 2336 2336 - if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) 2358 2358 + smp_mb(); 2359 2359 + again: 2360 2360 + if (!task_function_call(task, __perf_install_in_context, event)) 2337 2361 return; 2338 2362 2339 2363 raw_spin_lock_irq(&ctx->lock); ··· 2370 2348 raw_spin_unlock_irq(&ctx->lock); 2371 2349 return; 2372 2350 } 2373 2373 - raw_spin_unlock_irq(&ctx->lock); 2374 2351 /* 2375 2375 - * Since !ctx->is_active doesn't mean anything, we must IPI 2376 2376 - * unconditionally. 2352 2352 + * If the task is not running, ctx->lock will avoid it becoming so, 2353 2353 + * thus we can safely install the event. 2377 2354 */ 2378 2378 - goto again; 2355 2355 + if (task_curr(task)) { 2356 2356 + raw_spin_unlock_irq(&ctx->lock); 2357 2357 + goto again; 2358 2358 + } 2359 2359 + add_event_to_ctx(event, ctx); 2360 2360 + raw_spin_unlock_irq(&ctx->lock); 2379 2361 } 2380 2362 2381 2363 /* ··· 7060 7034 perf_output_end(&handle); 7061 7035 } 7062 7036 7063 7063 - /* 7064 7064 - * Generic event overflow handling, sampling. 7065 7065 - */ 7066 7066 - 7067 7067 - static int __perf_event_overflow(struct perf_event *event, 7068 7068 - int throttle, struct perf_sample_data *data, 7069 7069 - struct pt_regs *regs) 7037 7037 + static int 7038 7038 + __perf_event_account_interrupt(struct perf_event *event, int throttle) 7070 7039 { 7071 7071 - int events = atomic_read(&event->event_limit); 7072 7040 struct hw_perf_event *hwc = &event->hw; 7073 7073 - u64 seq; 7074 7041 int ret = 0; 7075 7075 - 7076 7076 - /* 7077 7077 - * Non-sampling counters might still use the PMI to fold short 7078 7078 - * hardware counters, ignore those. 
7079 7079 - */ 7080 7080 - if (unlikely(!is_sampling_event(event))) 7081 7081 - return 0; 7042 7042 + u64 seq; 7082 7043 7083 7044 seq = __this_cpu_read(perf_throttled_seq); 7084 7045 if (seq != hwc->interrupts_seq) { ··· 7092 7079 if (delta > 0 && delta < 2*TICK_NSEC) 7093 7080 perf_adjust_period(event, delta, hwc->last_period, true); 7094 7081 } 7082 7082 + 7083 7083 + return ret; 7084 7084 + } 7085 7085 + 7086 7086 + int perf_event_account_interrupt(struct perf_event *event) 7087 7087 + { 7088 7088 + return __perf_event_account_interrupt(event, 1); 7089 7089 + } 7090 7090 + 7091 7091 + /* 7092 7092 + * Generic event overflow handling, sampling. 7093 7093 + */ 7094 7094 + 7095 7095 + static int __perf_event_overflow(struct perf_event *event, 7096 7096 + int throttle, struct perf_sample_data *data, 7097 7097 + struct pt_regs *regs) 7098 7098 + { 7099 7099 + int events = atomic_read(&event->event_limit); 7100 7100 + int ret = 0; 7101 7101 + 7102 7102 + /* 7103 7103 + * Non-sampling counters might still use the PMI to fold short 7104 7104 + * hardware counters, ignore those. 7105 7105 + */ 7106 7106 + if (unlikely(!is_sampling_event(event))) 7107 7107 + return 0; 7108 7108 + 7109 7109 + ret = __perf_event_account_interrupt(event, throttle); 7095 7110 7096 7111 /* 7097 7112 * XXX event_limit might not quite work as expected on inherited ··· 9544 9503 return 0; 9545 9504 } 9546 9505 9506 9506 + /* 9507 9507 + * Variation on perf_event_ctx_lock_nested(), except we take two context 9508 9508 + * mutexes. 
9509 9509 + */ 9510 9510 + static struct perf_event_context * 9511 9511 + __perf_event_ctx_lock_double(struct perf_event *group_leader, 9512 9512 + struct perf_event_context *ctx) 9513 9513 + { 9514 9514 + struct perf_event_context *gctx; 9515 9515 + 9516 9516 + again: 9517 9517 + rcu_read_lock(); 9518 9518 + gctx = READ_ONCE(group_leader->ctx); 9519 9519 + if (!atomic_inc_not_zero(&gctx->refcount)) { 9520 9520 + rcu_read_unlock(); 9521 9521 + goto again; 9522 9522 + } 9523 9523 + rcu_read_unlock(); 9524 9524 + 9525 9525 + mutex_lock_double(&gctx->mutex, &ctx->mutex); 9526 9526 + 9527 9527 + if (group_leader->ctx != gctx) { 9528 9528 + mutex_unlock(&ctx->mutex); 9529 9529 + mutex_unlock(&gctx->mutex); 9530 9530 + put_ctx(gctx); 9531 9531 + goto again; 9532 9532 + } 9533 9533 + 9534 9534 + return gctx; 9535 9535 + } 9536 9536 + 9547 9537 /** 9548 9538 * sys_perf_event_open - open a performance event, associate it to a task/cpu 9549 9539 * ··· 9818 9746 } 9819 9747 9820 9748 if (move_group) { 9821 9821 - gctx = group_leader->ctx; 9822 9822 - mutex_lock_double(&gctx->mutex, &ctx->mutex); 9749 9749 + gctx = __perf_event_ctx_lock_double(group_leader, ctx); 9750 9750 + 9823 9751 if (gctx->task == TASK_TOMBSTONE) { 9824 9752 err = -ESRCH; 9825 9753 goto err_locked; 9754 9754 + } 9755 9755 + 9756 9756 + /* 9757 9757 + * Check if we raced against another sys_perf_event_open() call 9758 9758 + * moving the software group underneath us. 9759 9759 + */ 9760 9760 + if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { 9761 9761 + /* 9762 9762 + * If someone moved the group out from under us, check 9763 9763 + * if this new event wound up on the same ctx, if so 9764 9764 + * its the regular !move_group case, otherwise fail. 
9765 9765 + */ 9766 9766 + if (gctx != ctx) { 9767 9767 + err = -EINVAL; 9768 9768 + goto err_locked; 9769 9769 + } else { 9770 9770 + perf_event_ctx_unlock(group_leader, gctx); 9771 9771 + move_group = 0; 9772 9772 + } 9826 9773 } 9827 9774 } else { 9828 9775 mutex_lock(&ctx->mutex); ··· 9944 9853 perf_unpin_context(ctx); 9945 9854 9946 9855 if (move_group) 9947 9947 - mutex_unlock(&gctx->mutex); 9856 9856 + perf_event_ctx_unlock(group_leader, gctx); 9948 9857 mutex_unlock(&ctx->mutex); 9949 9858 9950 9859 if (task) { ··· 9970 9879 9971 9880 err_locked: 9972 9881 if (move_group) 9973 9973 - mutex_unlock(&gctx->mutex); 9882 9882 + perf_event_ctx_unlock(group_leader, gctx); 9974 9883 mutex_unlock(&ctx->mutex); 9975 9884 /* err_file: */ 9976 9885 fput(event_file);

+1 -1

samples/bpf/sock_example.h

reviewed

··· 4 4 #include <unistd.h> 5 5 #include <string.h> 6 6 #include <errno.h> 7 7 - #include <net/ethernet.h> 7 7 + #include <linux/if_ether.h> 8 8 #include <net/if.h> 9 9 #include <linux/if_packet.h> 10 10 #include <arpa/inet.h>

-1

samples/bpf/trace_output_user.c

reviewed

··· 9 9 #include <string.h> 10 10 #include <fcntl.h> 11 11 #include <poll.h> 12 12 - #include <sys/ioctl.h> 13 12 #include <linux/perf_event.h> 14 13 #include <linux/bpf.h> 15 14 #include <errno.h>

tools/lib/subcmd/parse-options.c

reviewed

··· 213 213 else 214 214 err = get_arg(p, opt, flags, (const char **)opt->value); 215 215 216 216 + if (opt->set) 217 217 + *(bool *)opt->set = true; 218 218 + 216 219 /* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */ 217 220 if (opt->flags & PARSE_OPT_NOEMPTY) { 218 221 const char *val = *(const char **)opt->value;

tools/lib/subcmd/parse-options.h

reviewed

··· 137 137 { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ 138 138 .value = check_vtype(v, const char **), (a), .help = (h), \ 139 139 .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } 140 140 + #define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \ 141 141 + { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ 142 142 + .value = check_vtype(v, const char **), (a), .help = (h), \ 143 143 + .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \ 144 144 + .set = check_vtype(os, bool *)} 140 145 #define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} 141 146 #define OPT_DATE(s, l, v, h) \ 142 147 { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }

+2 -2

tools/lib/traceevent/plugin_sched_switch.c

reviewed

··· 111 111 trace_seq_printf(s, "%lld ", val); 112 112 113 113 if (pevent_get_field_val(s, event, "prev_prio", record, &val, 0) == 0) 114 114 - trace_seq_printf(s, "[%lld] ", val); 114 114 + trace_seq_printf(s, "[%d] ", (int) val); 115 115 116 116 if (pevent_get_field_val(s, event, "prev_state", record, &val, 0) == 0) 117 117 write_state(s, val); ··· 129 129 trace_seq_printf(s, "%lld", val); 130 130 131 131 if (pevent_get_field_val(s, event, "next_prio", record, &val, 0) == 0) 132 132 - trace_seq_printf(s, " [%lld]", val); 132 132 + trace_seq_printf(s, " [%d]", (int) val); 133 133 134 134 return 0; 135 135 }

tools/perf/Documentation/perf-record.txt

reviewed

··· 430 430 particular perf.data snapshot should be kept or not. 431 431 432 432 Implies --timestamp-filename, --no-buildid and --no-buildid-cache. 433 433 + The reason for the latter two is to reduce the data file switching 434 434 + overhead. You can still switch them on with: 435 435 + 436 436 + --switch-output --no-no-buildid --no-no-buildid-cache 433 437 434 438 --dry-run:: 435 439 Parse options then exit. --dry-run can be used to detect errors in cmdline

+2 -2

tools/perf/Makefile.perf

reviewed

··· 704 704 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \ 705 705 $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' 706 706 707 707 - install-bin: install-tools install-tests 707 707 + install-bin: install-tools install-tests install-traceevent-plugins 708 708 709 709 - install: install-bin try-install-man install-traceevent-plugins 709 709 + install: install-bin try-install-man 710 710 711 711 install-python_ext: 712 712 $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'

+2 -2

tools/perf/builtin-record.c

reviewed

··· 1405 1405 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record', 1406 1406 * using pipes, etc. 1407 1407 */ 1408 1408 - struct option __record_options[] = { 1408 1408 + static struct option __record_options[] = { 1409 1409 OPT_CALLBACK('e', "event", &record.evlist, "event", 1410 1410 "event selector. use 'perf list' to list available events", 1411 1411 parse_events_option), ··· 1636 1636 * overhead. Still generate buildid if they are required 1637 1637 * explicitly using 1638 1638 * 1639 1639 - * perf record --signal-trigger --no-no-buildid \ 1639 1639 + * perf record --switch-output --no-no-buildid \ 1640 1640 * --no-no-buildid-cache 1641 1641 * 1642 1642 * Following code equals to:

+14 -3

tools/perf/builtin-sched.c

reviewed

··· 209 209 u64 skipped_samples; 210 210 const char *time_str; 211 211 struct perf_time_interval ptime; 212 212 + struct perf_time_interval hist_time; 212 213 }; 213 214 214 215 /* per thread run time data */ ··· 2461 2460 timehist_print_sample(sched, sample, &al, thread, t); 2462 2461 2463 2462 out: 2463 2463 + if (sched->hist_time.start == 0 && t >= ptime->start) 2464 2464 + sched->hist_time.start = t; 2465 2465 + if (ptime->end == 0 || t <= ptime->end) 2466 2466 + sched->hist_time.end = t; 2467 2467 + 2464 2468 if (tr) { 2465 2469 /* time of this sched_switch event becomes last time task seen */ 2466 2470 tr->last_time = sample->time; ··· 2630 2624 struct thread *t; 2631 2625 struct thread_runtime *r; 2632 2626 int i; 2627 2627 + u64 hist_time = sched->hist_time.end - sched->hist_time.start; 2633 2628 2634 2629 memset(&totals, 0, sizeof(totals)); 2635 2630 ··· 2672 2665 totals.sched_count += r->run_stats.n; 2673 2666 printf(" CPU %2d idle for ", i); 2674 2667 print_sched_time(r->total_run_time, 6); 2675 2675 - printf(" msec\n"); 2668 2668 + printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time); 2676 2669 } else 2677 2670 printf(" CPU %2d idle entire time window\n", i); 2678 2671 } ··· 2708 2701 2709 2702 printf("\n" 2710 2703 " Total number of unique tasks: %" PRIu64 "\n" 2711 2711 - "Total number of context switches: %" PRIu64 "\n" 2712 2712 - " Total run time (msec): ", 2704 2704 + "Total number of context switches: %" PRIu64 "\n", 2713 2705 totals.task_count, totals.sched_count); 2714 2706 2707 2707 + printf(" Total run time (msec): "); 2715 2708 print_sched_time(totals.total_run_time, 2); 2716 2709 printf("\n"); 2710 2710 + 2711 2711 + printf(" Total scheduling time (msec): "); 2712 2712 + print_sched_time(hist_time, 2); 2713 2713 + printf(" (x %d)\n", sched->max_cpu); 2717 2714 } 2718 2715 2719 2716 typedef int (*sched_handler)(struct perf_tool *tool,

+69 -36

tools/perf/util/probe-event.c

reviewed

··· 163 163 164 164 /* A file path -- this is an offline module */ 165 165 if (module && strchr(module, '/')) 166 166 - return machine__findnew_module_map(host_machine, 0, module); 166 166 + return dso__new_map(module); 167 167 168 168 if (!module) 169 169 module = "kernel"; ··· 173 173 if (strncmp(pos->dso->short_name + 1, module, 174 174 pos->dso->short_name_len - 2) == 0 && 175 175 module[pos->dso->short_name_len - 2] == '\0') { 176 176 + map__get(pos); 176 177 return pos; 177 178 } 178 179 } ··· 188 187 else 189 188 return kernel_get_module_map(target); 190 189 } 191 191 - 192 192 - static void put_target_map(struct map *map, bool user) 193 193 - { 194 194 - if (map && user) { 195 195 - /* Only the user map needs to be released */ 196 196 - map__put(map); 197 197 - } 198 198 - } 199 199 - 200 190 201 191 static int convert_exec_to_group(const char *exec, char **result) 202 192 { ··· 260 268 } 261 269 262 270 /* 263 263 - * NOTE: 264 264 - * '.gnu.linkonce.this_module' section of kernel module elf directly 265 265 - * maps to 'struct module' from linux/module.h. This section contains 266 266 - * actual module name which will be used by kernel after loading it. 267 267 - * But, we cannot use 'struct module' here since linux/module.h is not 268 268 - * exposed to user-space. Offset of 'name' has remained same from long 269 269 - * time, so hardcoding it here. 270 270 - */ 271 271 - #ifdef __LP64__ 272 272 - #define MOD_NAME_OFFSET 24 273 273 - #else 274 274 - #define MOD_NAME_OFFSET 12 275 275 - #endif 276 276 - 277 277 - /* 278 271 * @module can be module name of module file path. In case of path, 279 272 * inspect elf and find out what is actual module name. 280 273 * Caller has to free mod_name after using it. 
··· 273 296 Elf_Data *data; 274 297 Elf_Scn *sec; 275 298 char *mod_name = NULL; 299 299 + int name_offset; 276 300 277 301 fd = open(module, O_RDONLY); 278 302 if (fd < 0) ··· 295 317 if (!data || !data->d_buf) 296 318 goto ret_err; 297 319 298 298 - mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); 320 320 + /* 321 321 + * NOTE: 322 322 + * '.gnu.linkonce.this_module' section of kernel module elf directly 323 323 + * maps to 'struct module' from linux/module.h. This section contains 324 324 + * actual module name which will be used by kernel after loading it. 325 325 + * But, we cannot use 'struct module' here since linux/module.h is not 326 326 + * exposed to user-space. Offset of 'name' has remained same from long 327 327 + * time, so hardcoding it here. 328 328 + */ 329 329 + if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) 330 330 + name_offset = 12; 331 331 + else /* expect ELFCLASS64 by default */ 332 332 + name_offset = 24; 333 333 + 334 334 + mod_name = strdup((char *)data->d_buf + name_offset); 299 335 300 336 ret_err: 301 337 elf_end(elf); ··· 404 412 } 405 413 406 414 out: 407 407 - put_target_map(map, uprobes); 415 415 + map__put(map); 408 416 return ret; 409 417 410 418 } ··· 610 618 return ret ? : -ENOENT; 611 619 } 612 620 621 621 + /* 622 622 + * Rename DWARF symbols to ELF symbols -- gcc sometimes optimizes functions 623 623 + * and generate new symbols with suffixes such as .constprop.N or .isra.N 624 624 + * etc. Since those symbols are not recorded in DWARF, we have to find 625 625 + * correct generated symbols from offline ELF binary. 626 626 + * For online kernel or uprobes we don't need this because those are 627 627 + * rebased on _text, or already a section relative address. 
628 628 + */ 629 629 + static int 630 630 + post_process_offline_probe_trace_events(struct probe_trace_event *tevs, 631 631 + int ntevs, const char *pathname) 632 632 + { 633 633 + struct symbol *sym; 634 634 + struct map *map; 635 635 + unsigned long stext = 0; 636 636 + u64 addr; 637 637 + int i; 638 638 + 639 639 + /* Prepare a map for offline binary */ 640 640 + map = dso__new_map(pathname); 641 641 + if (!map || get_text_start_address(pathname, &stext) < 0) { 642 642 + pr_warning("Failed to get ELF symbols for %s\n", pathname); 643 643 + return -EINVAL; 644 644 + } 645 645 + 646 646 + for (i = 0; i < ntevs; i++) { 647 647 + addr = tevs[i].point.address + tevs[i].point.offset - stext; 648 648 + sym = map__find_symbol(map, addr); 649 649 + if (!sym) 650 650 + continue; 651 651 + if (!strcmp(sym->name, tevs[i].point.symbol)) 652 652 + continue; 653 653 + /* If we have no realname, use symbol for it */ 654 654 + if (!tevs[i].point.realname) 655 655 + tevs[i].point.realname = tevs[i].point.symbol; 656 656 + else 657 657 + free(tevs[i].point.symbol); 658 658 + tevs[i].point.symbol = strdup(sym->name); 659 659 + tevs[i].point.offset = addr - sym->start; 660 660 + } 661 661 + map__put(map); 662 662 + 663 663 + return 0; 664 664 + } 665 665 + 613 666 static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs, 614 667 int ntevs, const char *exec) 615 668 { ··· 716 679 717 680 /* Skip post process if the target is an offline kernel */ 718 681 if (symbol_conf.ignore_vmlinux_buildid) 719 719 - return 0; 682 682 + return post_process_offline_probe_trace_events(tevs, ntevs, 683 683 + symbol_conf.vmlinux_name); 720 684 721 685 reloc_sym = kernel_get_ref_reloc_sym(); 722 686 if (!reloc_sym) { ··· 2907 2869 } 2908 2870 2909 2871 out: 2910 2910 - put_target_map(map, pev->uprobes); 2872 2872 + map__put(map); 2911 2873 free(syms); 2912 2874 return ret; 2913 2875 ··· 3400 3362 return ret; 3401 3363 3402 3364 /* Get a symbol map */ 3403 3403 - if (user) 3404 3404 - map 
= dso__new_map(target); 3405 3405 - else 3406 3406 - map = kernel_get_module_map(target); 3365 3365 + map = get_target_map(target, user); 3407 3366 if (!map) { 3408 3367 pr_err("Failed to get a map for %s\n", (target) ? : "kernel"); 3409 3368 return -EINVAL; ··· 3432 3397 } 3433 3398 3434 3399 end: 3435 3435 - if (user) { 3436 3436 - map__put(map); 3437 3437 - } 3400 3400 + map__put(map); 3438 3401 exit_probe_symbol_maps(); 3439 3402 3440 3403 return ret;

tools/perf/util/symbol-elf.c

reviewed

··· 537 537 break; 538 538 } else { 539 539 int n = namesz + descsz; 540 540 + 541 541 + if (n > (int)sizeof(bf)) { 542 542 + n = sizeof(bf); 543 543 + pr_debug("%s: truncating reading of build id in sysfs file %s: n_namesz=%u, n_descsz=%u.\n", 544 544 + __func__, filename, nhdr.n_namesz, nhdr.n_descsz); 545 545 + } 540 546 if (read(fd, bf, n) != n) 541 547 break; 542 548 }