Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf report: Add --latency flag

Add a record/report --latency flag that allows capturing and showing
latency-centric profiles rather than the default CPU-consumption-centric
profiles. For latency profiles, record captures context switch events,
and report shows Latency as the first column.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lore.kernel.org/r/e9640464bcbc47dde2cb557003f421052ebc9eec.1739437531.git.dvyukov@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>

authored by

Dmitry Vyukov and committed by
Namhyung Kim
2570c02c ee1cffbe

+135 -20
+4
tools/perf/Documentation/perf-record.txt
··· 227 227 '--filter' exists, the new filter expression will be combined with 228 228 them by '&&'. 229 229 230 + --latency:: 231 + Enable data collection for latency profiling. 232 + Use perf report --latency for latency-centric profile. 233 + 230 234 -a:: 231 235 --all-cpus:: 232 236 System-wide collection from all CPUs (default if no target is specified).
+5
tools/perf/Documentation/perf-report.txt
··· 68 68 --hide-unresolved:: 69 69 Only display entries resolved to a symbol. 70 70 71 + --latency:: 72 + Show latency-centric profile rather than the default 73 + CPU-consumption-centric profile 74 + (requires perf record --latency flag). 75 + 71 76 -s:: 72 77 --sort=:: 73 78 Sort histogram entries by given key(s) - multiple keys can be specified
+20
tools/perf/builtin-record.c
··· 161 161 struct evlist *sb_evlist; 162 162 pthread_t thread_id; 163 163 int realtime_prio; 164 + bool latency; 164 165 bool switch_output_event_set; 165 166 bool no_buildid; 166 167 bool no_buildid_set; ··· 3374 3373 parse_events_option), 3375 3374 OPT_CALLBACK(0, "filter", &record.evlist, "filter", 3376 3375 "event filter", parse_filter), 3376 + OPT_BOOLEAN(0, "latency", &record.latency, 3377 + "Enable data collection for latency profiling.\n" 3378 + "\t\t\t Use perf report --latency for latency-centric profile."), 3377 3379 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist, 3378 3380 NULL, "don't record events from perf itself", 3379 3381 exclude_perf), ··· 4021 4017 usage_with_options_msg(record_usage, record_options, 4022 4018 "cgroup monitoring only available in system-wide mode"); 4023 4019 4020 + } 4021 + 4022 + if (record.latency) { 4023 + /* 4024 + * There is no fundamental reason why latency profiling 4025 + * can't work for system-wide mode, but exact semantics 4026 + * and details are to be defined. 4027 + * See the following thread for details: 4028 + * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/ 4029 + */ 4030 + if (record.opts.target.system_wide) { 4031 + pr_err("Failed: latency profiling is not supported with system-wide collection.\n"); 4032 + err = -EINVAL; 4033 + goto out_opts; 4034 + } 4035 + record.opts.record_switch_events = true; 4024 4036 } 4025 4037 4026 4038 if (rec->buildid_mmap) {
+28 -4
tools/perf/builtin-report.c
··· 112 112 u64 nr_entries; 113 113 u64 queue_size; 114 114 u64 total_cycles; 115 + u64 total_samples; 116 + u64 singlethreaded_samples; 115 117 int socket_filter; 116 118 DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); 117 119 struct branch_type_stat brtype_stat; ··· 332 330 rep->nonany_branch_mode, 333 331 &rep->total_cycles, evsel); 334 332 } 333 + 334 + rep->total_samples++; 335 + if (al.parallelism == 1) 336 + rep->singlethreaded_samples++; 335 337 336 338 ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep); 337 339 if (ret < 0) ··· 1085 1079 return ret; 1086 1080 } 1087 1081 1082 + /* Don't show Latency column for non-parallel profiles by default. */ 1083 + if (!symbol_conf.prefer_latency && rep->total_samples && 1084 + rep->singlethreaded_samples * 100 / rep->total_samples >= 99) 1085 + perf_hpp__cancel_latency(); 1086 + 1088 1087 evlist__check_mem_load_aux(session->evlist); 1089 1088 1090 1089 if (rep->stats_mode) ··· 1479 1468 "Disable raw trace ordering"), 1480 1469 OPT_BOOLEAN(0, "skip-empty", &report.skip_empty, 1481 1470 "Do not display empty (or dummy) events in the output"), 1471 + OPT_BOOLEAN(0, "latency", &symbol_conf.prefer_latency, 1472 + "Show latency-centric profile rather than the default\n" 1473 + "\t\t\t CPU-consumption-centric profile\n" 1474 + "\t\t\t (requires perf record --latency flag)."), 1482 1475 OPT_END() 1483 1476 }; 1484 1477 struct perf_data data = { ··· 1737 1722 symbol_conf.annotate_data_sample = true; 1738 1723 } 1739 1724 1725 + symbol_conf.enable_latency = true; 1740 1726 if (report.disable_order || !perf_session__has_switch_events(session)) { 1741 1727 if (symbol_conf.parallelism_list_str || 1742 - (sort_order && strstr(sort_order, "parallelism")) || 1743 - (field_order && strstr(field_order, "parallelism"))) { 1728 + symbol_conf.prefer_latency || 1729 + (sort_order && (strstr(sort_order, "latency") || 1730 + strstr(sort_order, "parallelism"))) || 1731 + (field_order && (strstr(field_order, "latency") || 1732 + 
strstr(field_order, "parallelism")))) { 1744 1733 if (report.disable_order) 1745 - ui__error("Use of parallelism is incompatible with --disable-order.\n"); 1734 + ui__error("Use of latency profile or parallelism is incompatible with --disable-order.\n"); 1746 1735 else 1747 - ui__error("Use of parallelism requires --switch-events during record.\n"); 1736 + ui__error("Use of latency profile or parallelism requires --latency flag during record.\n"); 1748 1737 return -1; 1749 1738 } 1739 + /* 1740 + * If user did not ask for anything related to 1741 + * latency/parallelism explicitly, just don't show it. 1742 + */ 1743 + symbol_conf.enable_latency = false; 1750 1744 } 1751 1745 1752 1746 if (sort_order && strstr(sort_order, "ipc")) {
+46 -8
tools/perf/ui/hist.c
··· 631 631 if (is_strict_order(field_order)) 632 632 return; 633 633 634 + /* 635 + * Overhead and latency columns are added in setup_overhead(), 636 + * so they are added implicitly here only if they were added 637 + * by setup_overhead() before (have was_taken flag set). 638 + * This is required because setup_overhead() has more complex 639 + * logic, in particular it does not add "overhead" if user 640 + * specified "latency" in sort order, and vise versa. 641 + */ 634 642 if (symbol_conf.cumulate_callchain) { 635 - hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC); 643 + /* 644 + * Addition of fields is idempotent, so we add latency 645 + * column twice to get desired order with simpler logic. 646 + */ 647 + if (symbol_conf.prefer_latency) 648 + hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true); 649 + hpp_dimension__add_output(PERF_HPP__OVERHEAD_ACC, true); 650 + if (symbol_conf.enable_latency) 651 + hpp_dimension__add_output(PERF_HPP__LATENCY_ACC, true); 636 652 perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self"; 637 653 } 638 654 639 - hpp_dimension__add_output(PERF_HPP__OVERHEAD); 655 + if (symbol_conf.prefer_latency) 656 + hpp_dimension__add_output(PERF_HPP__LATENCY, true); 657 + hpp_dimension__add_output(PERF_HPP__OVERHEAD, true); 658 + if (symbol_conf.enable_latency) 659 + hpp_dimension__add_output(PERF_HPP__LATENCY, true); 640 660 641 661 if (symbol_conf.show_cpu_utilization) { 642 - hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS); 643 - hpp_dimension__add_output(PERF_HPP__OVERHEAD_US); 662 + hpp_dimension__add_output(PERF_HPP__OVERHEAD_SYS, false); 663 + hpp_dimension__add_output(PERF_HPP__OVERHEAD_US, false); 644 664 645 665 if (perf_guest) { 646 - hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS); 647 - hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US); 666 + hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_SYS, false); 667 + hpp_dimension__add_output(PERF_HPP__OVERHEAD_GUEST_US, false); 648 668 } 649 669 } 650 670 651 671 if 
(symbol_conf.show_nr_samples) 652 - hpp_dimension__add_output(PERF_HPP__SAMPLES); 672 + hpp_dimension__add_output(PERF_HPP__SAMPLES, false); 653 673 654 674 if (symbol_conf.show_total_period) 655 - hpp_dimension__add_output(PERF_HPP__PERIOD); 675 + hpp_dimension__add_output(PERF_HPP__PERIOD, false); 656 676 } 657 677 658 678 void perf_hpp_list__column_register(struct perf_hpp_list *list, ··· 718 698 719 699 if (fmt_equal(ovh, fmt)) 720 700 fmt->name = "Overhead"; 701 + } 702 + } 703 + 704 + void perf_hpp__cancel_latency(void) 705 + { 706 + struct perf_hpp_fmt *fmt, *lat, *acc, *tmp; 707 + 708 + if (is_strict_order(field_order)) 709 + return; 710 + if (sort_order && strstr(sort_order, "latency")) 711 + return; 712 + 713 + lat = &perf_hpp__format[PERF_HPP__LATENCY]; 714 + acc = &perf_hpp__format[PERF_HPP__LATENCY_ACC]; 715 + 716 + perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) { 717 + if (fmt_equal(lat, fmt) || fmt_equal(acc, fmt)) 718 + perf_hpp__column_unregister(fmt); 721 719 } 722 720 } 723 721
+1
tools/perf/util/hist.h
··· 582 582 583 583 void perf_hpp__init(void); 584 584 void perf_hpp__cancel_cumulate(void); 585 + void perf_hpp__cancel_latency(void); 585 586 void perf_hpp__setup_output_field(struct perf_hpp_list *list); 586 587 void perf_hpp__reset_output_field(struct perf_hpp_list *list); 587 588 void perf_hpp__append_sort_keys(struct perf_hpp_list *list);
+27 -6
tools/perf/util/sort.c
··· 2622 2622 const char *name; 2623 2623 struct perf_hpp_fmt *fmt; 2624 2624 int taken; 2625 + int was_taken; 2625 2626 }; 2626 2627 2627 2628 #define DIM(d, n) { .name = n, .fmt = &perf_hpp__format[d], } ··· 3514 3513 return -1; 3515 3514 3516 3515 hd->taken = 1; 3516 + hd->was_taken = 1; 3517 3517 perf_hpp_list__register_sort_field(list, fmt); 3518 3518 return 0; 3519 3519 } ··· 3549 3547 return 0; 3550 3548 } 3551 3549 3552 - int hpp_dimension__add_output(unsigned col) 3550 + int hpp_dimension__add_output(unsigned col, bool implicit) 3553 3551 { 3552 + struct hpp_dimension *hd; 3553 + 3554 3554 BUG_ON(col >= PERF_HPP__MAX_INDEX); 3555 - return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]); 3555 + hd = &hpp_sort_dimensions[col]; 3556 + if (implicit && !hd->was_taken) 3557 + return 0; 3558 + return __hpp_dimension__add_output(&perf_hpp_list, hd); 3556 3559 } 3557 3560 3558 3561 int sort_dimension__add(struct perf_hpp_list *list, const char *tok, ··· 3816 3809 if (sort__mode == SORT_MODE__DIFF) 3817 3810 return keys; 3818 3811 3819 - keys = prefix_if_not_in("overhead", keys); 3820 - 3821 - if (symbol_conf.cumulate_callchain) 3822 - keys = prefix_if_not_in("overhead_children", keys); 3812 + if (symbol_conf.prefer_latency) { 3813 + keys = prefix_if_not_in("overhead", keys); 3814 + keys = prefix_if_not_in("latency", keys); 3815 + if (symbol_conf.cumulate_callchain) { 3816 + keys = prefix_if_not_in("overhead_children", keys); 3817 + keys = prefix_if_not_in("latency_children", keys); 3818 + } 3819 + } else if (!keys || (!strstr(keys, "overhead") && 3820 + !strstr(keys, "latency"))) { 3821 + if (symbol_conf.enable_latency) 3822 + keys = prefix_if_not_in("latency", keys); 3823 + keys = prefix_if_not_in("overhead", keys); 3824 + if (symbol_conf.cumulate_callchain) { 3825 + if (symbol_conf.enable_latency) 3826 + keys = prefix_if_not_in("latency_children", keys); 3827 + keys = prefix_if_not_in("overhead_children", keys); 3828 + } 3829 + } 3823 3830 
3824 3831 return keys; 3825 3832 }
+1 -1
tools/perf/util/sort.h
··· 141 141 142 142 bool is_strict_order(const char *order); 143 143 144 - int hpp_dimension__add_output(unsigned col); 144 + int hpp_dimension__add_output(unsigned col, bool implicit); 145 145 void reset_dimensions(void); 146 146 int sort_dimension__add(struct perf_hpp_list *list, const char *tok, 147 147 struct evlist *evlist,
+3 -1
tools/perf/util/symbol_conf.h
··· 49 49 keep_exited_threads, 50 50 annotate_data_member, 51 51 annotate_data_sample, 52 - skip_empty; 52 + skip_empty, 53 + enable_latency, 54 + prefer_latency; 53 55 const char *vmlinux_name, 54 56 *kallsyms_name, 55 57 *source_prefix,