perf stat: Support new per thread TopDown metrics

+5 -2

tools/perf/Documentation/perf-stat.txt

···
363 363   For best results it is usually a good idea to use it with interval
364 364   mode like -I 1000, as the bottleneck of workloads can change often.
365 365
366 + This enables --metric-only, unless overridden with --no-metric-only.
367 +
368 + The following restrictions only apply to older Intel CPUs and Atom,
369 + on newer CPUs (IceLake and later) TopDown can be collected for any thread:
370 +
366 371   The top down metrics are collected per core instead of per
367 372   CPU thread. Per core mode is automatically enabled
368 373   and -a (global monitoring) is needed, requiring root rights or
···
378 373   echo 0 > /proc/sys/kernel/nmi_watchdog
379 374   for best results. Otherwise the bottlenecks may be inconsistent
380 375   on workload with changing phases.
381 -
382 - This enables --metric-only, unless overridden with --no-metric-only.
383 376
384 377   To interpret the results it is usually needed to know on which
385 378   CPUs the workload runs on. If needed the CPUs can be forced using

+28 -2

tools/perf/builtin-stat.c

···
128 128       NULL,
129 129   };
130 130
131 + static const char *topdown_metric_attrs[] = {
132 +     "slots",
133 +     "topdown-retiring",
134 +     "topdown-bad-spec",
135 +     "topdown-fe-bound",
136 +     "topdown-be-bound",
137 +     NULL,
138 + };
139 +
131 140   static const char *smi_cost_attrs = {
132 141       "{"
133 142       "msr/aperf/,"
···
1686 1677   char *str = NULL;
1687 1678   bool warn = false;
1688 1679
1680 + if (!force_metric_only)
1681 +     stat_config.metric_only = true;
1682 +
1683 + if (topdown_filter_events(topdown_metric_attrs, &str, 1) < 0) {
1684 +     pr_err("Out of memory\n");
1685 +     return -1;
1686 + }
1687 + if (topdown_metric_attrs[0] && str) {
1688 +     if (!stat_config.interval && !stat_config.metric_only) {
1689 +         fprintf(stat_config.output,
1690 +             "Topdown accuracy may decrease when measuring long periods.\n"
1691 +             "Please print the result regularly, e.g. -I1000\n");
1692 +     }
1693 +     goto setup_metrics;
1694 + }
1695 +
1696 + zfree(&str);
1697 +
1689 1698   if (stat_config.aggr_mode != AGGR_GLOBAL &&
1690 1699       stat_config.aggr_mode != AGGR_CORE) {
1691 1700       pr_err("top down event configuration requires --per-core mode\n");
···
1715 1688       return -1;
1716 1689   }
1717 1690
1718 - if (!force_metric_only)
1719 -     stat_config.metric_only = true;
1720 1691   if (topdown_filter_events(topdown_attrs, &str,
1721 1692           arch_topdown_check_group(&warn)) < 0) {
1722 1693       pr_err("Out of memory\n");
···
1723 1698   if (topdown_attrs[0] && str) {
1724 1699       if (warn)
1725 1700           arch_topdown_group_warn();
1701 + setup_metrics:
1726 1702       err = parse_events(evsel_list, str, &errinfo);
1727 1703       if (err) {
1728 1704           fprintf(stderr,

+89

tools/perf/util/stat-shadow.c

···
241 241   else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
242 242       update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES,
243 243           ctx, cpu, count);
244 + else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING))
245 +     update_runtime_stat(st, STAT_TOPDOWN_RETIRING,
246 +         ctx, cpu, count);
247 + else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC))
248 +     update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC,
249 +         ctx, cpu, count);
250 + else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND))
251 +     update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND,
252 +         ctx, cpu, count);
253 + else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND))
254 +     update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND,
255 +         ctx, cpu, count);
244 256   else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
245 257       update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT,
246 258           ctx, cpu, count);
···
717 705       return sanitize_val(1.0 - sum);
718 706   }
719 707
708 + /*
709 +  * Kernel reports metrics multiplied with slots. To get back
710 +  * the ratios we need to recreate the sum.
711 +  */
712 +
713 + static double td_metric_ratio(int ctx, int cpu,
714 +         enum stat_type type,
715 +         struct runtime_stat *stat)
716 + {
717 +     double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) +
718 +         runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) +
719 +         runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) +
720 +         runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu);
721 +     double d = runtime_stat_avg(stat, type, ctx, cpu);
722 +
723 +     if (sum)
724 +         return d / sum;
725 +     return 0;
726 + }
727 +
728 + /*
729 +  * ... but only if most of the values are actually available.
730 +  * We allow two missing.
731 +  */
732 +
733 + static bool full_td(int ctx, int cpu,
734 +         struct runtime_stat *stat)
735 + {
736 +     int c = 0;
737 +
738 +     if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) > 0)
739 +         c++;
740 +     if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) > 0)
741 +         c++;
742 +     if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) > 0)
743 +         c++;
744 +     if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu) > 0)
745 +         c++;
746 +     return c >= 2;
747 + }
748 +
720 749   static void print_smi_cost(struct perf_stat_config *config,
721 750           int cpu, struct evsel *evsel,
722 751           struct perf_stat_output_ctx *out,
···
1126 1073               be_bound * 100.);
1127 1074       else
1128 1075           print_metric(config, ctxp, NULL, NULL, name, 0);
1076 + } else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) &&
1077 +         full_td(ctx, cpu, st)) {
1078 +     double retiring = td_metric_ratio(ctx, cpu,
1079 +             STAT_TOPDOWN_RETIRING, st);
1080 +
1081 +     if (retiring > 0.7)
1082 +         color = PERF_COLOR_GREEN;
1083 +     print_metric(config, ctxp, color, "%8.1f%%", "retiring",
1084 +             retiring * 100.);
1085 + } else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) &&
1086 +         full_td(ctx, cpu, st)) {
1087 +     double fe_bound = td_metric_ratio(ctx, cpu,
1088 +             STAT_TOPDOWN_FE_BOUND, st);
1089 +
1090 +     if (fe_bound > 0.2)
1091 +         color = PERF_COLOR_RED;
1092 +     print_metric(config, ctxp, color, "%8.1f%%", "frontend bound",
1093 +             fe_bound * 100.);
1094 + } else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) &&
1095 +         full_td(ctx, cpu, st)) {
1096 +     double be_bound = td_metric_ratio(ctx, cpu,
1097 +             STAT_TOPDOWN_BE_BOUND, st);
1098 +
1099 +     if (be_bound > 0.2)
1100 +         color = PERF_COLOR_RED;
1101 +     print_metric(config, ctxp, color, "%8.1f%%", "backend bound",
1102 +             be_bound * 100.);
1103 + } else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) &&
1104 +         full_td(ctx, cpu, st)) {
1105 +     double bad_spec = td_metric_ratio(ctx, cpu,
1106 +             STAT_TOPDOWN_BAD_SPEC, st);
1107 +
1108 +     if (bad_spec > 0.1)
1109 +         color = PERF_COLOR_RED;
1110 +     print_metric(config, ctxp, color, "%8.1f%%", "bad speculation",
1111 +             bad_spec * 100.);
1129 1112   } else if (evsel->metric_expr) {
1130 1113       generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
1131 1114               evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);

+4

tools/perf/util/stat.c

···
95 95       ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
96 96       ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
97 97       ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
98 +     ID(TOPDOWN_RETIRING, topdown-retiring),
99 +     ID(TOPDOWN_BAD_SPEC, topdown-bad-spec),
100 +     ID(TOPDOWN_FE_BOUND, topdown-fe-bound),
101 +     ID(TOPDOWN_BE_BOUND, topdown-be-bound),
98 102       ID(SMI_NUM, msr/smi/),
99 103       ID(APERF, msr/aperf/),
100 104   };

+8

tools/perf/util/stat.h

···
28 28       PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
29 29       PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
30 30       PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
31 +     PERF_STAT_EVSEL_ID__TOPDOWN_RETIRING,
32 +     PERF_STAT_EVSEL_ID__TOPDOWN_BAD_SPEC,
33 +     PERF_STAT_EVSEL_ID__TOPDOWN_FE_BOUND,
34 +     PERF_STAT_EVSEL_ID__TOPDOWN_BE_BOUND,
31 35       PERF_STAT_EVSEL_ID__SMI_NUM,
32 36       PERF_STAT_EVSEL_ID__APERF,
33 37       PERF_STAT_EVSEL_ID__MAX,
···
86 82       STAT_TOPDOWN_SLOTS_RETIRED,
87 83       STAT_TOPDOWN_FETCH_BUBBLES,
88 84       STAT_TOPDOWN_RECOVERY_BUBBLES,
85 +     STAT_TOPDOWN_RETIRING,
86 +     STAT_TOPDOWN_BAD_SPEC,
87 +     STAT_TOPDOWN_FE_BOUND,
88 +     STAT_TOPDOWN_BE_BOUND,
89 89       STAT_SMI_NUM,
90 90       STAT_APERF,
91 91       STAT_MAX

Configure Feed

Configure Feed