Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux

Pull perf tools fixes from Arnaldo Carvalho de Melo:

- Fail gracefully if BUILD_BPF_SKEL=1 is specified and clang isn't
  available

- Add empty 'struct rq' to 'perf lock contention' to satisfy libbpf
'runqueue' type verification. This feature is built only with
BUILD_BPF_SKEL=1

- Make vmlinux.h use bpf.h and perf_event.h in source directory, not
system ones that may be old and not have things like 'union
perf_sample_weight'

- Add system include paths to BPF builds to pick up things missing in
  the headers included by clang -target bpf

- Update various header copies with the kernel sources

- Change the behavior for divide-by-zero and unsupported events to show
  'nan'/'not counted' in 'perf stat' output.

This happens when using things like 'perf stat -M TopdownL2 true',
involving JSON metrics

- Update the no-event/no-metric expectations of the 'perf stat -ddd'
  perf test that are affected by using JSON metrics

- Avoid segv with 'perf stat --topdown' for metrics without a group

- Do not assume which events may have a PMU name, allowing the logic to
  keep an AUX event group together. Makes this use case work again:

$ perf record --no-bpf-event -c 10 -e '{intel_pt//,tlb_flush.stlb_any/aux-sample-size=8192/pp}:u' -- sleep 0.1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.078 MB perf.data ]
$ perf script -F-dso,+addr | grep -C5 tlb_flush.stlb_any | head -11
sleep 20444 [003] 7939.510243: 1 branches:uH: 7f5350cc82a2 dl_main+0x9a2 => 7f5350cb38f0 _dl_add_to_namespace_list+0x0
sleep 20444 [003] 7939.510243: 1 branches:uH: 7f5350cb3908 _dl_add_to_namespace_list+0x18 => 7f5350cbb080 rtld_mutex_dummy+0x0
sleep 20444 [003] 7939.510243: 1 branches:uH: 7f5350cc8350 dl_main+0xa50 => 0 [unknown]
sleep 20444 [003] 7939.510244: 1 branches:uH: 7f5350cc83ca dl_main+0xaca => 7f5350caeb60 _dl_process_pt_gnu_property+0x0
sleep 20444 [003] 7939.510245: 1 branches:uH: 7f5350caeb60 _dl_process_pt_gnu_property+0x0 => 0 [unknown]
sleep 20444 7939.510245: 10 tlb_flush.stlb_any/aux-sample-size=8192/pp: 0 7f5350caeb60 _dl_process_pt_gnu_property+0x0
sleep 20444 [003] 7939.510254: 1 branches:uH: 7f5350cc87fe dl_main+0xefe => 7f5350ccd240 strcmp+0x0
sleep 20444 [003] 7939.510254: 1 branches:uH: 7f5350cc8862 dl_main+0xf62 => 0 [unknown]

- Add a check for the above use case in 'perf test test_intel_pt'

- Fix the build with refcount checking on arm64; it was still accessing
  fields that need to be wrapped so that the refcounted struct gets
  checked

- Fix contextid validation in ARM's CS-ETM, so that older kernels
without that field can still be supported

- Skip unsupported aggregation for stat events found in perf.data files
in 'perf script'

- Add a stat test for record and script to check for the previous problem

- Remove needless debuginfod queries from 'perf test java symbol'; these
  were just making the test take a long time to complete

- Address python SafeConfigParser() deprecation warning in 'perf test
attr'

- Fix the '__NR_execve undeclared' build error in 'perf bench syscall' on i386

* tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (33 commits)
perf bench syscall: Fix __NR_execve undeclared build error
perf test attr: Fix python SafeConfigParser() deprecation warning
perf test attr: Update no event/metric expectations
tools headers disabled-features: Sync with the kernel sources
tools headers UAPI: Sync arch prctl headers with the kernel sources
tools headers: Update the copy of x86's mem{cpy,set}_64.S used in 'perf bench'
tools headers x86 cpufeatures: Sync with the kernel sources
tools headers UAPI: Sync s390 syscall table file that wires up the memfd_secret syscall
tools headers UAPI: Sync linux/prctl.h with the kernel sources
perf metrics: Avoid segv with --topdown for metrics without a group
perf lock contention: Add empty 'struct rq' to satisfy libbpf 'runqueue' type verification
perf cs-etm: Fix contextid validation
perf arm64: Fix build with refcount checking
perf test: Add stat test for record and script
perf script: Skip aggregation for stat events
perf build: Add system include paths to BPF builds
perf bpf skels: Make vmlinux.h use bpf.h and perf_event.h in source directory
perf parse-events: Do not break up AUX event group
perf test test_intel_pt.sh: Test sample mode with event with PMU name
perf evsel: Modify group pmu name for software events
...

+897 -354
+36
tools/arch/arm64/include/uapi/asm/kvm.h
···
 	__u64 reserved[2];
 };
 
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offset.
+ * To be used with KVM_ARM_SET_COUNTER_OFFSET.
+ */
+struct kvm_arm_counter_offset {
+	__u64 counter_offset;
+	__u64 reserved;
+};
+
 #define KVM_ARM_TAGS_TO_GUEST		0
 #define KVM_ARM_TAGS_FROM_GUEST		1
···
 #endif
 };
 
+/* Device Control API on vm fd */
+#define KVM_ARM_VM_SMCCC_CTRL		0
+#define   KVM_ARM_VM_SMCCC_FILTER	0
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
···
 #define KVM_ARM_VCPU_TIMER_CTRL		1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_HVTIMER	2
+#define   KVM_ARM_VCPU_TIMER_IRQ_HPTIMER	3
 #define KVM_ARM_VCPU_PVTIME_CTRL	2
 #define   KVM_ARM_VCPU_PVTIME_IPA	0
···
 
 /* run->fail_entry.hardware_entry_failure_reason codes. */
 #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED	(1ULL << 0)
+
+enum kvm_smccc_filter_action {
+	KVM_SMCCC_FILTER_HANDLE = 0,
+	KVM_SMCCC_FILTER_DENY,
+	KVM_SMCCC_FILTER_FWD_TO_USER,
+
+#ifdef __KERNEL__
+	NR_SMCCC_FILTER_ACTIONS
+#endif
+};
+
+struct kvm_smccc_filter {
+	__u32 base;
+	__u32 nr_functions;
+	__u8 action;
+	__u8 pad[15];
+};
+
+/* arm64-specific KVM_EXIT_HYPERCALL flags */
+#define KVM_HYPERCALL_EXIT_SMC		(1U << 0)
+#define KVM_HYPERCALL_EXIT_16BIT	(1U << 1)
 
 #endif
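To put the new ABI in context: a hypothetical userspace sketch of programming the offset on a VM file descriptor (vm_fd and the helper name are assumptions; the ioctl is only available when KVM_CAP_COUNTER_OFFSET is reported):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: vm_fd is assumed to come from KVM_CREATE_VM. */
static int set_guest_counter_offset(int vm_fd, uint64_t offset)
{
	struct kvm_arm_counter_offset off = {
		.counter_offset = offset,	/* offset applied to the guest's arch timer counter */
		.reserved = 0,			/* must be zero */
	};

	/* New VM ioctl from the hunk above; fails on kernels without the cap. */
	return ioctl(vm_fd, KVM_ARM_SET_COUNTER_OFFSET, &off);
}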
+21 -5
tools/arch/x86/include/asm/cpufeatures.h
···
 #define X86_FEATURE_SYSENTER32		( 3*32+15) /* "" sysenter in IA32 userspace */
 #define X86_FEATURE_REP_GOOD		( 3*32+16) /* REP microcode works well */
 #define X86_FEATURE_AMD_LBR_V2		( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_LFENCE_RDTSC	( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
 #define X86_FEATURE_ACC_POWER		( 3*32+19) /* AMD Accumulated Power Mechanism */
 #define X86_FEATURE_NOPL		( 3*32+20) /* The NOPL (0F 1F) instructions */
 #define X86_FEATURE_ALWAYS		( 3*32+21) /* "" Always-present feature */
···
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI		( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY	( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT			( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID		( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY	( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT			( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID		( 8*32+ 3) /* Intel Virtual Processor ID */
 
 #define X86_FEATURE_VMMCALL		( 8*32+15) /* Prefer VMMCALL to VMCALL */
 #define X86_FEATURE_XENPV		( 8*32+16) /* "" Xen paravirtual guest */
···
 #define X86_FEATURE_SGX_EDECCSSA	(11*32+18) /* "" SGX EDECCSSA user leaf function */
 #define X86_FEATURE_CALL_DEPTH		(11*32+19) /* "" Call depth tracking for RSB stuffing */
 #define X86_FEATURE_MSR_TSX_CTRL	(11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA		(11*32+21) /* "" Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC		(11*32+22) /* "" Bandwidth Monitoring Event Configuration */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
 #define X86_FEATURE_CMPCCXADD		(12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT	(12*32+ 8) /* "" Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM		(12*32+10) /* "" Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS		(12*32+11) /* "" Fast short REP STOSB */
+#define X86_FEATURE_FSRC		(12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
 #define X86_FEATURE_LKGS		(12*32+18) /* "" Load "kernel" (userspace) GS */
 #define X86_FEATURE_AMX_FP16		(12*32+21) /* "" AMX fp16 Support */
 #define X86_FEATURE_AVX_IFMA		(12*32+23) /* "" Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM			(12*32+26) /* Linear Address Masking */
 
 /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
 #define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
···
 #define X86_FEATURE_VIRT_SSBD		(13*32+25) /* Virtualized Speculative Store Bypass Disable */
 #define X86_FEATURE_AMD_SSB_NO		(13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
 #define X86_FEATURE_CPPC		(13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD		(13*32+28) /* "" Predictive Store Forwarding Disable */
 #define X86_FEATURE_BTC_NO		(13*32+29) /* "" Not vulnerable to Branch Type Confusion */
 #define X86_FEATURE_BRS			(13*32+31) /* Branch Sampling available */
···
 #define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
 #define X86_FEATURE_X2AVIC		(15*32+18) /* Virtual x2apic */
 #define X86_FEATURE_V_SPEC_CTRL		(15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI		(15*32+25) /* Virtual NMI */
 #define X86_FEATURE_SVME_ADDR_CHK	(15*32+28) /* "" SVME addr check */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
···
 #define X86_FEATURE_V_TSC_AUX		(19*32+ 9) /* "" Virtual TSC_AUX */
 #define X86_FEATURE_SME_COHERENT	(19*32+10) /* "" AMD hardware-enforced cache coherency */
 
+/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
+#define X86_FEATURE_NO_NESTED_DATA_BP	(20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_LFENCE_RDTSC	(20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE	(20*32+ 6) /* "" Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS		(20*32+ 8) /* "" Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR	(20*32+ 9) /* "" SMM_CTL MSR is not present */
+
 /*
  * BUG word(s)
  */
···
 #define X86_BUG_MMIO_UNKNOWN		X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
 #define X86_BUG_RETBLEED		X86_BUG(27) /* CPU is affected by RETBleed */
 #define X86_BUG_EIBRS_PBRSB		X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB			X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
+7 -1
tools/arch/x86/include/asm/disabled-features.h
···
 # define DISABLE_CALL_DEPTH_TRACKING	(1 << (X86_FEATURE_CALL_DEPTH & 31))
 #endif
 
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM		0
+#else
+# define DISABLE_LAM		(1 << (X86_FEATURE_LAM & 31))
+#endif
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 # define DISABLE_ENQCMD		0
 #else
···
 #define DISABLED_MASK10	0
 #define DISABLED_MASK11	(DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
			 DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12	0
+#define DISABLED_MASK12	(DISABLE_LAM)
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
+2
tools/arch/x86/include/asm/msr-index.h
···
 
 /* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
 #define MSR_INTEGRITY_CAPS			0x000002d9
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT	2
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST		BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT	4
 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST	BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+3
tools/arch/x86/include/uapi/asm/kvm.h
···
 #define KVM_VCPU_TSC_CTRL	0 /* control group for the timestamp counter (TSC) */
 #define   KVM_VCPU_TSC_OFFSET	0 /* attribute for the TSC offset */
 
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE	BIT(0)
+
 #endif /* _ASM_X86_KVM_H */
+8
tools/arch/x86/include/uapi/asm/prctl.h
···
 #define ARCH_GET_XCOMP_GUEST_PERM	0x1024
 #define ARCH_REQ_XCOMP_GUEST_PERM	0x1025
 
+#define ARCH_XCOMP_TILECFG		17
+#define ARCH_XCOMP_TILEDATA		18
+
 #define ARCH_MAP_VDSO_X32		0x2001
 #define ARCH_MAP_VDSO_32		0x2002
 #define ARCH_MAP_VDSO_64		0x2003
+
+#define ARCH_GET_UNTAG_MASK		0x4001
+#define ARCH_ENABLE_TAGGED_ADDR		0x4002
+#define ARCH_GET_MAX_TAG_BITS		0x4003
+#define ARCH_FORCE_TAGGED_SVA		0x4004
 
 #endif /* _ASM_X86_PRCTL_H */
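For context, a minimal sketch of how userspace might exercise the new 0x4001-0x4004 requests, assuming an x86-64 kernel built with CONFIG_ADDRESS_MASKING on LAM-capable hardware; the constants are defined locally in case installed headers predate this sync:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_UNTAG_MASK	0x4001
#define ARCH_ENABLE_TAGGED_ADDR	0x4002

int main(void)
{
	unsigned long untag_mask = 0;

	/* Ask for 6 tag bits (LAM_U57); expected to fail on older kernels/CPUs. */
	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6UL))
		perror("ARCH_ENABLE_TAGGED_ADDR");

	/* Read back the mask of address bits that stay significant. */
	if (!syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &untag_mask))
		printf("untag mask: %#lx\n", untag_mask);
	return 0;
}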
+3
tools/arch/x86/include/uapi/asm/unistd_32.h
···
 #ifndef __NR_fork
 #define __NR_fork 2
 #endif
+#ifndef __NR_execve
+#define __NR_execve 11
+#endif
 #ifndef __NR_getppid
 #define __NR_getppid 64
 #endif
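The guard matters because 'perf bench syscall' invokes system calls by raw number; a sketch of that pattern (not the actual bench code) under the i386 numbering above:

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_execve
#define __NR_execve 11	/* i386 value, mirroring the fallback above */
#endif

/* Invoke execve via its raw syscall number; returns only on failure. */
static long raw_execve(const char *path, char *const argv[], char *const envp[])
{
	return syscall(__NR_execve, path, argv, envp);
}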
+10 -24
tools/arch/x86/lib/memcpy_64.S
···
 .section .noinstr.text, "ax"
 
 /*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
-/*
  * memcpy - Copy a memory block.
  *
  * Input:
···
 *
 * Output:
 * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
 */
 SYM_TYPED_FUNC_START(__memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
 
 	movq %rdi, %rax
 	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
 	rep movsb
 	RET
 SYM_FUNC_END(__memcpy)
···
 
 SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	RET
-SYM_FUNC_END(memcpy_erms)
 
 SYM_FUNC_START_LOCAL(memcpy_orig)
 	movq %rdi, %rax
+11 -36
tools/arch/x86/lib/memset_64.S
···
 * rdx   count (bytes)
 *
 * rax   original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
 */
 SYM_FUNC_START(__memset)
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-	 * to use it when possible. If not available, use fast string instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memset_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
 
 	movq %rdi,%r9
+	movb %sil,%al
 	movq %rdx,%rcx
-	andl $7,%edx
-	shrq $3,%rcx
-	/* expand byte value */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	imulq %rsi,%rax
-	rep stosq
-	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	RET
···
 
 SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)
-
-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi   destination
- * rsi   value (char)
- * rdx   count (bytes)
- *
- * rax   original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-	movq %rdi,%r9
-	movb %sil,%al
-	movq %rdx,%rcx
-	rep stosb
-	movq %r9,%rax
-	RET
-SYM_FUNC_END(memset_erms)
 
 SYM_FUNC_START_LOCAL(memset_orig)
 	movq %rdi,%r10
+1 -2
tools/include/asm/alternative.h
···
 
 /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
 
-#define altinstruction_entry #
-#define ALTERNATIVE_2 #
+#define ALTERNATIVE #
 
 #endif
+55 -2
tools/include/uapi/drm/drm.h
···
 #define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
 #define DRM_IOCTL_SET_VERSION		DRM_IOWR(0x07, struct drm_set_version)
 #define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+/**
+ * DRM_IOCTL_GEM_CLOSE - Close a GEM handle.
+ *
+ * GEM handles are not reference-counted by the kernel. User-space is
+ * responsible for managing their lifetime. For example, if user-space imports
+ * the same memory object twice on the same DRM file description, the same GEM
+ * handle is returned by both imports, and user-space needs to ensure
+ * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen
+ * when a memory object is allocated, then exported and imported again on the
+ * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception
+ * and always returns fresh new GEM handles even if an existing GEM handle
+ * already refers to the same memory object before the IOCTL is performed.
+ */
 #define DRM_IOCTL_GEM_CLOSE		DRM_IOW (0x09, struct drm_gem_close)
 #define DRM_IOCTL_GEM_FLINK		DRM_IOWR(0x0a, struct drm_gem_flink)
 #define DRM_IOCTL_GEM_OPEN		DRM_IOWR(0x0b, struct drm_gem_open)
···
 #define DRM_IOCTL_UNLOCK		DRM_IOW( 0x2b, struct drm_lock)
 #define DRM_IOCTL_FINISH		DRM_IOW( 0x2c, struct drm_lock)
 
+/**
+ * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD.
+ *
+ * User-space sets &drm_prime_handle.handle with the GEM handle to export and
+ * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in
+ * &drm_prime_handle.fd.
+ *
+ * The export can fail for any driver-specific reason, e.g. because export is
+ * not supported for this specific GEM handle (but might be for others).
+ *
+ * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT.
+ */
 #define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+/**
+ * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle.
+ *
+ * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to
+ * import, and gets back a GEM handle in &drm_prime_handle.handle.
+ * &drm_prime_handle.flags is unused.
+ *
+ * If an existing GEM handle refers to the memory object backing the DMA-BUF,
+ * that GEM handle is returned. Therefore user-space which needs to handle
+ * arbitrary DMA-BUFs must have a user-space lookup data structure to manually
+ * reference-count duplicated GEM handles. For more information see
+ * &DRM_IOCTL_GEM_CLOSE.
+ *
+ * The import can fail for any driver-specific reason, e.g. because import is
+ * only supported for DMA-BUFs allocated on this DRM device.
+ *
+ * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT.
+ */
 #define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
 
 #define DRM_IOCTL_AGP_ACQUIRE		DRM_IO(  0x30)
···
 * struct as the output.
 *
 * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
- * will be filled with GEM buffer handles. Planes are valid until one has a
- * zero handle -- this can be used to compute the number of planes.
+ * will be filled with GEM buffer handles. Fresh new GEM handles are always
+ * returned, even if another GEM handle referring to the same memory object
+ * already exists on the DRM file description. The caller is responsible for
+ * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
+ * new handle will be returned for multiple planes in case they use the same
+ * memory object. Planes are valid until one has a zero handle -- this can be
+ * used to compute the number of planes.
 *
 * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
 * until one has a zero &drm_mode_fb_cmd2.pitches.
···
 * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
 * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
 * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
+ *
+ * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space
+ * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately
+ * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not
+ * double-close handles which are specified multiple times in the array.
 */
 #define DRM_IOCTL_MODE_GETFB2		DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
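The export-then-close dance those comments describe might look like the following hypothetical sketch (drm_fd, the helper name, and the single-handle assumption are illustrative):

#include <fcntl.h>	/* O_CLOEXEC, used by DRM_CLOEXEC */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/drm.h>	/* assumed UAPI header location */

/* Export a GEM handle as a DMA-BUF FD, then drop the handle so it does
 * not leak; per the comments above, close once per unique handle only. */
static int export_and_close(int drm_fd, uint32_t handle)
{
	struct drm_prime_handle prime = {
		.handle = handle,
		.flags = DRM_CLOEXEC,
	};
	struct drm_gem_close close_req = { .handle = handle };

	if (ioctl(drm_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime) < 0)
		return -1;

	ioctl(drm_fd, DRM_IOCTL_GEM_CLOSE, &close_req);
	return prime.fd;	/* caller owns the DMA-BUF FD */
}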
+24 -1
tools/include/uapi/drm/i915_drm.h
···
 #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
 #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
 #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
-	struct i915_engine_class_instance engines[0];
+	struct i915_engine_class_instance engines[];
 } __attribute__((packed));
 
 #define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
···
 	I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
 	I915_OA_FORMAT_A24u40_A14u32_B8_C8,
 
+	/* MTL OAM */
+	I915_OAM_FORMAT_MPEC8u64_B8_C8,
+	I915_OAM_FORMAT_MPEC8u32_B8_C8,
+
 	I915_OA_FORMAT_MAX	    /* non-ABI */
 };
···
 	 * This property is available in perf revision 5.
 	 */
 	DRM_I915_PERF_PROP_POLL_OA_PERIOD,
+
+	/**
+	 * Multiple engines may be mapped to the same OA unit. The OA unit is
+	 * identified by class:instance of any engine mapped to it.
+	 *
+	 * This parameter specifies the engine class and must be passed along
+	 * with DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE.
+	 *
+	 * This property is available in perf revision 6.
+	 */
+	DRM_I915_PERF_PROP_OA_ENGINE_CLASS,
+
+	/**
+	 * This parameter specifies the engine instance and must be passed along
+	 * with DRM_I915_PERF_PROP_OA_ENGINE_CLASS.
+	 *
+	 * This property is available in perf revision 6.
+	 */
+	DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE,
 
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
+1 -1
tools/include/uapi/linux/const.h
···
 #define _BITUL(x)	(_UL(1) << (x))
 #define _BITULL(x)	(_ULL(1) << (x))
 
-#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
 #define __KERNEL_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
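The change only swaps `typeof` for the `__typeof__` spelling, which also compiles in strict-ISO userspace builds; behavior is unchanged, as this self-contained check shows:

#include <assert.h>

#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))

int main(void)
{
	/* Round up to the next multiple of a power-of-two alignment. */
	assert(__ALIGN_KERNEL(13UL, 8UL) == 16UL);
	assert(__ALIGN_KERNEL(16UL, 8UL) == 16UL);	/* already aligned */
	return 0;
}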
+1
tools/include/uapi/linux/in.h
···
 #define MCAST_MSFILTER			48
 #define IP_MULTICAST_ALL		49
 #define IP_UNICAST_IF			50
+#define IP_LOCAL_PORT_RANGE		51
 
 #define MCAST_EXCLUDE	0
 #define MCAST_INCLUDE	1
+10 -2
tools/include/uapi/linux/kvm.h
···
 			__u64 nr;
 			__u64 args[6];
 			__u64 ret;
-			__u32 longmode;
-			__u32 pad;
+
+			union {
+#ifndef __KERNEL__
+				__u32 longmode;
+#endif
+				__u64 flags;
+			};
 		} hypercall;
 		/* KVM_EXIT_TPR_ACCESS */
 		struct {
···
 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
+#define KVM_CAP_COUNTER_OFFSET 227
 
 #ifdef KVM_CAP_IRQ_ROUTING
···
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
 #define KVM_ARM_MTE_COPY_TAGS	  _IOR(KVMIO,  0xb4, struct kvm_arm_copy_mte_tags)
+/* Available with KVM_CAP_COUNTER_OFFSET */
+#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO,  0xb5, struct kvm_arm_counter_offset)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
+2
tools/include/uapi/linux/prctl.h
···
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+#define PR_GET_AUXV			0x41555856
+
 #define PR_SET_MEMORY_MERGE		67
 #define PR_GET_MEMORY_MERGE		68
 #endif /* _LINUX_PRCTL_H */
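PR_GET_AUXV (0x41555856 is ASCII "AUXV") copies the process's ELF auxiliary vector into a user buffer. A hedged sketch, assuming a kernel new enough to support the call; per the patch that introduced it, the return value is the full auxv size in bytes:

#include <elf.h>
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_AUXV
#define PR_GET_AUXV 0x41555856	/* mirrors the new define above */
#endif

int main(void)
{
	Elf64_auxv_t auxv[64];

	/* Copies up to sizeof(auxv) bytes; trailing arguments must be zero. */
	long n = prctl(PR_GET_AUXV, auxv, sizeof(auxv), 0, 0);
	if (n < 0) {
		perror("PR_GET_AUXV");
		return 1;
	}
	printf("full auxv size: %ld bytes\n", n);
	return 0;
}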
+10 -4
tools/include/uapi/sound/asound.h
···
 	snd_pcm_uframes_t avail_min;		/* min avail frames for wakeup */
 	snd_pcm_uframes_t xfer_align;		/* obsolete: xfer size need to be a multiple */
 	snd_pcm_uframes_t start_threshold;	/* min hw_avail frames for automatic start */
-	snd_pcm_uframes_t stop_threshold;	/* min avail frames for automatic stop */
-	snd_pcm_uframes_t silence_threshold;	/* min distance from noise for silence filling */
-	snd_pcm_uframes_t silence_size;		/* silence block size */
+	/*
+	 * The following two thresholds alleviate playback buffer underruns; when
+	 * hw_avail drops below the threshold, the respective action is triggered:
+	 */
+	snd_pcm_uframes_t stop_threshold;	/* - stop playback */
+	snd_pcm_uframes_t silence_threshold;	/* - pre-fill buffer with silence */
+	snd_pcm_uframes_t silence_size;		/* max size of silence pre-fill; when >= boundary,
+						 * fill played area with silence immediately */
 	snd_pcm_uframes_t boundary;		/* pointers wrap point */
 	unsigned int proto;			/* protocol version */
 	unsigned int tstamp_type;		/* timestamp type (req. proto >= 2.0.12) */
···
 struct __snd_pcm_mmap_control64 {
 	__pad_before_uframe __pad1;
 	snd_pcm_uframes_t appl_ptr;	/* RW: appl ptr (0...boundary-1) */
-	__pad_before_uframe __pad2;
+	__pad_before_uframe __pad2;	// This should be __pad_after_uframe, but binary
+					// backwards compatibility constraints prevent a fix.
 
 	__pad_before_uframe __pad3;
 	snd_pcm_uframes_t avail_min;	/* RW: min available frames for wakeup */
+6
tools/perf/Makefile.config
···
     dummy := $(error Error: $(BISON) is missing on this system, please install it)
   endif
 
+  ifeq ($(BUILD_BPF_SKEL),1)
+    ifeq ($(call get-executable,$(CLANG)),)
+      dummy := $(error $(CLANG) is missing on this system, please install it to be able to build with BUILD_BPF_SKEL=1)
+    endif
+  endif
+
   ifneq ($(OUTPUT),)
     ifeq ($(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \>\= 371), 1)
       BISON_FILE_PREFIX_MAP := --file-prefix-map=$(OUTPUT)=
+20 -2
tools/perf/Makefile.perf
···
 
 ifdef BUILD_BPF_SKEL
   BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
-  BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE)
+  # Get Clang's default includes on this system, as opposed to those seen by
+  # '-target bpf'. This fixes "missing" files on some architectures/distros,
+  # such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+  #
+  # Use '-idirafter': Don't interfere with include mechanics except where the
+  # build would have failed anyways.
+  define get_sys_includes
+  $(shell $(1) $(2) -v -E - </dev/null 2>&1 \
+          | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+  $(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+  endef
+
+  ifneq ($(CROSS_COMPILE),)
+    CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
+  endif
+
+  CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
+  BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES)
+  TOOLS_UAPI_INCLUDE := -I$(srctree)/tools/include/uapi
 
 $(BPFTOOL): | $(SKEL_TMP_OUT)
 	$(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \
 		OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
 
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
-	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
 	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
 
 $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
+4 -5
tools/perf/arch/arm/util/cs-etm.c
···
 	char path[PATH_MAX];
 	int err;
 	u32 val;
-	u64 contextid =
-		evsel->core.attr.config &
-		(perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") |
+	u64 contextid = evsel->core.attr.config &
+		(perf_pmu__format_bits(&cs_etm_pmu->format, "contextid") |
+		 perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") |
 		 perf_pmu__format_bits(&cs_etm_pmu->format, "contextid2"));
 
 	if (!contextid)
···
 	 * 0b00100 Maximum of 32-bit Context ID size.
 	 * All other values are reserved.
 	 */
-	val = BMVAL(val, 5, 9);
-	if (!val || val != 0x4) {
+	if (BMVAL(val, 5, 9) != 0x4) {
 		pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n",
 		       CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
 		return -EINVAL;
+2 -2
tools/perf/arch/arm64/util/header.c
···
 	char path[PATH_MAX];
 	FILE *file;
 
-	scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR,
-			sysfs, cpus->map[cpu]);
+	scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d" MIDR,
+		  sysfs, RC_CHK_ACCESS(cpus)->map[cpu].cpu);
 
 	file = fopen(path, "r");
 	if (!file) {
+1 -1
tools/perf/arch/arm64/util/pmu.c
···
 	 * The cpumap should cover all CPUs. Otherwise, some CPUs may
 	 * not support some events or have different event IDs.
 	 */
-	if (pmu->cpus->nr != cpu__max_cpu().cpu)
+	if (RC_CHK_ACCESS(pmu->cpus)->nr != cpu__max_cpu().cpu)
 		return NULL;
 
 	return pmu;
+1 -1
tools/perf/arch/s390/entry/syscalls/syscall.tbl
···
 444  common	landlock_create_ruleset	sys_landlock_create_ruleset	sys_landlock_create_ruleset
 445  common	landlock_add_rule	sys_landlock_add_rule		sys_landlock_add_rule
 446  common	landlock_restrict_self	sys_landlock_restrict_self	sys_landlock_restrict_self
-# 447 reserved for memfd_secret
+447  common	memfd_secret		sys_memfd_secret		sys_memfd_secret
 448  common	process_mrelease	sys_process_mrelease		sys_process_mrelease
 449  common	futex_waitv		sys_futex_waitv			sys_futex_waitv
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
-4
tools/perf/bench/mem-memcpy-x86-64-asm-def.h
···
 MEMCPY_FN(__memcpy,
 	"x86-64-movsq",
 	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-
-MEMCPY_FN(memcpy_erms,
-	"x86-64-movsb",
-	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
+1 -1
tools/perf/bench/mem-memcpy-x86-64-asm.S
···
 
 /* Various wrappers to make the kernel .S file build in user-space: */
 
-// memcpy_orig and memcpy_erms are being defined as SYM_L_LOCAL but we need it
+// memcpy_orig is being defined as SYM_L_LOCAL but we need it
 #define SYM_FUNC_START_LOCAL(name)                      \
         SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
-4
tools/perf/bench/mem-memset-x86-64-asm-def.h
···
 MEMSET_FN(__memset,
 	"x86-64-stosq",
 	"movsq-based memset() in arch/x86/lib/memset_64.S")
-
-MEMSET_FN(memset_erms,
-	"x86-64-stosb",
-	"movsb-based memset() in arch/x86/lib/memset_64.S")
+1 -1
tools/perf/bench/mem-memset-x86-64-asm.S
···
 /* SPDX-License-Identifier: GPL-2.0 */
-// memset_orig and memset_erms are being defined as SYM_L_LOCAL but we need it
+// memset_orig is being defined as SYM_L_LOCAL but we need it
 #define SYM_FUNC_START_LOCAL(name)                      \
         SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memset MEMSET /* don't hide glibc's memset() */
+7
tools/perf/builtin-script.c
···
 				     union perf_event *event)
 {
 	perf_event__read_stat_config(&stat_config, &event->stat_config);
+
+	/*
+	 * Aggregation modes are not used since post-processing scripts are
+	 * supposed to take care of such requirements
+	 */
+	stat_config.aggr_mode = AGGR_NONE;
+
 	return 0;
 }
+29 -9
tools/perf/builtin-stat.c
···
 			evsel_list->core.threads->err_thread = -1;
 			return COUNTER_RETRY;
 		}
+	} else if (counter->skippable) {
+		if (verbose > 0)
+			ui__warning("skipping event %s that kernel failed to open .\n",
+				    evsel__name(counter));
+		counter->supported = false;
+		counter->errored = true;
+		return COUNTER_SKIP;
 	}
 
 	evsel__open_strerror(counter, &target, errno, msg, sizeof(msg));
···
 	 * caused by exposing latent bugs. This is fixed properly in:
 	 * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/
 	 */
-	if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid() &&
-	    metricgroup__parse_groups(evsel_list, "TopdownL1",
-				    /*metric_no_group=*/false,
-				    /*metric_no_merge=*/false,
-				    /*metric_no_threshold=*/true,
-				    stat_config.user_requested_cpu_list,
-				    stat_config.system_wide,
-				    &stat_config.metric_events) < 0)
-		return -1;
+	if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) {
+		struct evlist *metric_evlist = evlist__new();
+		struct evsel *metric_evsel;
+
+		if (!metric_evlist)
+			return -1;
+
+		if (metricgroup__parse_groups(metric_evlist, "TopdownL1",
+					      /*metric_no_group=*/false,
+					      /*metric_no_merge=*/false,
+					      /*metric_no_threshold=*/true,
+					      stat_config.user_requested_cpu_list,
+					      stat_config.system_wide,
+					      &stat_config.metric_events) < 0)
+			return -1;
+
+		evlist__for_each_entry(metric_evlist, metric_evsel) {
+			metric_evsel->skippable = true;
+		}
+		evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
+		evlist__delete(metric_evlist);
+	}
 
 	/* Platform specific attrs */
 	if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0)
+26
tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound_aux",
     "MetricThreshold": "tma_backend_bound_aux > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_base",
     "MetricThreshold": "tma_base > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_ms_uops",
     "MetricThreshold": "tma_ms_uops > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
     "MetricName": "tma_resource_bound",
     "MetricThreshold": "tma_resource_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count.",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.75",
+    "MetricgroupNoGroup": "TopdownL1",
     "ScaleUnit": "100%",
     "Unit": "cpu_atom"
 },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
     "ScaleUnit": "100%",
     "Unit": "cpu_core"
+14
tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
     "ScaleUnit": "100%"
 },
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound_aux",
     "MetricThreshold": "tma_backend_bound_aux > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
     "ScaleUnit": "100%"
 },
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
     "ScaleUnit": "100%"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_base",
     "MetricThreshold": "tma_base > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "ScaleUnit": "100%"
 },
 {
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_ms_uops",
     "MetricThreshold": "tma_ms_uops > 0.05",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
     "ScaleUnit": "100%"
 },
···
     "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
     "MetricName": "tma_resource_bound",
     "MetricThreshold": "tma_resource_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count.",
     "ScaleUnit": "100%"
 },
···
     "MetricGroup": "TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.75",
+    "MetricgroupNoGroup": "TopdownL1",
     "ScaleUnit": "100%"
 },
 {
tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json (+12 lines)
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
     "ScaleUnit": "100%"
   },
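The MetricThreshold strings in these hunks combine sub-metric comparisons with '&' and '|', e.g. "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15". perf evaluates these with its own expression parser in C (tools/perf/util/expr.*); the Python stand-in below only illustrates the intended semantics, with '&'/'|' acting as logical and/or over already-computed metric values. The values dict is a made-up example:

# Illustrative only: a stand-in for perf's C metric-expression evaluator,
# showing how a MetricThreshold string is meant to be read.
values = {                      # hypothetical measured fractions of slots
    "tma_branch_mispredicts": 0.12,
    "tma_bad_speculation": 0.18,
}

def crosses(threshold: str) -> bool:
    # Map the metric-expression operators onto Python booleans and evaluate
    # against the (assumed) computed metric values.
    expr = threshold.replace("&", " and ").replace("|", " or ")
    return bool(eval(expr, {"__builtins__": {}}, values))

print(crosses("tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15"))
# -> True: this metric would be flagged as over its threshold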
tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json (+12 lines)
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
     "ScaleUnit": "100%"
   },
tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json (+12 lines)
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
     "ScaleUnit": "100%"
   },
tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json (+12 lines)
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
     "ScaleUnit": "100%"
   },
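The same invariant repeats file after file in this commit: every metric whose MetricGroup contains TopdownL1 or TopdownL2 gains a matching MetricgroupNoGroup value. A hedged checker sketch for that invariant (inferred from the hunks above, not a tool shipped with perf) that can be pointed at any of the JSON files in this series:

# Sketch: verify the invariant this commit appears to establish -- every
# TopdownL1/TopdownL2 metric carries the matching "MetricgroupNoGroup".
# This is an inferred consistency check, not part of the perf tree.
import json
import sys

def check(path: str) -> int:
    bad = 0
    with open(path) as f:
        for m in json.load(f):
            groups = m.get("MetricGroup", "").split(";")
            level = next((g for g in groups
                          if g in ("TopdownL1", "TopdownL2")), None)
            if level and m.get("MetricgroupNoGroup") != level:
                print(f'{path}: {m["MetricName"]}: expected '
                      f'MetricgroupNoGroup={level}')
                bad += 1
    return bad

if __name__ == "__main__":
    sys.exit(1 if sum(check(p) for p in sys.argv[1:]) else 0)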
tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json (+12 lines)
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_backend_bound",
     "MetricThreshold": "tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_bad_speculation",
     "MetricThreshold": "tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
     "MetricName": "tma_branch_mispredicts",
     "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_core_bound",
     "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
     "MetricName": "tma_fetch_bandwidth",
     "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
     "MetricName": "tma_fetch_latency",
     "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_frontend_bound",
     "MetricThreshold": "tma_frontend_bound > 0.15",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_heavy_operations",
     "MetricThreshold": "tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
     "MetricName": "tma_light_operations",
     "MetricThreshold": "tma_light_operations > 0.6",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
     "MetricName": "tma_machine_clears",
     "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
     "MetricName": "tma_memory_bound",
     "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+    "MetricgroupNoGroup": "TopdownL2",
     "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
     "ScaleUnit": "100%"
   },
···
     "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
     "MetricName": "tma_retiring",
     "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+    "MetricgroupNoGroup": "TopdownL1",
     "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
     "ScaleUnit": "100%"
   },
tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json (+12 lines)
··· 103 103 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 104 104 "MetricName": "tma_backend_bound", 105 105 "MetricThreshold": "tma_backend_bound > 0.2", 106 + "MetricgroupNoGroup": "TopdownL1", 106 107 "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", 107 108 "ScaleUnit": "100%" 108 109 }, ··· 113 112 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 114 113 "MetricName": "tma_bad_speculation", 115 114 "MetricThreshold": "tma_bad_speculation > 0.15", 115 + "MetricgroupNoGroup": "TopdownL1", 116 116 "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", 117 117 "ScaleUnit": "100%" 118 118 }, ··· 124 122 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 125 123 "MetricName": "tma_branch_mispredicts", 126 124 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 125 + "MetricgroupNoGroup": "TopdownL2", 127 126 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", 128 127 "ScaleUnit": "100%" 129 128 }, ··· 164 161 "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 165 162 "MetricName": "tma_core_bound", 166 163 "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", 164 + "MetricgroupNoGroup": "TopdownL2", 167 165 "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
         "MetricName": "tma_fetch_bandwidth",
         "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
         "MetricName": "tma_fetch_latency",
         "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_frontend_bound",
         "MetricThreshold": "tma_frontend_bound > 0.15",
+        "MetricgroupNoGroup": "TopdownL1",
         "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
         "MetricName": "tma_heavy_operations",
         "MetricThreshold": "tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
         "MetricName": "tma_light_operations",
         "MetricThreshold": "tma_light_operations > 0.6",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
         "MetricName": "tma_machine_clears",
         "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
         "MetricName": "tma_memory_bound",
         "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
+        "MetricgroupNoGroup": "TopdownL2",
         "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
         "ScaleUnit": "100%"
     },
···
         "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
         "MetricName": "tma_retiring",
         "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
+        "MetricgroupNoGroup": "TopdownL1",
         "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
         "ScaleUnit": "100%"
     },
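Reassembled, a post-patch entry is an ordinary JSON object. Below is the tma_frontend_bound entry from the hunk above with the new key in place; the metric's "MetricExpr" field sits outside the quoted diff context and the long description is truncated, so both are abridged here:

    {
        "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
        "MetricName": "tma_frontend_bound",
        "MetricThreshold": "tma_frontend_bound > 0.15",
        "MetricgroupNoGroup": "TopdownL1",
        "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. [...]",
        "ScaleUnit": "100%"
    },

Note that the value names a metric group rather than being a boolean, which suggests the opt-out applies only when the metric is expanded via that group (e.g. perf stat -M TopdownL1), not when the metric is requested on its own.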
tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json (+12)

Each of the remaining files gets the same treatment, so from here on only the metric touched and the line added under it are shown; every insertion again lands between the metric's "MetricThreshold" and "PublicDescription" fields, whose text is unchanged.

    tma_backend_bound       +        "MetricgroupNoGroup": "TopdownL1",
    tma_bad_speculation     +        "MetricgroupNoGroup": "TopdownL1",
    tma_branch_mispredicts  +        "MetricgroupNoGroup": "TopdownL2",
    tma_core_bound          +        "MetricgroupNoGroup": "TopdownL2",
    tma_fetch_bandwidth     +        "MetricgroupNoGroup": "TopdownL2",
    tma_fetch_latency       +        "MetricgroupNoGroup": "TopdownL2",
    tma_frontend_bound      +        "MetricgroupNoGroup": "TopdownL1",
    tma_heavy_operations    +        "MetricgroupNoGroup": "TopdownL2",
    tma_light_operations    +        "MetricgroupNoGroup": "TopdownL2",
    tma_machine_clears      +        "MetricgroupNoGroup": "TopdownL2",
    tma_memory_bound        +        "MetricgroupNoGroup": "TopdownL2",
    tma_retiring            +        "MetricgroupNoGroup": "TopdownL1",
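The one per-generation variation in these otherwise uniform hunks hides in the unchanged "MetricThreshold" context of tma_fetch_bandwidth: the IPC term is divided by what the descriptions call Pipeline_Width before being compared, so the cutoff marks the same slot-utilization point on every part. Worked out per divisor:

    tma_info_ipc / 4 > 0.35  ->  IPC > 1.4   (the file above, ivybridge, ivytown, jaketown, sandybridge)
    tma_info_ipc / 5 > 0.35  ->  IPC > 1.75  (icelake, icelakex)
    tma_info_ipc / 6 > 0.35  ->  IPC > 2.1   (sapphirerapids)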
tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json (+12)

    Same twelve additions as icl-metrics.json above, at the same TopdownL1/TopdownL2 levels.
tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json (+12)

    Same twelve additions as icl-metrics.json above, at the same TopdownL1/TopdownL2 levels.
tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json (+12)

    Same twelve additions as icl-metrics.json above, at the same TopdownL1/TopdownL2 levels.
tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json (+12)

    Same twelve additions as icl-metrics.json above, at the same TopdownL1/TopdownL2 levels.
tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json (+12)

    Same twelve additions as icl-metrics.json above, at the same TopdownL1/TopdownL2 levels.
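With the same twelve-line pattern repeating file after file, the invariant is simple enough to check mechanically. A minimal standalone sketch, assuming (as the hunks show) that each *-metrics.json file is a flat JSON array of metric objects; the check_nogroup.py name and the check itself are illustrative, not part of the kernel's jevents tooling:

    #!/usr/bin/env python3
    # check_nogroup.py (hypothetical): flag TopdownL1/L2 metrics that lack
    # the matching "MetricgroupNoGroup" value in a pmu-events metrics file.
    import json
    import sys

    def missing_nogroup(path):
        """Yield (metric name, expected level) pairs violating the invariant."""
        with open(path) as f:
            entries = json.load(f)  # each *-metrics.json is a flat JSON array
        for entry in entries:
            groups = entry.get("MetricGroup", "").split(";")
            for level in ("TopdownL1", "TopdownL2"):
                if level in groups and entry.get("MetricgroupNoGroup") != level:
                    yield entry.get("MetricName", "<unnamed>"), level

    if __name__ == "__main__":
        # e.g.: python3 check_nogroup.py tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json
        for name, level in missing_nogroup(sys.argv[1]):
            print(f'{name}: expected "MetricgroupNoGroup": "{level}"')

After this commit the script should print nothing for the files listed here.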
+12
tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json
··· 87 87 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 88 88 "MetricName": "tma_backend_bound", 89 89 "MetricThreshold": "tma_backend_bound > 0.2", 90 + "MetricgroupNoGroup": "TopdownL1", 90 91 "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", 91 92 "ScaleUnit": "100%" 92 93 }, ··· 97 96 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 98 97 "MetricName": "tma_bad_speculation", 99 98 "MetricThreshold": "tma_bad_speculation > 0.15", 99 + "MetricgroupNoGroup": "TopdownL1", 100 100 "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", 101 101 "ScaleUnit": "100%" 102 102 }, ··· 107 105 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 108 106 "MetricName": "tma_branch_mispredicts", 109 107 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 108 + "MetricgroupNoGroup": "TopdownL2", 110 109 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 111 110 "ScaleUnit": "100%" 112 111 }, ··· 154 151 "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 155 152 "MetricName": "tma_core_bound", 156 153 "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", 154 + "MetricgroupNoGroup": "TopdownL2", 157 155 "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", 158 156 "ScaleUnit": "100%" 159 157 }, ··· 256 252 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 257 253 "MetricName": "tma_fetch_bandwidth", 258 254 "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 6 > 0.35", 255 + "MetricgroupNoGroup": "TopdownL2", 259 256 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 260 257 "ScaleUnit": "100%" 261 258 }, ··· 266 261 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 267 262 "MetricName": "tma_fetch_latency", 268 263 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", 264 + "MetricgroupNoGroup": "TopdownL2", 269 265 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", 270 266 "ScaleUnit": "100%" 271 267 }, ··· 357 351 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 358 352 "MetricName": "tma_frontend_bound", 359 353 "MetricThreshold": "tma_frontend_bound > 0.15", 354 + "MetricgroupNoGroup": "TopdownL1", 360 355 "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", 361 356 "ScaleUnit": "100%" 362 357 }, ··· 376 369 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 377 370 "MetricName": "tma_heavy_operations", 378 371 "MetricThreshold": "tma_heavy_operations > 0.1", 372 + "MetricgroupNoGroup": "TopdownL2", 379 373 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences. 
Sample with: UOPS_RETIRED.HEAVY", 380 374 "ScaleUnit": "100%" 381 375 }, ··· 1224 1216 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 1225 1217 "MetricName": "tma_light_operations", 1226 1218 "MetricThreshold": "tma_light_operations > 0.6", 1219 + "MetricgroupNoGroup": "TopdownL2", 1227 1220 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", 1228 1221 "ScaleUnit": "100%" 1229 1222 }, ··· 1278 1269 "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", 1279 1270 "MetricName": "tma_machine_clears", 1280 1271 "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", 1272 + "MetricgroupNoGroup": "TopdownL2", 1281 1273 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", 1282 1274 "ScaleUnit": "100%" 1283 1275 }, ··· 1314 1304 "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 1315 1305 "MetricName": "tma_memory_bound", 1316 1306 "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", 1307 + "MetricgroupNoGroup": "TopdownL2", 1317 1308 "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", 1318 1309 "ScaleUnit": "100%" 1319 1310 }, ··· 1520 1509 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1521 1510 "MetricName": "tma_retiring", 1522 1511 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", 1512 + "MetricgroupNoGroup": "TopdownL1", 1523 1513 "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. 
They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", 1524 1514 "ScaleUnit": "100%" 1525 1515 },
+12
tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
··· 101 101 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 102 102 "MetricName": "tma_backend_bound", 103 103 "MetricThreshold": "tma_backend_bound > 0.2", 104 + "MetricgroupNoGroup": "TopdownL1", 104 105 "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", 105 106 "ScaleUnit": "100%" 106 107 }, ··· 111 110 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 112 111 "MetricName": "tma_bad_speculation", 113 112 "MetricThreshold": "tma_bad_speculation > 0.15", 113 + "MetricgroupNoGroup": "TopdownL1", 114 114 "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", 115 115 "ScaleUnit": "100%" 116 116 }, ··· 122 120 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 123 121 "MetricName": "tma_branch_mispredicts", 124 122 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 123 + "MetricgroupNoGroup": "TopdownL2", 125 124 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 126 125 "ScaleUnit": "100%" 127 126 }, ··· 170 167 "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 171 168 "MetricName": "tma_core_bound", 172 169 "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", 170 + "MetricgroupNoGroup": "TopdownL2", 173 171 "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", 174 172 "ScaleUnit": "100%" 175 173 }, ··· 275 271 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 276 272 "MetricName": "tma_fetch_bandwidth", 277 273 "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", 274 + "MetricgroupNoGroup": "TopdownL2", 278 275 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 279 276 "ScaleUnit": "100%" 280 277 }, ··· 285 280 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 286 281 "MetricName": "tma_fetch_latency", 287 282 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", 283 + "MetricgroupNoGroup": "TopdownL2", 288 284 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", 289 285 "ScaleUnit": "100%" 290 286 }, ··· 351 345 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 352 346 "MetricName": "tma_frontend_bound", 353 347 "MetricThreshold": "tma_frontend_bound > 0.15", 348 + "MetricgroupNoGroup": "TopdownL1", 354 349 "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", 355 350 "ScaleUnit": "100%" 356 351 }, ··· 370 363 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 371 364 "MetricName": "tma_heavy_operations", 372 365 "MetricThreshold": "tma_heavy_operations > 0.1", 366 + "MetricgroupNoGroup": "TopdownL2", 373 367 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", 374 368 "ScaleUnit": "100%" 375 369 }, ··· 1073 1065 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 1074 1066 "MetricName": "tma_light_operations", 1075 1067 "MetricThreshold": "tma_light_operations > 0.6", 1068 + "MetricgroupNoGroup": "TopdownL2", 1076 1069 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", 1077 1070 "ScaleUnit": "100%" 1078 1071 }, ··· 1119 1110 "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", 1120 1111 "MetricName": "tma_machine_clears", 1121 1112 "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", 1113 + "MetricgroupNoGroup": "TopdownL2", 1122 1114 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", 1123 1115 "ScaleUnit": "100%" 1124 1116 }, ··· 1148 1138 "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 1149 1139 "MetricName": "tma_memory_bound", 1150 1140 "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", 1141 + "MetricgroupNoGroup": "TopdownL2", 1151 1142 "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", 1152 1143 "ScaleUnit": "100%" 1153 1144 }, ··· 1354 1343 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1355 1344 "MetricName": "tma_retiring", 1356 1345 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", 1346 + "MetricgroupNoGroup": "TopdownL1", 1357 1347 "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. 
For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", 1358 1348 "ScaleUnit": "100%" 1359 1349 },
+12
tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
··· 101 101 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 102 102 "MetricName": "tma_backend_bound", 103 103 "MetricThreshold": "tma_backend_bound > 0.2", 104 + "MetricgroupNoGroup": "TopdownL1", 104 105 "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", 105 106 "ScaleUnit": "100%" 106 107 }, ··· 111 110 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 112 111 "MetricName": "tma_bad_speculation", 113 112 "MetricThreshold": "tma_bad_speculation > 0.15", 113 + "MetricgroupNoGroup": "TopdownL1", 114 114 "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", 115 115 "ScaleUnit": "100%" 116 116 }, ··· 122 120 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 123 121 "MetricName": "tma_branch_mispredicts", 124 122 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 123 + "MetricgroupNoGroup": "TopdownL2", 125 124 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 126 125 "ScaleUnit": "100%" 127 126 }, ··· 170 167 "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 171 168 "MetricName": "tma_core_bound", 172 169 "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", 170 + "MetricgroupNoGroup": "TopdownL2", 173 171 "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", 174 172 "ScaleUnit": "100%" 175 173 }, ··· 275 271 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 276 272 "MetricName": "tma_fetch_bandwidth", 277 273 "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", 274 + "MetricgroupNoGroup": "TopdownL2", 278 275 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 279 276 "ScaleUnit": "100%" 280 277 }, ··· 285 280 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 286 281 "MetricName": "tma_fetch_latency", 287 282 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", 283 + "MetricgroupNoGroup": "TopdownL2", 288 284 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", 289 285 "ScaleUnit": "100%" 290 286 }, ··· 360 354 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 361 355 "MetricName": "tma_frontend_bound", 362 356 "MetricThreshold": "tma_frontend_bound > 0.15", 357 + "MetricgroupNoGroup": "TopdownL1", 363 358 "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", 364 359 "ScaleUnit": "100%" 365 360 }, ··· 379 372 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 380 373 "MetricName": "tma_heavy_operations", 381 374 "MetricThreshold": "tma_heavy_operations > 0.1", 375 + "MetricgroupNoGroup": "TopdownL2", 382 376 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", 383 377 "ScaleUnit": "100%" 384 378 }, ··· 1131 1123 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 1132 1124 "MetricName": "tma_light_operations", 1133 1125 "MetricThreshold": "tma_light_operations > 0.6", 1126 + "MetricgroupNoGroup": "TopdownL2", 1134 1127 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", 1135 1128 "ScaleUnit": "100%" 1136 1129 }, ··· 1186 1177 "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", 1187 1178 "MetricName": "tma_machine_clears", 1188 1179 "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", 1180 + "MetricgroupNoGroup": "TopdownL2", 1189 1181 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", 1190 1182 "ScaleUnit": "100%" 1191 1183 }, ··· 1215 1205 "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 1216 1206 "MetricName": "tma_memory_bound", 1217 1207 "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", 1208 + "MetricgroupNoGroup": "TopdownL2", 1218 1209 "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", 1219 1210 "ScaleUnit": "100%" 1220 1211 }, ··· 1440 1429 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1441 1430 "MetricName": "tma_retiring", 1442 1431 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", 1432 + "MetricgroupNoGroup": "TopdownL1", 1443 1433 "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. 
For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", 1444 1434 "ScaleUnit": "100%" 1445 1435 },
+12
tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json
··· 109 109 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 110 110 "MetricName": "tma_backend_bound", 111 111 "MetricThreshold": "tma_backend_bound > 0.2", 112 + "MetricgroupNoGroup": "TopdownL1", 112 113 "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS", 113 114 "ScaleUnit": "100%" 114 115 }, ··· 119 118 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 120 119 "MetricName": "tma_bad_speculation", 121 120 "MetricThreshold": "tma_bad_speculation > 0.15", 121 + "MetricgroupNoGroup": "TopdownL1", 122 122 "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", 123 123 "ScaleUnit": "100%" 124 124 }, ··· 137 135 "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", 138 136 "MetricName": "tma_branch_mispredicts", 139 137 "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", 138 + "MetricgroupNoGroup": "TopdownL2", 140 139 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_info_mispredictions, tma_mispredicts_resteers", 141 140 "ScaleUnit": "100%" 142 141 }, ··· 184 181 "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 185 182 "MetricName": "tma_core_bound", 186 183 "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", 184 + "MetricgroupNoGroup": "TopdownL2", 187 185 "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. 
FP-chained long-latency arithmetic operations).", 188 186 "ScaleUnit": "100%" 189 187 }, ··· 286 282 "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", 287 283 "MetricName": "tma_fetch_bandwidth", 288 284 "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 5 > 0.35", 285 + "MetricgroupNoGroup": "TopdownL2", 289 286 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_dsb_misses, tma_info_iptb, tma_lcp", 290 287 "ScaleUnit": "100%" 291 288 }, ··· 296 291 "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", 297 292 "MetricName": "tma_fetch_latency", 298 293 "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", 294 + "MetricgroupNoGroup": "TopdownL2", 299 295 "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS", 300 296 "ScaleUnit": "100%" 301 297 }, ··· 369 363 "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", 370 364 "MetricName": "tma_frontend_bound", 371 365 "MetricThreshold": "tma_frontend_bound > 0.15", 366 + "MetricgroupNoGroup": "TopdownL1", 372 367 "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS", 373 368 "ScaleUnit": "100%" 374 369 }, ··· 379 372 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 380 373 "MetricName": "tma_heavy_operations", 381 374 "MetricThreshold": "tma_heavy_operations > 0.1", 375 + "MetricgroupNoGroup": "TopdownL2", 382 376 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. 
This highly-correlates with the uop length of these instructions/sequences.", 383 377 "ScaleUnit": "100%" 384 378 }, ··· 1133 1125 "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", 1134 1126 "MetricName": "tma_light_operations", 1135 1127 "MetricThreshold": "tma_light_operations > 0.6", 1128 + "MetricgroupNoGroup": "TopdownL2", 1136 1129 "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", 1137 1130 "ScaleUnit": "100%" 1138 1131 }, ··· 1187 1178 "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", 1188 1179 "MetricName": "tma_machine_clears", 1189 1180 "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", 1181 + "MetricgroupNoGroup": "TopdownL2", 1190 1182 "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", 1191 1183 "ScaleUnit": "100%" 1192 1184 }, ··· 1215 1205 "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", 1216 1206 "MetricName": "tma_memory_bound", 1217 1207 "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", 1208 + "MetricgroupNoGroup": "TopdownL2", 1218 1209 "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", 1219 1210 "ScaleUnit": "100%" 1220 1211 }, ··· 1385 1374 "MetricGroup": "TmaL1;TopdownL1;tma_L1_group", 1386 1375 "MetricName": "tma_retiring", 1387 1376 "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", 1377 + "MetricgroupNoGroup": "TopdownL1", 1388 1378 "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. 
For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS", 1389 1379 "ScaleUnit": "100%" 1390 1380 },
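Each of the JSON hunks above makes the same mechanical change: every TopdownL1/TopdownL2 metric in the per-architecture files gains a "MetricgroupNoGroup" key naming its topdown level. A minimal sketch of the shape of one affected entry, written as a Python dict purely for illustration (values shortened; the real files carry the full PublicDescription text):

# Illustrative only: not a verbatim copy of any one architecture's JSON.
tma_frontend_bound = {
    "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
    "MetricName": "tma_frontend_bound",
    "MetricThreshold": "tma_frontend_bound > 0.15",
    "MetricgroupNoGroup": "TopdownL1",  # new key; plumbed through jevents.py and pmu-events.h below
    "ScaleUnit": "100%",
}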
+3 -1
tools/perf/pmu-events/jevents.py
··· 52 52 # Attributes that are in pmu_metric rather than pmu_event. 53 53 _json_metric_attributes = [ 54 54 'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', 'desc', 55 - 'long_desc', 'unit', 'compat', 'aggr_mode', 'event_grouping' 55 + 'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode', 56 + 'event_grouping' 56 57 ] 57 58 # Attributes that are bools or enum int values, encoded as '0', '1',... 58 59 _json_enum_attributes = ['aggr_mode', 'deprecated', 'event_grouping', 'perpkg'] ··· 304 303 self.deprecated = jd.get('Deprecated') 305 304 self.metric_name = jd.get('MetricName') 306 305 self.metric_group = jd.get('MetricGroup') 306 + self.metricgroup_no_group = jd.get('MetricgroupNoGroup') 307 307 self.event_grouping = convert_metric_constraint(jd.get('MetricConstraint')) 308 308 self.metric_expr = None 309 309 if 'MetricExpr' in jd:
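The jevents.py hunk is the middle of that plumbing: the new key is read with jd.get('MetricgroupNoGroup') and listed among the pmu_metric attributes so it reaches the generated tables. A stripped-down, hypothetical analogue of the parsing step (the JSONMetric name here is illustrative, not the generator's actual class):

import json

class JSONMetric:
    def __init__(self, jd: dict):
        self.metric_name = jd.get('MetricName')
        self.metric_group = jd.get('MetricGroup')
        # The new attribute: absent keys come back as None,
        # so older JSON files without the field keep working.
        self.metricgroup_no_group = jd.get('MetricgroupNoGroup')

entry = json.loads('{"MetricName": "tma_retiring", "MetricgroupNoGroup": "TopdownL1"}')
m = JSONMetric(entry)
assert m.metricgroup_no_group == "TopdownL1"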
+1
tools/perf/pmu-events/pmu-events.h
··· 59 59 const char *compat; 60 60 const char *desc; 61 61 const char *long_desc; 62 + const char *metricgroup_no_group; 62 63 enum aggr_mode_class aggr_mode; 63 64 enum metric_event_groups event_grouping; 64 65 };
+3 -3
tools/perf/tests/attr.py
··· 152 152 # - expected values assignments 153 153 class Test(object): 154 154 def __init__(self, path, options): 155 - parser = configparser.SafeConfigParser() 155 + parser = configparser.ConfigParser() 156 156 parser.read(path) 157 157 158 158 log.warning("running '%s'" % path) ··· 247 247 return True 248 248 249 249 def load_events(self, path, events): 250 - parser_event = configparser.SafeConfigParser() 250 + parser_event = configparser.ConfigParser() 251 251 parser_event.read(path) 252 252 253 253 # The event record section header contains 'event' word, ··· 261 261 # Read parent event if there's any 262 262 if (':' in section): 263 263 base = section[section.index(':') + 1:] 264 - parser_base = configparser.SafeConfigParser() 264 + parser_base = configparser.ConfigParser() 265 265 parser_base.read(self.test_dir + '/' + base) 266 266 base_items = parser_base.items('event') 267 267
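SafeConfigParser has been a deprecated alias of ConfigParser since Python 3.2 and was removed outright in Python 3.12, so the rename here is a drop-in substitution rather than a behavior change. For instance:

import configparser

# Old spelling: DeprecationWarning on 3.2+, AttributeError on 3.12+.
# parser = configparser.SafeConfigParser()

# New spelling: same read() and interpolation behavior.
parser = configparser.ConfigParser()
parser.read('base-stat')   # .ini-style files like the attr test configs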
+1 -1
tools/perf/tests/attr/base-stat
··· 16 16 exclusive=0 17 17 exclude_user=0 18 18 exclude_kernel=0|1 19 - exclude_hv=0 19 + exclude_hv=0|1 20 20 exclude_idle=0 21 21 mmap=0 22 22 comm=0
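Expected values in these .ini files may list alternatives separated by '|' (the neighbouring exclude_kernel=0|1 line already does), so exclude_hv=0|1 simply widens the accepted set; hypervisor exclusion is not available on every setup. A toy matcher illustrating that convention (a sketch of the idea, not the harness's actual comparison code):

def value_matches(expected: str, actual: str) -> bool:
    """Accept any one of the '|'-separated alternatives."""
    return actual in expected.split('|')

assert value_matches('0|1', '1')
assert value_matches('0|1', '0')
assert not value_matches('0', '1')   # the old exclude_hv=0 rejected attrs with the bit set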
+57 -39
tools/perf/tests/attr/test-stat-default
··· 40 40 type=0 41 41 config=7 42 42 optional=1 43 - 44 43 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND 45 44 [event7:base-stat] 46 45 fd=7 ··· 88 89 read_format=15 89 90 optional=1 90 91 91 - # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 92 + # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 92 93 [event13:base-stat] 93 94 fd=13 94 - group_fd=11 95 - type=4 96 - config=33024 97 - disabled=0 98 - enable_on_exec=0 99 - read_format=15 100 - optional=1 101 - 102 - # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 103 - [event14:base-stat] 104 - fd=14 105 95 group_fd=11 106 96 type=4 107 97 config=33280 ··· 100 112 optional=1 101 113 102 114 # PERF_TYPE_RAW / topdown-be-bound (0x8300) 103 - [event15:base-stat] 104 - fd=15 115 + [event14:base-stat] 116 + fd=14 105 117 group_fd=11 106 118 type=4 107 119 config=33536 ··· 110 122 read_format=15 111 123 optional=1 112 124 113 - # PERF_TYPE_RAW / topdown-heavy-ops (0x8400) 125 + # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 126 + [event15:base-stat] 127 + fd=15 128 + group_fd=11 129 + type=4 130 + config=33024 131 + disabled=0 132 + enable_on_exec=0 133 + read_format=15 134 + optional=1 135 + 136 + # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING 114 137 [event16:base-stat] 115 138 fd=16 116 - group_fd=11 117 139 type=4 118 - config=33792 119 - disabled=0 120 - enable_on_exec=0 121 - read_format=15 140 + config=4109 122 141 optional=1 123 142 124 - # PERF_TYPE_RAW / topdown-br-mispredict (0x8500) 143 + # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/ 125 144 [event17:base-stat] 126 145 fd=17 127 - group_fd=11 128 146 type=4 129 - config=34048 130 - disabled=0 131 - enable_on_exec=0 132 - read_format=15 147 + config=17039629 133 148 optional=1 134 149 135 - # PERF_TYPE_RAW / topdown-fetch-lat (0x8600) 150 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD 136 151 [event18:base-stat] 137 152 fd=18 138 - group_fd=11 139 153 type=4 140 - config=34304 141 - disabled=0 142 - enable_on_exec=0 143 - read_format=15 154 + config=60 144 155 optional=1 145 156 146 - # PERF_TYPE_RAW / topdown-mem-bound (0x8700) 157 + # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY 147 158 [event19:base-stat] 148 159 fd=19 149 - group_fd=11 150 160 type=4 151 - config=34560 152 - disabled=0 153 - enable_on_exec=0 154 - read_format=15 161 + config=2097421 162 + optional=1 163 + 164 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK 165 + [event20:base-stat] 166 + fd=20 167 + type=4 168 + config=316 169 + optional=1 170 + 171 + # PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE 172 + [event21:base-stat] 173 + fd=21 174 + type=4 175 + config=412 176 + optional=1 177 + 178 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE 179 + [event22:base-stat] 180 + fd=22 181 + type=4 182 + config=572 183 + optional=1 184 + 185 + # PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS 186 + [event23:base-stat] 187 + fd=23 188 + type=4 189 + config=706 190 + optional=1 191 + 192 + # PERF_TYPE_RAW / UOPS_ISSUED.ANY 193 + [event24:base-stat] 194 + fd=24 195 + type=4 196 + config=270 155 197 optional=1
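The renumbered events above carry PERF_TYPE_RAW config values that, assuming the standard x86 PERF_EVENT_SEL bit layout (event select in bits 0-7, umask in 8-15, edge detect at bit 18, cmask in 24-31), decode to exactly the event names in the comments. A small checker under that assumption:

def decode_raw_config(config: int) -> dict:
    """Split an x86 raw perf config into its PERF_EVENT_SEL fields."""
    return {
        'event': config & 0xff,
        'umask': (config >> 8) & 0xff,
        'edge':  bool(config & (1 << 18)),
        'cmask': (config >> 24) & 0xff,
    }

# INT_MISC.UOP_DROPPING: event 0x0d, umask 0x10
assert decode_raw_config(4109) == {'event': 0x0d, 'umask': 0x10, 'edge': False, 'cmask': 0}
# cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/: event 0x0d, umask 0x01, plus modifiers
assert decode_raw_config(17039629) == {'event': 0x0d, 'umask': 0x01, 'edge': True, 'cmask': 1}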
+65 -46
tools/perf/tests/attr/test-stat-detailed-1
··· 90 90 read_format=15 91 91 optional=1 92 92 93 - # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 93 + # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 94 94 [event13:base-stat] 95 95 fd=13 96 - group_fd=11 97 - type=4 98 - config=33024 99 - disabled=0 100 - enable_on_exec=0 101 - read_format=15 102 - optional=1 103 - 104 - # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 105 - [event14:base-stat] 106 - fd=14 107 96 group_fd=11 108 97 type=4 109 98 config=33280 ··· 102 113 optional=1 103 114 104 115 # PERF_TYPE_RAW / topdown-be-bound (0x8300) 105 - [event15:base-stat] 106 - fd=15 116 + [event14:base-stat] 117 + fd=14 107 118 group_fd=11 108 119 type=4 109 120 config=33536 ··· 112 123 read_format=15 113 124 optional=1 114 125 115 - # PERF_TYPE_RAW / topdown-heavy-ops (0x8400) 126 + # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 127 + [event15:base-stat] 128 + fd=15 129 + group_fd=11 130 + type=4 131 + config=33024 132 + disabled=0 133 + enable_on_exec=0 134 + read_format=15 135 + optional=1 136 + 137 + # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING 116 138 [event16:base-stat] 117 139 fd=16 118 - group_fd=11 119 140 type=4 120 - config=33792 121 - disabled=0 122 - enable_on_exec=0 123 - read_format=15 141 + config=4109 124 142 optional=1 125 143 126 - # PERF_TYPE_RAW / topdown-br-mispredict (0x8500) 144 + # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/ 127 145 [event17:base-stat] 128 146 fd=17 129 - group_fd=11 130 147 type=4 131 - config=34048 132 - disabled=0 133 - enable_on_exec=0 134 - read_format=15 148 + config=17039629 135 149 optional=1 136 150 137 - # PERF_TYPE_RAW / topdown-fetch-lat (0x8600) 151 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD 138 152 [event18:base-stat] 139 153 fd=18 140 - group_fd=11 141 154 type=4 142 - config=34304 143 - disabled=0 144 - enable_on_exec=0 145 - read_format=15 155 + config=60 146 156 optional=1 147 157 148 - # PERF_TYPE_RAW / topdown-mem-bound (0x8700) 158 + # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY 149 159 [event19:base-stat] 150 160 fd=19 151 - group_fd=11 152 161 type=4 153 - config=34560 154 - disabled=0 155 - enable_on_exec=0 156 - read_format=15 162 + config=2097421 163 + optional=1 164 + 165 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK 166 + [event20:base-stat] 167 + fd=20 168 + type=4 169 + config=316 170 + optional=1 171 + 172 + # PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE 173 + [event21:base-stat] 174 + fd=21 175 + type=4 176 + config=412 177 + optional=1 178 + 179 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE 180 + [event22:base-stat] 181 + fd=22 182 + type=4 183 + config=572 184 + optional=1 185 + 186 + # PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS 187 + [event23:base-stat] 188 + fd=23 189 + type=4 190 + config=706 191 + optional=1 192 + 193 + # PERF_TYPE_RAW / UOPS_ISSUED.ANY 194 + [event24:base-stat] 195 + fd=24 196 + type=4 197 + config=270 157 198 optional=1 158 199 159 200 # PERF_TYPE_HW_CACHE / 160 201 # PERF_COUNT_HW_CACHE_L1D << 0 | 161 202 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 162 203 # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 163 - [event20:base-stat] 164 - fd=20 204 + [event25:base-stat] 205 + fd=25 165 206 type=3 166 207 config=0 167 208 optional=1 ··· 200 181 # PERF_COUNT_HW_CACHE_L1D << 0 | 201 182 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 202 183 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 203 - [event21:base-stat] 204 - fd=21 184 + [event26:base-stat] 185 + fd=26 205 186 type=3 206 187 config=65536 207 188 optional=1 ··· 210 191 # PERF_COUNT_HW_CACHE_LL << 0 | 211 192 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 212 193 # 
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 213 - [event22:base-stat] 214 - fd=22 194 + [event27:base-stat] 195 + fd=27 215 196 type=3 216 197 config=2 217 198 optional=1 ··· 220 201 # PERF_COUNT_HW_CACHE_LL << 0 | 221 202 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 222 203 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 223 - [event23:base-stat] 224 - fd=23 204 + [event28:base-stat] 205 + fd=28 225 206 type=3 226 207 config=65538 227 208 optional=1
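The PERF_TYPE_HW_CACHE config values follow the generic encoding spelled out in the comments: cache id in the low byte, operation in the next byte, result in the third. That reproduces every constant in these renumbered hunks; a short sketch using the generic IDs from the perf_event ABI:

# Generic cache IDs from include/uapi/linux/perf_event.h.
L1D, L1I, LL, DTLB, ITLB = 0, 1, 2, 3, 4
OP_READ, OP_WRITE, OP_PREFETCH = 0, 1, 2
RESULT_ACCESS, RESULT_MISS = 0, 1

def hw_cache_config(cache: int, op: int, result: int) -> int:
    return cache | (op << 8) | (result << 16)

assert hw_cache_config(L1D, OP_READ, RESULT_MISS) == 65536
assert hw_cache_config(LL, OP_READ, RESULT_MISS) == 65538
assert hw_cache_config(ITLB, OP_READ, RESULT_MISS) == 65540
assert hw_cache_config(L1D, OP_PREFETCH, RESULT_ACCESS) == 512   # in the detailed-3 file below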
+77 -58
tools/perf/tests/attr/test-stat-detailed-2
··· 90 90 read_format=15 91 91 optional=1 92 92 93 - # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 93 + # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 94 94 [event13:base-stat] 95 95 fd=13 96 - group_fd=11 97 - type=4 98 - config=33024 99 - disabled=0 100 - enable_on_exec=0 101 - read_format=15 102 - optional=1 103 - 104 - # PERF_TYPE_RAW / topdown-fe-bound (0x8200) 105 - [event14:base-stat] 106 - fd=14 107 96 group_fd=11 108 97 type=4 109 98 config=33280 ··· 102 113 optional=1 103 114 104 115 # PERF_TYPE_RAW / topdown-be-bound (0x8300) 105 - [event15:base-stat] 106 - fd=15 116 + [event14:base-stat] 117 + fd=14 107 118 group_fd=11 108 119 type=4 109 120 config=33536 ··· 112 123 read_format=15 113 124 optional=1 114 125 115 - # PERF_TYPE_RAW / topdown-heavy-ops (0x8400) 126 + # PERF_TYPE_RAW / topdown-bad-spec (0x8100) 127 + [event15:base-stat] 128 + fd=15 129 + group_fd=11 130 + type=4 131 + config=33024 132 + disabled=0 133 + enable_on_exec=0 134 + read_format=15 135 + optional=1 136 + 137 + # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING 116 138 [event16:base-stat] 117 139 fd=16 118 - group_fd=11 119 140 type=4 120 - config=33792 121 - disabled=0 122 - enable_on_exec=0 123 - read_format=15 141 + config=4109 124 142 optional=1 125 143 126 - # PERF_TYPE_RAW / topdown-br-mispredict (0x8500) 144 + # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/ 127 145 [event17:base-stat] 128 146 fd=17 129 - group_fd=11 130 147 type=4 131 - config=34048 132 - disabled=0 133 - enable_on_exec=0 134 - read_format=15 148 + config=17039629 135 149 optional=1 136 150 137 - # PERF_TYPE_RAW / topdown-fetch-lat (0x8600) 151 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD 138 152 [event18:base-stat] 139 153 fd=18 140 - group_fd=11 141 154 type=4 142 - config=34304 143 - disabled=0 144 - enable_on_exec=0 145 - read_format=15 155 + config=60 146 156 optional=1 147 157 148 - # PERF_TYPE_RAW / topdown-mem-bound (0x8700) 158 + # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY 149 159 [event19:base-stat] 150 160 fd=19 151 - group_fd=11 152 161 type=4 153 - config=34560 154 - disabled=0 155 - enable_on_exec=0 156 - read_format=15 162 + config=2097421 163 + optional=1 164 + 165 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK 166 + [event20:base-stat] 167 + fd=20 168 + type=4 169 + config=316 170 + optional=1 171 + 172 + # PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE 173 + [event21:base-stat] 174 + fd=21 175 + type=4 176 + config=412 177 + optional=1 178 + 179 + # PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE 180 + [event22:base-stat] 181 + fd=22 182 + type=4 183 + config=572 184 + optional=1 185 + 186 + # PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS 187 + [event23:base-stat] 188 + fd=23 189 + type=4 190 + config=706 191 + optional=1 192 + 193 + # PERF_TYPE_RAW / UOPS_ISSUED.ANY 194 + [event24:base-stat] 195 + fd=24 196 + type=4 197 + config=270 157 198 optional=1 158 199 159 200 # PERF_TYPE_HW_CACHE / 160 201 # PERF_COUNT_HW_CACHE_L1D << 0 | 161 202 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 162 203 # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 163 - [event20:base-stat] 164 - fd=20 204 + [event25:base-stat] 205 + fd=25 165 206 type=3 166 207 config=0 167 208 optional=1 ··· 200 181 # PERF_COUNT_HW_CACHE_L1D << 0 | 201 182 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 202 183 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 203 - [event21:base-stat] 204 - fd=21 184 + [event26:base-stat] 185 + fd=26 205 186 type=3 206 187 config=65536 207 188 optional=1 ··· 210 191 # PERF_COUNT_HW_CACHE_LL << 0 | 211 192 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 212 193 # 
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 213 - [event22:base-stat] 214 - fd=22 194 + [event27:base-stat] 195 + fd=27 215 196 type=3 216 197 config=2 217 198 optional=1 ··· 220 201 # PERF_COUNT_HW_CACHE_LL << 0 | 221 202 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 222 203 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 223 - [event23:base-stat] 224 - fd=23 204 + [event28:base-stat] 205 + fd=28 225 206 type=3 226 207 config=65538 227 208 optional=1 ··· 230 211 # PERF_COUNT_HW_CACHE_L1I << 0 | 231 212 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 232 213 # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 233 - [event24:base-stat] 234 - fd=24 214 + [event29:base-stat] 215 + fd=29 235 216 type=3 236 217 config=1 237 218 optional=1 ··· 240 221 # PERF_COUNT_HW_CACHE_L1I << 0 | 241 222 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 242 223 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 243 - [event25:base-stat] 244 - fd=25 224 + [event30:base-stat] 225 + fd=30 245 226 type=3 246 227 config=65537 247 228 optional=1 ··· 250 231 # PERF_COUNT_HW_CACHE_DTLB << 0 | 251 232 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 252 233 # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 253 - [event26:base-stat] 254 - fd=26 234 + [event31:base-stat] 235 + fd=31 255 236 type=3 256 237 config=3 257 238 optional=1 ··· 260 241 # PERF_COUNT_HW_CACHE_DTLB << 0 | 261 242 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 262 243 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 263 - [event27:base-stat] 264 - fd=27 244 + [event32:base-stat] 245 + fd=32 265 246 type=3 266 247 config=65539 267 248 optional=1 ··· 270 251 # PERF_COUNT_HW_CACHE_ITLB << 0 | 271 252 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 272 253 # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) 273 - [event28:base-stat] 274 - fd=28 254 + [event33:base-stat] 255 + fd=33 275 256 type=3 276 257 config=4 277 258 optional=1 ··· 280 261 # PERF_COUNT_HW_CACHE_ITLB << 0 | 281 262 # (PERF_COUNT_HW_CACHE_OP_READ << 8) | 282 263 # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) 283 - [event29:base-stat] 284 - fd=29 264 + [event34:base-stat] 265 + fd=34 285 266 type=3 286 267 config=65540 287 268 optional=1
+81 -62
tools/perf/tests/attr/test-stat-detailed-3
···
  read_format=15
  optional=1

- # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+ # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
  [event13:base-stat]
  fd=13
- group_fd=11
- type=4
- config=33024
- disabled=0
- enable_on_exec=0
- read_format=15
- optional=1
-
- # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
- [event14:base-stat]
- fd=14
  group_fd=11
  type=4
  config=33280
···
  optional=1

  # PERF_TYPE_RAW / topdown-be-bound (0x8300)
- [event15:base-stat]
- fd=15
+ [event14:base-stat]
+ fd=14
  group_fd=11
  type=4
  config=33536
···
  read_format=15
  optional=1

- # PERF_TYPE_RAW / topdown-heavy-ops (0x8400)
+ # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
+ [event15:base-stat]
+ fd=15
+ group_fd=11
+ type=4
+ config=33024
+ disabled=0
+ enable_on_exec=0
+ read_format=15
+ optional=1
+
+ # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
  [event16:base-stat]
  fd=16
- group_fd=11
  type=4
- config=33792
- disabled=0
- enable_on_exec=0
- read_format=15
+ config=4109
  optional=1

- # PERF_TYPE_RAW / topdown-br-mispredict (0x8500)
+ # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
  [event17:base-stat]
  fd=17
- group_fd=11
  type=4
- config=34048
- disabled=0
- enable_on_exec=0
- read_format=15
+ config=17039629
  optional=1

- # PERF_TYPE_RAW / topdown-fetch-lat (0x8600)
+ # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
  [event18:base-stat]
  fd=18
- group_fd=11
  type=4
- config=34304
- disabled=0
- enable_on_exec=0
- read_format=15
+ config=60
  optional=1

- # PERF_TYPE_RAW / topdown-mem-bound (0x8700)
+ # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
  [event19:base-stat]
  fd=19
- group_fd=11
  type=4
- config=34560
- disabled=0
- enable_on_exec=0
- read_format=15
+ config=2097421
+ optional=1
+
+ # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
+ [event20:base-stat]
+ fd=20
+ type=4
+ config=316
+ optional=1
+
+ # PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
+ [event21:base-stat]
+ fd=21
+ type=4
+ config=412
+ optional=1
+
+ # PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
+ [event22:base-stat]
+ fd=22
+ type=4
+ config=572
+ optional=1
+
+ # PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
+ [event23:base-stat]
+ fd=23
+ type=4
+ config=706
+ optional=1
+
+ # PERF_TYPE_RAW / UOPS_ISSUED.ANY
+ [event24:base-stat]
+ fd=24
+ type=4
+ config=270
  optional=1

  # PERF_TYPE_HW_CACHE /
  # PERF_COUNT_HW_CACHE_L1D << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event20:base-stat]
- fd=20
+ [event25:base-stat]
+ fd=25
  type=3
  config=0
  optional=1
···
  # PERF_COUNT_HW_CACHE_L1D << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event21:base-stat]
- fd=21
+ [event26:base-stat]
+ fd=26
  type=3
  config=65536
  optional=1
···
  # PERF_COUNT_HW_CACHE_LL << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event22:base-stat]
- fd=22
+ [event27:base-stat]
+ fd=27
  type=3
  config=2
  optional=1
···
  # PERF_COUNT_HW_CACHE_LL << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event23:base-stat]
- fd=23
+ [event28:base-stat]
+ fd=28
  type=3
  config=65538
  optional=1
···
  # PERF_COUNT_HW_CACHE_L1I << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event24:base-stat]
- fd=24
+ [event29:base-stat]
+ fd=29
  type=3
  config=1
  optional=1
···
  # PERF_COUNT_HW_CACHE_L1I << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event25:base-stat]
- fd=25
+ [event30:base-stat]
+ fd=30
  type=3
  config=65537
  optional=1
···
  # PERF_COUNT_HW_CACHE_DTLB << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event26:base-stat]
- fd=26
+ [event31:base-stat]
+ fd=31
  type=3
  config=3
  optional=1
···
  # PERF_COUNT_HW_CACHE_DTLB << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event27:base-stat]
- fd=27
+ [event32:base-stat]
+ fd=32
  type=3
  config=65539
  optional=1
···
  # PERF_COUNT_HW_CACHE_ITLB << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event28:base-stat]
- fd=28
+ [event33:base-stat]
+ fd=33
  type=3
  config=4
  optional=1
···
  # PERF_COUNT_HW_CACHE_ITLB << 0 |
  # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event29:base-stat]
- fd=29
+ [event34:base-stat]
+ fd=34
  type=3
  config=65540
  optional=1
···
  # PERF_COUNT_HW_CACHE_L1D << 0 |
  # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
- [event30:base-stat]
- fd=30
+ [event35:base-stat]
+ fd=35
  type=3
  config=512
  optional=1
···
  # PERF_COUNT_HW_CACHE_L1D << 0 |
  # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
  # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
- [event31:base-stat]
- fd=31
+ [event36:base-stat]
+ fd=36
  type=3
  config=66048
  optional=1
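The expected config values above follow directly from the bit layout spelled out in the file's own comments: for PERF_TYPE_HW_CACHE (type=3) events, config is the cache ID OR'd with the operation shifted left by 8 and the result shifted left by 16. A minimal C sketch that reproduces two of the constants, with the PERF_COUNT_HW_CACHE_* names abbreviated for brevity:

    #include <stdio.h>

    /* Abbreviated PERF_COUNT_HW_CACHE_* values from the perf_event ABI */
    enum cache  { L1D = 0, L1I = 1, LL = 2, DTLB = 3, ITLB = 4 };
    enum op     { OP_READ = 0, OP_WRITE = 1, OP_PREFETCH = 2 };
    enum result { RESULT_ACCESS = 0, RESULT_MISS = 1 };

    int main(void)
    {
        /* LL read misses: event28 above expects config=65538 (0x10002) */
        printf("%d\n", LL | (OP_READ << 8) | (RESULT_MISS << 16));
        /* L1D prefetch misses: event36 expects config=66048 (0x10200) */
        printf("%d\n", L1D | (OP_PREFETCH << 8) | (RESULT_MISS << 16));
        return 0;
    }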
+2 -1
tools/perf/tests/expr.c
···

  p = "FOO/0";
  ret = expr__parse(&val, ctx, p);
- TEST_ASSERT_VAL("division by zero", ret == -1);
+ TEST_ASSERT_VAL("division by zero", ret == 0);
+ TEST_ASSERT_VAL("division by zero", isnan(val));

  p = "BAR/";
  ret = expr__parse(&val, ctx, p);
+1
tools/perf/tests/parse-metric.c
···
  evlist__alloc_aggr_stats(evlist, 1);
  evlist__for_each_entry(evlist, evsel) {
      count = find_value(evsel->name, vals);
+     evsel->supported = true;
      evsel->stats->aggr->counts.val = count;
      if (evsel__name_is(evsel, "duration_time"))
          update_stats(&walltime_nsecs_stats, count);
+13
tools/perf/tests/shell/stat.sh
···
      echo "stat record and report test [Success]"
  }

+ test_stat_record_script() {
+     echo "stat record and script test"
+     if ! perf stat record -o - true | perf script -i - 2>&1 | \
+         grep -E -q "CPU[[:space:]]+THREAD[[:space:]]+VAL[[:space:]]+ENA[[:space:]]+RUN[[:space:]]+TIME[[:space:]]+EVENT"
+     then
+         echo "stat record and script test [Failed]"
+         err=1
+         return
+     fi
+     echo "stat record and script test [Success]"
+ }
+
  test_stat_repeat_weak_groups() {
      echo "stat repeat weak groups test"
      if ! perf stat -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}' \
···
  test_default_stat
  test_stat_record_report
+ test_stat_record_script
  test_stat_repeat_weak_groups
  test_topdown_groups
  test_topdown_weak_groups
+7
tools/perf/tests/shell/test_intel_pt.sh
···
          echo "perf record failed with --aux-sample"
          return 1
      fi
+     # Check with event with PMU name
+     if perf_record_no_decode -o "${perfdatafile}" -e br_misp_retired.all_branches:u uname ; then
+         if ! perf_record_no_decode -o "${perfdatafile}" -e '{intel_pt//,br_misp_retired.all_branches/aux-sample-size=8192/}:u' uname ; then
+             echo "perf record failed with --aux-sample-size"
+             return 1
+         fi
+     fi
      echo OK
      return 0
  }
+1 -1
tools/perf/tests/shell/test_java_symbol.sh
···
      exit 1
  fi

- if ! perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then
+ if ! DEBUGINFOD_URLS='' perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then
      echo "Fail to inject samples"
      exit 1
  fi
+2
tools/perf/trace/beauty/arch_prctl.c
···

  static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_1, "ARCH_", x86_arch_prctl_codes_1_offset);
  static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_2, "ARCH_", x86_arch_prctl_codes_2_offset);
+ static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_3, "ARCH_", x86_arch_prctl_codes_3_offset);

  static struct strarray *x86_arch_prctl_codes[] = {
      &strarray__x86_arch_prctl_codes_1,
      &strarray__x86_arch_prctl_codes_2,
+     &strarray__x86_arch_prctl_codes_3,
  };

  static DEFINE_STRARRAYS(x86_arch_prctl_codes);
+1
tools/perf/trace/beauty/x86_arch_prctl.sh
···

  print_range 1 0x1 0x1001
  print_range 2 0x2 0x2001
+ print_range 3 0x4 0x4001
+2
tools/perf/util/bpf_skel/lock_contention.bpf.c
···
      return 0;
  }

+ struct rq {};
+
  extern struct rq runqueues __ksym;

  struct rq___old {
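The empty struct rq {} exists only so that libbpf can type-check the __ksym extern below it: a __ksym variable must have some BTF definition in the skeleton, while actual field accesses go through a CO-RE-relocated variant (such as the struct rq___old visible above). A minimal sketch of this general libbpf pattern, with a hypothetical rq___shadow type and curr field used purely for illustration:

    #include <bpf/bpf_core_read.h>

    struct task_struct;

    struct rq {};                      /* empty: satisfies libbpf type verification */
    extern struct rq runqueues __ksym; /* per-CPU kernel variable                   */

    /* Hypothetical shadow type with only the field of interest;
     * preserve_access_index makes accesses CO-RE-relocatable. */
    struct rq___shadow {
        struct task_struct *curr;
    } __attribute__((preserve_access_index));

    static inline struct task_struct *rq_curr(void *rq)
    {
        /* CO-RE relocates the offset of 'curr' at program load time */
        return BPF_CORE_READ((struct rq___shadow *)rq, curr);
    }

The per-CPU instance is typically obtained with bpf_per_cpu_ptr(&runqueues, cpu); the sketch assumes a clang BPF target build.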
+1
tools/perf/util/bpf_skel/vmlinux.h
···
  #ifndef __VMLINUX_H
  #define __VMLINUX_H

+ #include <linux/stddef.h> // for define __always_inline
  #include <linux/bpf.h>
  #include <linux/types.h>
  #include <linux/perf_event.h>
+24 -13
tools/perf/util/evsel.c
···
  evsel->per_pkg_mask = NULL;
  evsel->collect_stat = false;
  evsel->pmu_name = NULL;
+ evsel->skippable = false;
  }

  struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx)
···

  const char *evsel__group_pmu_name(const struct evsel *evsel)
  {
- const struct evsel *leader;
+ struct evsel *leader = evsel__leader(evsel);
+ struct evsel *pos;

- /* If the pmu_name is set use it. pmu_name isn't set for CPU and software events. */
- if (evsel->pmu_name)
-     return evsel->pmu_name;
  /*
   * Software events may be in a group with other uncore PMU events. Use
-  * the pmu_name of the group leader to avoid breaking the software event
-  * out of the group.
+  * the pmu_name of the first non-software event to avoid breaking the
+  * software event out of the group.
   *
   * Aux event leaders, like intel_pt, expect a group with events from
   * other PMUs, so substitute the AUX event's PMU in this case.
   */
- leader = evsel__leader(evsel);
- if ((evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) &&
-     leader->pmu_name) {
-     return leader->pmu_name;
+ if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) {
+     /* Starting with the leader, find the first event with a named PMU. */
+     for_each_group_evsel(pos, leader) {
+         if (pos->pmu_name)
+             return pos->pmu_name;
+     }
  }

- return "cpu";
+ return evsel->pmu_name ?: "cpu";
  }

  const char *evsel__metric_id(const struct evsel *evsel)
···
      return -1;

  fd = FD(leader, cpu_map_idx, thread);
- BUG_ON(fd == -1);
+ BUG_ON(fd == -1 && !leader->skippable);

- return fd;
+ /*
+  * When the leader has been skipped, return -2 to distinguish from no
+  * group leader case.
+  */
+ return fd == -1 ? -2 : fd;
  }

  static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx)
···
      pid = perf_thread_map__pid(threads, thread);

  group_fd = get_group_fd(evsel, idx, thread);
+
+ if (group_fd == -2) {
+     pr_debug("broken group leader for %s\n", evsel->name);
+     err = -EINVAL;
+     goto out_close;
+ }

  test_attr__ready();
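The fallback in the new return statement, evsel->pmu_name ?: "cpu", uses the GNU C conditional with an omitted middle operand (common throughout the kernel tree): a ?: b evaluates to a unless a is null/zero, in which case it yields b, without evaluating a twice. A trivial sketch, compiled as GNU C:

    #include <stdio.h>

    int main(void)
    {
        const char *pmu_name = NULL;        /* no PMU name set              */
        printf("%s\n", pmu_name ?: "cpu");  /* GNU ?: falls back to "cpu"   */
        return 0;
    }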
+1
tools/perf/util/evsel.h
···
      bool weak_group;
      bool bpf_counter;
      bool use_config_name;
+     bool skippable;
      int bpf_fd;
      struct bpf_object *bpf_obj;
      struct list_head config_terms;
+5 -1
tools/perf/util/expr.y
···
  {
      if (fpclassify($3.val) == FP_ZERO) {
          pr_debug("division by zero\n");
-         YYABORT;
+         assert($3.ids == NULL);
+         if (compute_ids)
+             ids__free($1.ids);
+         $$.val = NAN;
+         $$.ids = NULL;
      } else if (!compute_ids || (is_const($1.val) && is_const($3.val))) {
          assert($1.ids == NULL);
          assert($3.ids == NULL);
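Instead of aborting the parse, a divide-by-zero in a metric expression now yields IEEE-754 NaN. Because NaN propagates through any further arithmetic, the whole expression evaluates to nan rather than the parse failing, which is exactly what the updated expr.c test above asserts. A minimal C sketch of that behaviour:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double val = NAN;           /* what an expression like FOO/0 now yields */
        double metric = val * 100;  /* NaN propagates through the arithmetic    */

        printf("%d %f\n", isnan(metric), metric);  /* nonzero, "nan" */
        return 0;
    }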
+5 -5
tools/perf/util/metricgroup.c
···
      struct metricgroup__add_metric_data *data = vdata;
      int ret = 0;

-     if (pm->metric_expr &&
-         (match_metric(pm->metric_group, data->metric_name) ||
-          match_metric(pm->metric_name, data->metric_name))) {
+     if (pm->metric_expr && match_pm_metric(pm, data->metric_name)) {
+         bool metric_no_group = data->metric_no_group ||
+             match_metric(data->metric_name, pm->metricgroup_no_group);

          data->has_match = true;
-         ret = add_metric(data->list, pm, data->modifier, data->metric_no_group,
+         ret = add_metric(data->list, pm, data->modifier, metric_no_group,
                  data->metric_no_threshold, data->user_requested_cpu_list,
                  data->system_wide, /*root_metric=*/NULL,
                  /*visited_metrics=*/NULL, table);
···
  {
      unsigned int *max_level = data;
      unsigned int level;
-     const char *p = strstr(pm->metric_group, "TopdownL");
+     const char *p = strstr(pm->metric_group ?: "", "TopdownL");

      if (!p || p[8] == '\0')
          return 0;
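The segfault guarded against in the second hunk comes from strstr, whose behaviour is undefined when the haystack is NULL, and a metric without a group has a NULL metric_group; the ?: "" guard turns that case into a harmless non-match. A minimal sketch (GNU C, with a hypothetical metric_group value) of the guarded lookup and the level check implied by p[8], the character right after the 8-character "TopdownL" prefix:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *metric_group = NULL;   /* metric without a group        */
        /* strstr(NULL, ...) is undefined behaviour; guard with "" instead  */
        const char *p = strstr(metric_group ?: "", "TopdownL");

        if (!p || p[8] == '\0')            /* no "TopdownL<n>" present      */
            printf("no topdown level\n");
        else
            printf("level %c\n", p[8]);    /* e.g. "TopdownL2" gives '2'    */
        return 0;
    }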
+15 -8
tools/perf/util/parse-events.c
···
      int *leader_idx = state;
      int lhs_leader_idx = *leader_idx, rhs_leader_idx = *leader_idx, ret;
      const char *lhs_pmu_name, *rhs_pmu_name;
+     bool lhs_has_group = false, rhs_has_group = false;

      /*
       * First sort by grouping/leader. Read the leader idx only if the evsel
       * is part of a group, as -1 indicates no group.
       */
-     if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1)
+     if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) {
+         lhs_has_group = true;
          lhs_leader_idx = lhs_core->leader->idx;
-     if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1)
+     }
+     if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1) {
+         rhs_has_group = true;
          rhs_leader_idx = rhs_core->leader->idx;
+     }

      if (lhs_leader_idx != rhs_leader_idx)
          return lhs_leader_idx - rhs_leader_idx;

-     /* Group by PMU. Groups can't span PMUs. */
-     lhs_pmu_name = evsel__group_pmu_name(lhs);
-     rhs_pmu_name = evsel__group_pmu_name(rhs);
-     ret = strcmp(lhs_pmu_name, rhs_pmu_name);
-     if (ret)
-         return ret;
+     /* Group by PMU if there is a group. Groups can't span PMUs. */
+     if (lhs_has_group && rhs_has_group) {
+         lhs_pmu_name = evsel__group_pmu_name(lhs);
+         rhs_pmu_name = evsel__group_pmu_name(rhs);
+         ret = strcmp(lhs_pmu_name, rhs_pmu_name);
+         if (ret)
+             return ret;
+     }

      /* Architecture specific sorting. */
      return arch_evlist__cmp(lhs, rhs);
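The net effect is a two-key sort: events are ordered by group leader first, and the PMU name is only consulted when both events actually belong to groups, so ungrouped events keep their command-line order instead of being reshuffled by PMU. A reduced sketch of the comparator shape, using hypothetical types purely for illustration:

    #include <string.h>

    /* Hypothetical, reduced view of an event */
    struct ev {
        int leader_idx;       /* index of group leader, or own index */
        int in_group;
        const char *pmu;
    };

    static int ev_cmp(const struct ev *l, const struct ev *r)
    {
        if (l->leader_idx != r->leader_idx)     /* key 1: grouping        */
            return l->leader_idx - r->leader_idx;
        if (l->in_group && r->in_group)         /* key 2: PMU name, but   */
            return strcmp(l->pmu, r->pmu);      /* only within groups     */
        return 0;                               /* otherwise keep order   */
    }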
+1 -1
tools/perf/util/stat-display.c
···
      struct outstate *os = ctx;
      FILE *out = os->fh;

-     fprintf(out, "\"metric-value\" : %f, ", val);
+     fprintf(out, "\"metric-value\" : \"%f\", ", val);
      fprintf(out, "\"metric-unit\" : \"%s\"", unit);
      if (!config->metric_only)
          fprintf(out, "}");
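The added quotes matter because %f prints non-finite doubles as the bare tokens nan or inf, which are not valid JSON values; now that metrics can legitimately evaluate to NaN, emitting the value as a string keeps the output parseable. A minimal sketch:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double val = NAN;
        /* Old form: "metric-value" : nan    -> invalid JSON          */
        /* New form: "metric-value" : "nan"  -> a plain JSON string   */
        printf("\"metric-value\" : \"%f\"\n", val);
        return 0;
    }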
+19 -6
tools/perf/util/stat-shadow.c
···
      if (!aggr)
          break;

-     /*
-      * If an event was scaled during stat gathering, reverse
-      * the scale before computing the metric.
-      */
-     val = aggr->counts.val * (1.0 / metric_events[i]->scale);
-     source_count = evsel__source_count(metric_events[i]);
+     if (!metric_events[i]->supported) {
+         /*
+          * Not supported events will have a count of 0,
+          * which can be confusing in a
+          * metric. Explicitly set the value to NAN. Not
+          * counted events (enable time of 0) are read as
+          * 0.
+          */
+         val = NAN;
+         source_count = 0;
+     } else {
+         /*
+          * If an event was scaled during stat gathering,
+          * reverse the scale before computing the
+          * metric.
+          */
+         val = aggr->counts.val * (1.0 / metric_events[i]->scale);
+         source_count = evsel__source_count(metric_events[i]);
+     }
  }
  n = strdup(evsel__metric_id(metric_events[i]));
  if (!n)
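With val forced to NAN for unsupported events, any metric referencing such an event evaluates to NaN as well, since NaN is absorbing under arithmetic; the metric then prints as nan instead of a value silently computed from a bogus zero count. A small sketch of the distinction the comment above draws between "genuinely zero" and "not supported":

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double counted_zero = 0.0;   /* supported event that counted 0  */
        double unsupported  = NAN;   /* event the PMU does not support  */

        /* A made-up metric using each input: only the NaN one is flagged */
        printf("%f\n", counted_zero / 100.0);  /* 0.000000 */
        printf("%f\n", unsupported / 100.0);   /* nan      */
        return 0;
    }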