Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf/x86/intel: Add support for PEBS memory auxiliary info field in DMR

With the introduction of the OMR feature, the PEBS memory auxiliary info
field for load and store latency events has been restructured for DMR.

The memory auxiliary info field's bit[8] indicates whether an L2 cache
miss occurred for a memory load or store instruction. If bit[8] is 0,
it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
the OMR encoding, indicating the specific L3 cache or memory region
involved in the memory access. A significant enhancement is that the OMR
encoding provides up to 8 fine-grained memory regions besides the cache
region.

A significant enhancement for OMR encoding is the ability to provide
up to 8 fine-grained memory regions in addition to the cache region,
offering more detailed insights into memory access regions.

For detailed information on the memory auxiliary info encoding, please
refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
the ISE documentation.

This patch ensures that the PEBS memory auxiliary info field is correctly
interpreted and utilized in DMR.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com

authored by

Dapeng Mi and committed by
Peter Zijlstra
d2bdcde9 4e955c08

+190 -6
+140
arch/x86/events/intel/ds.c
··· 34 34 35 35 */ 36 36 37 + union omr_encoding { 38 + struct { 39 + u8 omr_source : 4; 40 + u8 omr_remote : 1; 41 + u8 omr_hitm : 1; 42 + u8 omr_snoop : 1; 43 + u8 omr_promoted : 1; 44 + }; 45 + u8 omr_full; 46 + }; 47 + 37 48 union intel_x86_pebs_dse { 38 49 u64 val; 39 50 struct { ··· 83 72 unsigned int lnc_data_blk:1; 84 73 unsigned int lnc_addr_blk:1; 85 74 unsigned int ld_reserved6:18; 75 + }; 76 + struct { 77 + unsigned int pnc_dse: 8; 78 + unsigned int pnc_l2_miss:1; 79 + unsigned int pnc_stlb_clean_hit:1; 80 + unsigned int pnc_stlb_any_hit:1; 81 + unsigned int pnc_stlb_miss:1; 82 + unsigned int pnc_locked:1; 83 + unsigned int pnc_data_blk:1; 84 + unsigned int pnc_addr_blk:1; 85 + unsigned int pnc_fb_full:1; 86 + unsigned int ld_reserved8:16; 86 87 }; 87 88 }; 88 89 ··· 249 226 data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; 250 227 memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); 251 228 __intel_pmu_pebs_data_source_cmt(data_source); 229 + } 230 + 231 + /* Version for Panthercove and later */ 232 + 233 + /* L2 hit */ 234 + #define PNC_PEBS_DATA_SOURCE_MAX 16 235 + static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = { 236 + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: non-cache access */ 237 + OP_LH | LEVEL(L0) | P(SNOOP, NONE), /* 0x01: L0 hit */ 238 + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ 239 + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: L1 Miss Handling Buffer hit */ 240 + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x04: L2 Hit Clean */ 241 + 0, /* 0x05: Reserved */ 242 + 0, /* 0x06: Reserved */ 243 + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HIT), /* 0x07: L2 Hit Snoop HIT */ 244 + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, HITM), /* 0x08: L2 Hit Snoop Hit Modified */ 245 + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x09: Prefetch Promotion */ 246 + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, MISS), /* 0x0a: Cross 
Core Prefetch Promotion */ 247 + 0, /* 0x0b: Reserved */ 248 + 0, /* 0x0c: Reserved */ 249 + 0, /* 0x0d: Reserved */ 250 + 0, /* 0x0e: Reserved */ 251 + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ 252 + }; 253 + 254 + /* L2 miss */ 255 + #define OMR_DATA_SOURCE_MAX 16 256 + static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = { 257 + P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA), /* 0x00: invalid */ 258 + 0, /* 0x01: Reserved */ 259 + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE), /* 0x02: local CA shared cache */ 260 + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */ 261 + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO), /* 0x04: other CA IO agent */ 262 + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE), /* 0x05: other CA shared cache */ 263 + OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */ 264 + OP_LH | LEVEL(RAM) | P(REGION, MMIO), /* 0x07: MMIO */ 265 + OP_LH | LEVEL(RAM) | P(REGION, MEM0), /* 0x08: Memory region 0 */ 266 + OP_LH | LEVEL(RAM) | P(REGION, MEM1), /* 0x09: Memory region 1 */ 267 + OP_LH | LEVEL(RAM) | P(REGION, MEM2), /* 0x0a: Memory region 2 */ 268 + OP_LH | LEVEL(RAM) | P(REGION, MEM3), /* 0x0b: Memory region 3 */ 269 + OP_LH | LEVEL(RAM) | P(REGION, MEM4), /* 0x0c: Memory region 4 */ 270 + OP_LH | LEVEL(RAM) | P(REGION, MEM5), /* 0x0d: Memory region 5 */ 271 + OP_LH | LEVEL(RAM) | P(REGION, MEM6), /* 0x0e: Memory region 6 */ 272 + OP_LH | LEVEL(RAM) | P(REGION, MEM7), /* 0x0f: Memory region 7 */ 273 + }; 274 + 275 + static u64 parse_omr_data_source(u8 dse) 276 + { 277 + union omr_encoding omr; 278 + u64 val = 0; 279 + 280 + omr.omr_full = dse; 281 + val = omr_data_source[omr.omr_source]; 282 + if (omr.omr_source > 0x1 && omr.omr_source < 0x7) 283 + val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0; 284 + else if (omr.omr_source > 0x7) 285 + val |= omr.omr_remote ? 
P(LVL, REM_RAM1) : P(LVL, LOC_RAM); 286 + 287 + if (omr.omr_remote) 288 + val |= REM; 289 + 290 + val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT); 291 + 292 + if (omr.omr_source == 0x2) { 293 + u8 snoop = omr.omr_snoop | omr.omr_promoted; 294 + 295 + if (snoop == 0x0) 296 + val |= P(SNOOP, NA); 297 + else if (snoop == 0x1) 298 + val |= P(SNOOP, MISS); 299 + else if (snoop == 0x2) 300 + val |= P(SNOOP, HIT); 301 + else if (snoop == 0x3) 302 + val |= P(SNOOP, NONE); 303 + } else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) { 304 + val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0; 305 + } 306 + 307 + return val; 252 308 } 253 309 254 310 static u64 precise_store_data(u64 status) ··· 511 409 return cmt_latency_data(event, status); 512 410 513 411 return lnl_latency_data(event, status); 412 + } 413 + 414 + u64 pnc_latency_data(struct perf_event *event, u64 status) 415 + { 416 + union intel_x86_pebs_dse dse; 417 + union perf_mem_data_src src; 418 + u64 val; 419 + 420 + dse.val = status; 421 + 422 + if (!dse.pnc_l2_miss) 423 + val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf]; 424 + else 425 + val = parse_omr_data_source(dse.pnc_dse); 426 + 427 + if (!val) 428 + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); 429 + 430 + if (dse.pnc_stlb_miss) 431 + val |= P(TLB, MISS) | P(TLB, L2); 432 + else 433 + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); 434 + 435 + if (dse.pnc_locked) 436 + val |= P(LOCK, LOCKED); 437 + 438 + if (dse.pnc_data_blk) 439 + val |= P(BLK, DATA); 440 + if (dse.pnc_addr_blk) 441 + val |= P(BLK, ADDR); 442 + if (!dse.pnc_data_blk && !dse.pnc_addr_blk) 443 + val |= P(BLK, NA); 444 + 445 + src.val = val; 446 + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) 447 + src.mem_op = P(OP, STORE); 448 + 449 + return src.val; 514 450 } 515 451 516 452 static u64 load_latency_data(struct perf_event *event, u64 status)
+2
arch/x86/events/perf_event.h
··· 1664 1664 1665 1665 u64 arl_h_latency_data(struct perf_event *event, u64 status); 1666 1666 1667 + u64 pnc_latency_data(struct perf_event *event, u64 status); 1668 + 1667 1669 extern struct event_constraint intel_core2_pebs_event_constraints[]; 1668 1670 1669 1671 extern struct event_constraint intel_atom_pebs_event_constraints[];
+24 -3
include/uapi/linux/perf_event.h
··· 1330 1330 mem_snoopx : 2, /* Snoop mode, ext */ 1331 1331 mem_blk : 3, /* Access blocked */ 1332 1332 mem_hops : 3, /* Hop level */ 1333 - mem_rsvd : 18; 1333 + mem_region : 5, /* cache/memory regions */ 1334 + mem_rsvd : 13; 1334 1335 }; 1335 1336 }; 1336 1337 #elif defined(__BIG_ENDIAN_BITFIELD) 1337 1338 union perf_mem_data_src { 1338 1339 __u64 val; 1339 1340 struct { 1340 - __u64 mem_rsvd : 18, 1341 + __u64 mem_rsvd : 13, 1342 + mem_region : 5, /* cache/memory regions */ 1341 1343 mem_hops : 3, /* Hop level */ 1342 1344 mem_blk : 3, /* Access blocked */ 1343 1345 mem_snoopx : 2, /* Snoop mode, ext */ ··· 1396 1394 #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ 1397 1395 #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ 1398 1396 #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ 1399 - /* 0x007 available */ 1397 + #define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ 1400 1398 #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ 1401 1399 #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ 1402 1400 #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ ··· 1448 1446 #define PERF_MEM_HOPS_3 0x0004 /* Remote board */ 1449 1447 /* 5-7 available */ 1450 1448 #define PERF_MEM_HOPS_SHIFT 43 1449 + 1450 + /* Cache/Memory region */ 1451 + #define PERF_MEM_REGION_NA 0x0 /* Invalid */ 1452 + #define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ 1453 + #define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ 1454 + #define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ 1455 + #define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ 1456 + #define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ 1457 + #define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ 1458 + #define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ 1459 + #define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ 1460 + #define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ 1461 + #define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ 1462 + #define PERF_MEM_REGION_MEM3 
0x0b /* Memory region 3 */ 1463 + #define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ 1464 + #define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ 1465 + #define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ 1466 + #define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ 1467 + #define PERF_MEM_REGION_SHIFT 46 1451 1468 1452 1469 #define PERF_MEM_S(a, s) \ 1453 1470 (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
+24 -3
tools/include/uapi/linux/perf_event.h
··· 1330 1330 mem_snoopx : 2, /* Snoop mode, ext */ 1331 1331 mem_blk : 3, /* Access blocked */ 1332 1332 mem_hops : 3, /* Hop level */ 1333 - mem_rsvd : 18; 1333 + mem_region : 5, /* cache/memory regions */ 1334 + mem_rsvd : 13; 1334 1335 }; 1335 1336 }; 1336 1337 #elif defined(__BIG_ENDIAN_BITFIELD) 1337 1338 union perf_mem_data_src { 1338 1339 __u64 val; 1339 1340 struct { 1340 - __u64 mem_rsvd : 18, 1341 + __u64 mem_rsvd : 13, 1342 + mem_region : 5, /* cache/memory regions */ 1341 1343 mem_hops : 3, /* Hop level */ 1342 1344 mem_blk : 3, /* Access blocked */ 1343 1345 mem_snoopx : 2, /* Snoop mode, ext */ ··· 1396 1394 #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ 1397 1395 #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ 1398 1396 #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ 1399 - /* 0x007 available */ 1397 + #define PERF_MEM_LVLNUM_L0 0x0007 /* L0 */ 1400 1398 #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ 1401 1399 #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ 1402 1400 #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ ··· 1448 1446 #define PERF_MEM_HOPS_3 0x0004 /* Remote board */ 1449 1447 /* 5-7 available */ 1450 1448 #define PERF_MEM_HOPS_SHIFT 43 1449 + 1450 + /* Cache/Memory region */ 1451 + #define PERF_MEM_REGION_NA 0x0 /* Invalid */ 1452 + #define PERF_MEM_REGION_RSVD 0x01 /* Reserved */ 1453 + #define PERF_MEM_REGION_L_SHARE 0x02 /* Local CA shared cache */ 1454 + #define PERF_MEM_REGION_L_NON_SHARE 0x03 /* Local CA non-shared cache */ 1455 + #define PERF_MEM_REGION_O_IO 0x04 /* Other CA IO agent */ 1456 + #define PERF_MEM_REGION_O_SHARE 0x05 /* Other CA shared cache */ 1457 + #define PERF_MEM_REGION_O_NON_SHARE 0x06 /* Other CA non-shared cache */ 1458 + #define PERF_MEM_REGION_MMIO 0x07 /* MMIO */ 1459 + #define PERF_MEM_REGION_MEM0 0x08 /* Memory region 0 */ 1460 + #define PERF_MEM_REGION_MEM1 0x09 /* Memory region 1 */ 1461 + #define PERF_MEM_REGION_MEM2 0x0a /* Memory region 2 */ 1462 + #define PERF_MEM_REGION_MEM3 
0x0b /* Memory region 3 */ 1463 + #define PERF_MEM_REGION_MEM4 0x0c /* Memory region 4 */ 1464 + #define PERF_MEM_REGION_MEM5 0x0d /* Memory region 5 */ 1465 + #define PERF_MEM_REGION_MEM6 0x0e /* Memory region 6 */ 1466 + #define PERF_MEM_REGION_MEM7 0x0f /* Memory region 7 */ 1467 + #define PERF_MEM_REGION_SHIFT 46 1451 1468 1452 1469 #define PERF_MEM_S(a, s) \ 1453 1470 (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)