Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

- Fix Intel Alder Lake PEBS memory access latency & data source
profiling info bugs.

- Use Intel large-PEBS hardware feature in more circumstances, to
reduce PMI overhead & reduce sampling data.

- Extend the lost-sample profiling output with the PERF_FORMAT_LOST ABI
variant, which tells tooling the exact number of samples lost.

- Add new IBS register bit definitions.

- AMD uncore events: Add PerfMonV2 DF (Data Fabric) enhancements.

* tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/ibs: Add new IBS register bits into header
perf/x86/intel: Fix PEBS data source encoding for ADL
perf/x86/intel: Fix PEBS memory access info encoding for ADL
perf/core: Add a new read format to get a number of lost samples
perf/x86/amd/uncore: Add PerfMonV2 RDPMC assignments
perf/x86/amd/uncore: Add PerfMonV2 DF event format
perf/x86/amd/uncore: Detect available DF counters
perf/x86/amd/uncore: Use attr_update for format attributes
perf/x86/amd/uncore: Use dynamic events array
x86/events/intel/ds: Enable large PEBS for PERF_SAMPLE_WEIGHT_TYPE

+280 -84
+120 -26
arch/x86/events/amd/uncore.c
··· 21 21 #define NUM_COUNTERS_NB 4 22 22 #define NUM_COUNTERS_L2 4 23 23 #define NUM_COUNTERS_L3 6 24 - #define MAX_COUNTERS 6 25 24 26 25 #define RDPMC_BASE_NB 6 27 26 #define RDPMC_BASE_LLC 10 ··· 30 31 #undef pr_fmt 31 32 #define pr_fmt(fmt) "amd_uncore: " fmt 32 33 34 + static int pmu_version; 33 35 static int num_counters_llc; 34 36 static int num_counters_nb; 35 37 static bool l3_mask; ··· 46 46 u32 msr_base; 47 47 cpumask_t *active_mask; 48 48 struct pmu *pmu; 49 - struct perf_event *events[MAX_COUNTERS]; 49 + struct perf_event **events; 50 50 struct hlist_node node; 51 51 }; 52 52 ··· 158 158 hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; 159 159 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 160 160 161 + /* 162 + * The first four DF counters are accessible via RDPMC index 6 to 9 163 + * followed by the L3 counters from index 10 to 15. For processors 164 + * with more than four DF counters, the DF RDPMC assignments become 165 + * discontiguous as the additional counters are accessible starting 166 + * from index 16. 167 + */ 168 + if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) 169 + hwc->event_base_rdpmc += NUM_COUNTERS_L3; 170 + 161 171 if (flags & PERF_EF_START) 162 172 amd_uncore_start(event, PERF_EF_RELOAD); 163 173 ··· 219 209 { 220 210 struct amd_uncore *uncore; 221 211 struct hw_perf_event *hwc = &event->hw; 212 + u64 event_mask = AMD64_RAW_EVENT_MASK_NB; 222 213 223 214 if (event->attr.type != event->pmu->type) 224 215 return -ENOENT; 216 + 217 + if (pmu_version >= 2 && is_nb_event(event)) 218 + event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; 225 219 226 220 /* 227 221 * NB and Last level cache counters (MSRs) are shared across all cores ··· 235 221 * out. 
So we do not support sampling and per-thread events via 236 222 * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: 237 223 */ 238 - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; 224 + hwc->config = event->attr.config & event_mask; 239 225 hwc->idx = -1; 240 226 241 227 if (event->cpu < 0) ··· 259 245 event->cpu = uncore->cpu; 260 246 261 247 return 0; 248 + } 249 + 250 + static umode_t 251 + amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) 252 + { 253 + return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ? 254 + attr->mode : 0; 255 + } 256 + 257 + static umode_t 258 + amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) 259 + { 260 + return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0; 262 261 } 263 262 264 263 static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, ··· 314 287 315 288 DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); 316 289 DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ 290 + DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ 317 291 DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ 318 - DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); 292 + DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); 293 + DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */ 319 294 DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ 320 295 DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ 321 296 DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ ··· 326 297 DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ 327 298 DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ 328 299 300 + /* Common DF and NB attributes */ 329 301 static struct attribute *amd_uncore_df_format_attr[] = { 330 - 
&format_attr_event12.attr, /* event14 if F17h+ */ 331 - &format_attr_umask.attr, 302 + &format_attr_event12.attr, /* event */ 303 + &format_attr_umask8.attr, /* umask */ 332 304 NULL, 333 305 }; 334 306 307 + /* Common L2 and L3 attributes */ 335 308 static struct attribute *amd_uncore_l3_format_attr[] = { 336 - &format_attr_event12.attr, /* event8 if F17h+ */ 337 - &format_attr_umask.attr, 338 - NULL, /* slicemask if F17h, coreid if F19h */ 339 - NULL, /* threadmask8 if F17h, enallslices if F19h */ 340 - NULL, /* enallcores if F19h */ 341 - NULL, /* sliceid if F19h */ 342 - NULL, /* threadmask2 if F19h */ 309 + &format_attr_event12.attr, /* event */ 310 + &format_attr_umask8.attr, /* umask */ 311 + NULL, /* threadmask */ 312 + NULL, 313 + }; 314 + 315 + /* F17h unique L3 attributes */ 316 + static struct attribute *amd_f17h_uncore_l3_format_attr[] = { 317 + &format_attr_slicemask.attr, /* slicemask */ 318 + NULL, 319 + }; 320 + 321 + /* F19h unique L3 attributes */ 322 + static struct attribute *amd_f19h_uncore_l3_format_attr[] = { 323 + &format_attr_coreid.attr, /* coreid */ 324 + &format_attr_enallslices.attr, /* enallslices */ 325 + &format_attr_enallcores.attr, /* enallcores */ 326 + &format_attr_sliceid.attr, /* sliceid */ 343 327 NULL, 344 328 }; 345 329 ··· 366 324 .attrs = amd_uncore_l3_format_attr, 367 325 }; 368 326 327 + static struct attribute_group amd_f17h_uncore_l3_format_group = { 328 + .name = "format", 329 + .attrs = amd_f17h_uncore_l3_format_attr, 330 + .is_visible = amd_f17h_uncore_is_visible, 331 + }; 332 + 333 + static struct attribute_group amd_f19h_uncore_l3_format_group = { 334 + .name = "format", 335 + .attrs = amd_f19h_uncore_l3_format_attr, 336 + .is_visible = amd_f19h_uncore_is_visible, 337 + }; 338 + 369 339 static const struct attribute_group *amd_uncore_df_attr_groups[] = { 370 340 &amd_uncore_attr_group, 371 341 &amd_uncore_df_format_group, ··· 387 333 static const struct attribute_group *amd_uncore_l3_attr_groups[] = { 388 334 
&amd_uncore_attr_group, 389 335 &amd_uncore_l3_format_group, 336 + NULL, 337 + }; 338 + 339 + static const struct attribute_group *amd_uncore_l3_attr_update[] = { 340 + &amd_f17h_uncore_l3_format_group, 341 + &amd_f19h_uncore_l3_format_group, 390 342 NULL, 391 343 }; 392 344 ··· 413 353 static struct pmu amd_llc_pmu = { 414 354 .task_ctx_nr = perf_invalid_context, 415 355 .attr_groups = amd_uncore_l3_attr_groups, 356 + .attr_update = amd_uncore_l3_attr_update, 416 357 .name = "amd_l2", 417 358 .event_init = amd_uncore_event_init, 418 359 .add = amd_uncore_add, ··· 431 370 cpu_to_node(cpu)); 432 371 } 433 372 373 + static inline struct perf_event ** 374 + amd_uncore_events_alloc(unsigned int num, unsigned int cpu) 375 + { 376 + return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, 377 + cpu_to_node(cpu)); 378 + } 379 + 434 380 static int amd_uncore_cpu_up_prepare(unsigned int cpu) 435 381 { 436 - struct amd_uncore *uncore_nb = NULL, *uncore_llc; 382 + struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; 437 383 438 384 if (amd_uncore_nb) { 385 + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; 439 386 uncore_nb = amd_uncore_alloc(cpu); 440 387 if (!uncore_nb) 441 388 goto fail; ··· 453 384 uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; 454 385 uncore_nb->active_mask = &amd_nb_active_mask; 455 386 uncore_nb->pmu = &amd_nb_pmu; 387 + uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); 388 + if (!uncore_nb->events) 389 + goto fail; 456 390 uncore_nb->id = -1; 457 391 *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; 458 392 } 459 393 460 394 if (amd_uncore_llc) { 395 + *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; 461 396 uncore_llc = amd_uncore_alloc(cpu); 462 397 if (!uncore_llc) 463 398 goto fail; ··· 471 398 uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; 472 399 uncore_llc->active_mask = &amd_llc_active_mask; 473 400 uncore_llc->pmu = &amd_llc_pmu; 401 + uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); 402 + if (!uncore_llc->events) 
403 + goto fail; 474 404 uncore_llc->id = -1; 475 405 *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; 476 406 } ··· 481 405 return 0; 482 406 483 407 fail: 484 - if (amd_uncore_nb) 485 - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; 486 - kfree(uncore_nb); 408 + if (uncore_nb) { 409 + kfree(uncore_nb->events); 410 + kfree(uncore_nb); 411 + } 412 + 413 + if (uncore_llc) { 414 + kfree(uncore_llc->events); 415 + kfree(uncore_llc); 416 + } 417 + 487 418 return -ENOMEM; 488 419 } 489 420 ··· 623 540 if (cpu == uncore->cpu) 624 541 cpumask_clear_cpu(cpu, uncore->active_mask); 625 542 626 - if (!--uncore->refcnt) 543 + if (!--uncore->refcnt) { 544 + kfree(uncore->events); 627 545 kfree(uncore); 546 + } 547 + 628 548 *per_cpu_ptr(uncores, cpu) = NULL; 629 549 } 630 550 ··· 646 560 { 647 561 struct attribute **df_attr = amd_uncore_df_format_attr; 648 562 struct attribute **l3_attr = amd_uncore_l3_format_attr; 563 + union cpuid_0x80000022_ebx ebx; 649 564 int ret = -ENODEV; 650 565 651 566 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && ··· 655 568 656 569 if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) 657 570 return -ENODEV; 571 + 572 + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) 573 + pmu_version = 2; 658 574 659 575 num_counters_nb = NUM_COUNTERS_NB; 660 576 num_counters_llc = NUM_COUNTERS_L2; ··· 675 585 } 676 586 677 587 if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { 678 - if (boot_cpu_data.x86 >= 0x17) 588 + if (pmu_version >= 2) { 589 + *df_attr++ = &format_attr_event14v2.attr; 590 + *df_attr++ = &format_attr_umask12.attr; 591 + } else if (boot_cpu_data.x86 >= 0x17) { 679 592 *df_attr = &format_attr_event14.attr; 593 + } 680 594 681 595 amd_uncore_nb = alloc_percpu(struct amd_uncore *); 682 596 if (!amd_uncore_nb) { ··· 690 596 ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); 691 597 if (ret) 692 598 goto fail_nb; 599 + 600 + if (pmu_version >= 2) { 601 + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); 602 + num_counters_nb = ebx.split.num_df_pmc; 603 + } 693 604 694 605 
pr_info("%d %s %s counters detected\n", num_counters_nb, 695 606 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", ··· 706 607 if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { 707 608 if (boot_cpu_data.x86 >= 0x19) { 708 609 *l3_attr++ = &format_attr_event8.attr; 709 - *l3_attr++ = &format_attr_umask.attr; 710 - *l3_attr++ = &format_attr_coreid.attr; 711 - *l3_attr++ = &format_attr_enallslices.attr; 712 - *l3_attr++ = &format_attr_enallcores.attr; 713 - *l3_attr++ = &format_attr_sliceid.attr; 610 + *l3_attr++ = &format_attr_umask8.attr; 714 611 *l3_attr++ = &format_attr_threadmask2.attr; 715 612 } else if (boot_cpu_data.x86 >= 0x17) { 716 613 *l3_attr++ = &format_attr_event8.attr; 717 - *l3_attr++ = &format_attr_umask.attr; 718 - *l3_attr++ = &format_attr_slicemask.attr; 614 + *l3_attr++ = &format_attr_umask8.attr; 719 615 *l3_attr++ = &format_attr_threadmask8.attr; 720 616 } 721 617
+4 -3
arch/x86/events/intel/core.c
··· 4141 4141 { 4142 4142 struct event_constraint *c; 4143 4143 4144 + c = intel_get_event_constraints(cpuc, idx, event); 4145 + 4144 4146 /* 4145 4147 * :ppp means to do reduced skid PEBS, 4146 4148 * which is available on PMC0 and fixed counter 0. ··· 4154 4152 4155 4153 return &counter0_constraint; 4156 4154 } 4157 - 4158 - c = intel_get_event_constraints(cpuc, idx, event); 4159 4155 4160 4156 return c; 4161 4157 } ··· 6241 6241 x86_pmu.flags |= PMU_FL_INSTR_LATENCY; 6242 6242 x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; 6243 6243 x86_pmu.lbr_pt_coexist = true; 6244 - intel_pmu_pebs_data_source_skl(false); 6244 + intel_pmu_pebs_data_source_adl(); 6245 + x86_pmu.pebs_latency_data = adl_latency_data_small; 6245 6246 x86_pmu.num_topdown_events = 8; 6246 6247 x86_pmu.update_topdown_event = adl_update_topdown_event; 6247 6248 x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
+86 -43
arch/x86/events/intel/ds.c
··· 94 94 pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); 95 95 } 96 96 97 - void __init intel_pmu_pebs_data_source_skl(bool pmem) 97 + static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source) 98 98 { 99 99 u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); 100 100 101 - pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); 102 - pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); 103 - pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); 104 - pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); 105 - pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); 101 + data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); 102 + data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); 103 + data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); 104 + data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); 105 + data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); 106 + } 107 + 108 + void __init intel_pmu_pebs_data_source_skl(bool pmem) 109 + { 110 + __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source); 111 + } 112 + 113 + static void __init intel_pmu_pebs_data_source_grt(u64 *data_source) 114 + { 115 + data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); 116 + data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); 117 + data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD); 118 + } 119 + 120 + void __init intel_pmu_pebs_data_source_adl(void) 121 + { 122 + u64 *data_source; 123 + 124 + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source; 125 + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); 126 + __intel_pmu_pebs_data_source_skl(false, data_source); 127 + 128 + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; 129 + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); 130 + 
intel_pmu_pebs_data_source_grt(data_source); 106 131 } 107 132 108 133 static u64 precise_store_data(u64 status) ··· 196 171 return dse.val; 197 172 } 198 173 199 - static u64 load_latency_data(u64 status) 174 + static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock) 175 + { 176 + /* 177 + * TLB access 178 + * 0 = did not miss 2nd level TLB 179 + * 1 = missed 2nd level TLB 180 + */ 181 + if (tlb) 182 + *val |= P(TLB, MISS) | P(TLB, L2); 183 + else 184 + *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); 185 + 186 + /* locked prefix */ 187 + if (lock) 188 + *val |= P(LOCK, LOCKED); 189 + } 190 + 191 + /* Retrieve the latency data for e-core of ADL */ 192 + u64 adl_latency_data_small(struct perf_event *event, u64 status) 193 + { 194 + union intel_x86_pebs_dse dse; 195 + u64 val; 196 + 197 + WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big); 198 + 199 + dse.val = status; 200 + 201 + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; 202 + 203 + /* 204 + * For the atom core on ADL, 205 + * bit 4: lock, bit 5: TLB access. 
206 + */ 207 + pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss); 208 + 209 + if (dse.ld_data_blk) 210 + val |= P(BLK, DATA); 211 + else 212 + val |= P(BLK, NA); 213 + 214 + return val; 215 + } 216 + 217 + static u64 load_latency_data(struct perf_event *event, u64 status) 200 218 { 201 219 union intel_x86_pebs_dse dse; 202 220 u64 val; ··· 249 181 /* 250 182 * use the mapping table for bit 0-3 251 183 */ 252 - val = pebs_data_source[dse.ld_dse]; 184 + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; 253 185 254 186 /* 255 187 * Nehalem models do not support TLB, Lock infos ··· 258 190 val |= P(TLB, NA) | P(LOCK, NA); 259 191 return val; 260 192 } 261 - /* 262 - * bit 4: TLB access 263 - * 0 = did not miss 2nd level TLB 264 - * 1 = missed 2nd level TLB 265 - */ 266 - if (dse.ld_stlb_miss) 267 - val |= P(TLB, MISS) | P(TLB, L2); 268 - else 269 - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); 270 193 271 - /* 272 - * bit 5: locked prefix 273 - */ 274 - if (dse.ld_locked) 275 - val |= P(LOCK, LOCKED); 194 + pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked); 276 195 277 196 /* 278 197 * Ice Lake and earlier models do not support block infos. 
··· 288 233 return val; 289 234 } 290 235 291 - static u64 store_latency_data(u64 status) 236 + static u64 store_latency_data(struct perf_event *event, u64 status) 292 237 { 293 238 union intel_x86_pebs_dse dse; 294 239 u64 val; ··· 298 243 /* 299 244 * use the mapping table for bit 0-3 300 245 */ 301 - val = pebs_data_source[dse.st_lat_dse]; 246 + val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse]; 302 247 303 - /* 304 - * bit 4: TLB access 305 - * 0 = did not miss 2nd level TLB 306 - * 1 = missed 2nd level TLB 307 - */ 308 - if (dse.st_lat_stlb_miss) 309 - val |= P(TLB, MISS) | P(TLB, L2); 310 - else 311 - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); 312 - 313 - /* 314 - * bit 5: locked prefix 315 - */ 316 - if (dse.st_lat_locked) 317 - val |= P(LOCK, LOCKED); 248 + pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked); 318 249 319 250 val |= P(BLK, NA); 320 251 ··· 822 781 823 782 struct event_constraint intel_grt_pebs_event_constraints[] = { 824 783 /* Allow all events as PEBS with no flags */ 825 - INTEL_PLD_CONSTRAINT(0x5d0, 0xf), 826 - INTEL_PSD_CONSTRAINT(0x6d0, 0xf), 784 + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf), 785 + INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf), 827 786 EVENT_CONSTRAINT_END 828 787 }; 829 788 ··· 1484 1443 bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); 1485 1444 1486 1445 if (fl & PERF_X86_EVENT_PEBS_LDLAT) 1487 - val = load_latency_data(aux); 1446 + val = load_latency_data(event, aux); 1488 1447 else if (fl & PERF_X86_EVENT_PEBS_STLAT) 1489 - val = store_latency_data(aux); 1448 + val = store_latency_data(event, aux); 1449 + else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID) 1450 + val = x86_pmu.pebs_latency_data(event, aux); 1490 1451 else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) 1491 1452 val = precise_datala_hsw(event, aux); 1492 1453 else if (fst)
+16 -1
arch/x86/events/perf_event.h
··· 84 84 #define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ 85 85 #define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ 86 86 #define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ 87 + #define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ 87 88 88 89 static inline bool is_topdown_count(struct perf_event *event) 89 90 { ··· 137 136 PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ 138 137 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ 139 138 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ 140 - PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) 139 + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \ 140 + PERF_SAMPLE_WEIGHT_TYPE) 141 141 142 142 #define PEBS_GP_REGS \ 143 143 ((1ULL << PERF_REG_X86_AX) | \ ··· 462 460 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ 463 461 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) 464 462 463 + #define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \ 464 + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ 465 + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID) 466 + 465 467 /* Event constraint, but match on all event flags too. 
*/ 466 468 #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ 467 469 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) ··· 644 638 x86_lbr_exclusive_max, 645 639 }; 646 640 641 + #define PERF_PEBS_DATA_SOURCE_MAX 0x10 642 + 647 643 struct x86_hybrid_pmu { 648 644 struct pmu pmu; 649 645 const char *name; ··· 673 665 unsigned int late_ack :1, 674 666 mid_ack :1, 675 667 enabled_ack :1; 668 + 669 + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; 676 670 }; 677 671 678 672 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) ··· 835 825 void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); 836 826 struct event_constraint *pebs_constraints; 837 827 void (*pebs_aliases)(struct perf_event *event); 828 + u64 (*pebs_latency_data)(struct perf_event *event, u64 status); 838 829 unsigned long large_pebs_flags; 839 830 u64 rtm_abort_event; 840 831 ··· 1403 1392 1404 1393 int intel_pmu_drain_bts_buffer(void); 1405 1394 1395 + u64 adl_latency_data_small(struct perf_event *event, u64 status); 1396 + 1406 1397 extern struct event_constraint intel_core2_pebs_event_constraints[]; 1407 1398 1408 1399 extern struct event_constraint intel_atom_pebs_event_constraints[]; ··· 1511 1498 void intel_pmu_pebs_data_source_nhm(void); 1512 1499 1513 1500 void intel_pmu_pebs_data_source_skl(bool pmem); 1501 + 1502 + void intel_pmu_pebs_data_source_adl(void); 1514 1503 1515 1504 int intel_pmu_setup_lbr_filter(struct perf_event *event); 1516 1505
+10 -6
arch/x86/include/asm/amd-ibs.h
··· 29 29 rand_en:1, /* 57: random tagging enable */ 30 30 fetch_l2_miss:1,/* 58: L2 miss for sampled fetch 31 31 * (needs IbsFetchComp) */ 32 - reserved:5; /* 59-63: reserved */ 32 + l3_miss_only:1, /* 59: Collect L3 miss samples only */ 33 + fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */ 34 + fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */ 35 + reserved:2; /* 62-63: reserved */ 33 36 }; 34 37 }; 35 38 ··· 41 38 __u64 val; 42 39 struct { 43 40 __u64 opmaxcnt:16, /* 0-15: periodic op max. count */ 44 - reserved0:1, /* 16: reserved */ 41 + l3_miss_only:1, /* 16: Collect L3 miss samples only */ 45 42 op_en:1, /* 17: op sampling enable */ 46 43 op_val:1, /* 18: op sample valid */ 47 44 cnt_ctl:1, /* 19: periodic op counter control */ 48 45 opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ 49 - reserved1:5, /* 27-31: reserved */ 46 + reserved0:5, /* 27-31: reserved */ 50 47 opcurcnt:27, /* 32-58: periodic op counter current count */ 51 - reserved2:5; /* 59-63: reserved */ 48 + reserved1:5; /* 59-63: reserved */ 52 49 }; 53 50 }; 54 51 ··· 74 71 union ibs_op_data2 { 75 72 __u64 val; 76 73 struct { 77 - __u64 data_src:3, /* 0-2: data source */ 74 + __u64 data_src_lo:3, /* 0-2: data source low */ 78 75 reserved0:1, /* 3: reserved */ 79 76 rmt_node:1, /* 4: destination node */ 80 77 cache_hit_st:1, /* 5: cache hit state */ 81 - reserved1:57; /* 5-63: reserved */ 78 + data_src_hi:2, /* 6-7: data source high */ 79 + reserved1:56; /* 8-63: reserved */ 82 80 }; 83 81 }; 84 82
+16
arch/x86/include/asm/perf_event.h
··· 89 89 #define AMD64_RAW_EVENT_MASK_NB \ 90 90 (AMD64_EVENTSEL_EVENT | \ 91 91 ARCH_PERFMON_EVENTSEL_UMASK) 92 + 93 + #define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \ 94 + (AMD64_EVENTSEL_EVENT | \ 95 + GENMASK_ULL(37, 36)) 96 + 97 + #define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \ 98 + (ARCH_PERFMON_EVENTSEL_UMASK | \ 99 + GENMASK_ULL(27, 24)) 100 + 101 + #define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \ 102 + (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ 103 + AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) 104 + 92 105 #define AMD64_NUM_COUNTERS 4 93 106 #define AMD64_NUM_COUNTERS_CORE 6 94 107 #define AMD64_NUM_COUNTERS_NB 4 ··· 207 194 struct { 208 195 /* Number of Core Performance Counters */ 209 196 unsigned int num_core_pmc:4; 197 + unsigned int reserved:6; 198 + /* Number of Data Fabric Counters */ 199 + unsigned int num_df_pmc:6; 210 200 } split; 211 201 unsigned int full; 212 202 };
+2
include/linux/perf_event.h
··· 759 759 struct pid_namespace *ns; 760 760 u64 id; 761 761 762 + atomic64_t lost_samples; 763 + 762 764 u64 (*clock)(void); 763 765 perf_overflow_handler_t overflow_handler; 764 766 void *overflow_handler_context;
+4 -1
include/uapi/linux/perf_event.h
··· 301 301 * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED 302 302 * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING 303 303 * { u64 id; } && PERF_FORMAT_ID 304 + * { u64 lost; } && PERF_FORMAT_LOST 304 305 * } && !PERF_FORMAT_GROUP 305 306 * 306 307 * { u64 nr; ··· 309 308 * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING 310 309 * { u64 value; 311 310 * { u64 id; } && PERF_FORMAT_ID 311 + * { u64 lost; } && PERF_FORMAT_LOST 312 312 * } cntr[nr]; 313 313 * } && PERF_FORMAT_GROUP 314 314 * }; ··· 319 317 PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, 320 318 PERF_FORMAT_ID = 1U << 2, 321 319 PERF_FORMAT_GROUP = 1U << 3, 320 + PERF_FORMAT_LOST = 1U << 4, 322 321 323 - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ 322 + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ 324 323 }; 325 324 326 325 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
+18 -3
kernel/events/core.c
··· 1819 1819 if (event->attr.read_format & PERF_FORMAT_ID) 1820 1820 entry += sizeof(u64); 1821 1821 1822 + if (event->attr.read_format & PERF_FORMAT_LOST) 1823 + entry += sizeof(u64); 1824 + 1822 1825 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1823 1826 nr += nr_siblings; 1824 1827 size += sizeof(u64); ··· 5263 5260 values[n++] += perf_event_count(leader); 5264 5261 if (read_format & PERF_FORMAT_ID) 5265 5262 values[n++] = primary_event_id(leader); 5263 + if (read_format & PERF_FORMAT_LOST) 5264 + values[n++] = atomic64_read(&leader->lost_samples); 5266 5265 5267 5266 for_each_sibling_event(sub, leader) { 5268 5267 values[n++] += perf_event_count(sub); 5269 5268 if (read_format & PERF_FORMAT_ID) 5270 5269 values[n++] = primary_event_id(sub); 5270 + if (read_format & PERF_FORMAT_LOST) 5271 + values[n++] = atomic64_read(&sub->lost_samples); 5271 5272 } 5272 5273 5273 5274 raw_spin_unlock_irqrestore(&ctx->lock, flags); ··· 5328 5321 u64 read_format, char __user *buf) 5329 5322 { 5330 5323 u64 enabled, running; 5331 - u64 values[4]; 5324 + u64 values[5]; 5332 5325 int n = 0; 5333 5326 5334 5327 values[n++] = __perf_event_read_value(event, &enabled, &running); ··· 5338 5331 values[n++] = running; 5339 5332 if (read_format & PERF_FORMAT_ID) 5340 5333 values[n++] = primary_event_id(event); 5334 + if (read_format & PERF_FORMAT_LOST) 5335 + values[n++] = atomic64_read(&event->lost_samples); 5341 5336 5342 5337 if (copy_to_user(buf, values, n * sizeof(u64))) 5343 5338 return -EFAULT; ··· 6867 6858 u64 enabled, u64 running) 6868 6859 { 6869 6860 u64 read_format = event->attr.read_format; 6870 - u64 values[4]; 6861 + u64 values[5]; 6871 6862 int n = 0; 6872 6863 6873 6864 values[n++] = perf_event_count(event); ··· 6881 6872 } 6882 6873 if (read_format & PERF_FORMAT_ID) 6883 6874 values[n++] = primary_event_id(event); 6875 + if (read_format & PERF_FORMAT_LOST) 6876 + values[n++] = atomic64_read(&event->lost_samples); 6884 6877 6885 6878 __output_copy(handle, values, n 
* sizeof(u64)); 6886 6879 } ··· 6893 6882 { 6894 6883 struct perf_event *leader = event->group_leader, *sub; 6895 6884 u64 read_format = event->attr.read_format; 6896 - u64 values[5]; 6885 + u64 values[6]; 6897 6886 int n = 0; 6898 6887 6899 6888 values[n++] = 1 + leader->nr_siblings; ··· 6911 6900 values[n++] = perf_event_count(leader); 6912 6901 if (read_format & PERF_FORMAT_ID) 6913 6902 values[n++] = primary_event_id(leader); 6903 + if (read_format & PERF_FORMAT_LOST) 6904 + values[n++] = atomic64_read(&leader->lost_samples); 6914 6905 6915 6906 __output_copy(handle, values, n * sizeof(u64)); 6916 6907 ··· 6926 6913 values[n++] = perf_event_count(sub); 6927 6914 if (read_format & PERF_FORMAT_ID) 6928 6915 values[n++] = primary_event_id(sub); 6916 + if (read_format & PERF_FORMAT_LOST) 6917 + values[n++] = atomic64_read(&sub->lost_samples); 6929 6918 6930 6919 __output_copy(handle, values, n * sizeof(u64)); 6931 6920 }
+4 -1
kernel/events/ring_buffer.c
··· 172 172 goto out; 173 173 174 174 if (unlikely(rb->paused)) { 175 - if (rb->nr_pages) 175 + if (rb->nr_pages) { 176 176 local_inc(&rb->lost); 177 + atomic64_inc(&event->lost_samples); 178 + } 177 179 goto out; 178 180 } 179 181 ··· 256 254 257 255 fail: 258 256 local_inc(&rb->lost); 257 + atomic64_inc(&event->lost_samples); 259 258 perf_output_put_handle(handle); 260 259 out: 261 260 rcu_read_unlock();