Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"The biggest chunk of the changes are two regression fixes: a HT
workaround fix and an event-group scheduling fix. It's been verified
with 5 days of fuzzer testing.

Other fixes:

- eBPF fix
- a BIOS breakage detection fix
- PMU driver fixes"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/intel/pt: Fix a refactoring bug
perf/x86: Tweak broken BIOS rules during check_hw_exists()
perf/x86/intel/pt: Untangle pt_buffer_reset_markers()
perf: Disallow sparse AUX allocations for non-SG PMUs in overwrite mode
perf/x86: Improve HT workaround GP counter constraint
perf/x86: Fix event/group validation
perf: Fix race in BPF program unregister

+139 -84
+62 -21
arch/x86/kernel/cpu/perf_event.c
··· 190 190 u64 val, val_fail, val_new= ~0; 191 191 int i, reg, reg_fail, ret = 0; 192 192 int bios_fail = 0; 193 + int reg_safe = -1; 193 194 194 195 /* 195 196 * Check to see if the BIOS enabled any of the counters, if so ··· 205 204 bios_fail = 1; 206 205 val_fail = val; 207 206 reg_fail = reg; 207 + } else { 208 + reg_safe = i; 208 209 } 209 210 } 210 211 ··· 225 222 } 226 223 227 224 /* 225 + * If all the counters are enabled, the below test will always 226 + * fail. The tools will also become useless in this scenario. 227 + * Just fail and disable the hardware counters. 228 + */ 229 + 230 + if (reg_safe == -1) { 231 + reg = reg_safe; 232 + goto msr_fail; 233 + } 234 + 235 + /* 228 236 * Read the current value, change it and read it back to see if it 229 237 * matches, this is needed to detect certain hardware emulators 230 238 * (qemu/kvm) that don't trap on the MSR access and always return 0s. 231 239 */ 232 - reg = x86_pmu_event_addr(0); 240 + reg = x86_pmu_event_addr(reg_safe); 233 241 if (rdmsrl_safe(reg, &val)) 234 242 goto msr_fail; 235 243 val ^= 0xffffUL; ··· 625 611 int event; /* event index */ 626 612 int counter; /* counter index */ 627 613 int unassigned; /* number of events to be assigned left */ 614 + int nr_gp; /* number of GP counters used */ 628 615 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 629 616 }; 630 617 ··· 635 620 struct perf_sched { 636 621 int max_weight; 637 622 int max_events; 638 - struct perf_event **events; 639 - struct sched_state state; 623 + int max_gp; 640 624 int saved_states; 625 + struct event_constraint **constraints; 626 + struct sched_state state; 641 627 struct sched_state saved[SCHED_STATES_MAX]; 642 628 }; 643 629 644 630 /* 645 631 * Initialize interator that runs through all events and counters. 646 632 */ 647 - static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, 648 - int num, int wmin, int wmax) 633 + static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, 634 + int num, int wmin, int wmax, int gpmax) 649 635 { 650 636 int idx; 651 637 652 638 memset(sched, 0, sizeof(*sched)); 653 639 sched->max_events = num; 654 640 sched->max_weight = wmax; 655 - sched->events = events; 641 + sched->max_gp = gpmax; 642 + sched->constraints = constraints; 656 643 657 644 for (idx = 0; idx < num; idx++) { 658 - if (events[idx]->hw.constraint->weight == wmin) 645 + if (constraints[idx]->weight == wmin) 659 646 break; 660 647 } 661 648 ··· 704 687 if (sched->state.event >= sched->max_events) 705 688 return false; 706 689 707 - c = sched->events[sched->state.event]->hw.constraint; 690 + c = sched->constraints[sched->state.event]; 708 691 /* Prefer fixed purpose counters */ 709 692 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { 710 693 idx = INTEL_PMC_IDX_FIXED; ··· 713 696 goto done; 714 697 } 715 698 } 699 + 716 700 /* Grab the first unused counter starting with idx */ 717 701 idx = sched->state.counter; 718 702 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { 719 - if (!__test_and_set_bit(idx, sched->state.used)) 703 + if (!__test_and_set_bit(idx, sched->state.used)) { 704 + if (sched->state.nr_gp++ >= sched->max_gp) 705 + return false; 706 + 720 707 goto done; 708 + } 721 709 } 722 710 723 711 return false; ··· 767 745 if (sched->state.weight > sched->max_weight) 768 746 return false; 769 747 } 770 - c = sched->events[sched->state.event]->hw.constraint; 748 + c = sched->constraints[sched->state.event]; 771 749 } while (c->weight != sched->state.weight); 772 750 773 751 sched->state.counter = 0; /* start with first counter */ ··· 778 756 /* 779 757 * Assign a counter for each event. 780 758 */ 781 - int perf_assign_events(struct perf_event **events, int n, 782 - int wmin, int wmax, int *assign) 759 + int perf_assign_events(struct event_constraint **constraints, int n, 760 + int wmin, int wmax, int gpmax, int *assign) 783 761 { 784 762 struct perf_sched sched; 785 763 786 - perf_sched_init(&sched, events, n, wmin, wmax); 764 + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); 787 765 788 766 do { 789 767 if (!perf_sched_find_counter(&sched)) ··· 810 788 x86_pmu.start_scheduling(cpuc); 811 789 812 790 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { 813 - hwc = &cpuc->event_list[i]->hw; 791 + cpuc->event_constraint[i] = NULL; 814 792 c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); 815 - hwc->constraint = c; 793 + cpuc->event_constraint[i] = c; 816 794 817 795 wmin = min(wmin, c->weight); 818 796 wmax = max(wmax, c->weight); ··· 823 801 */ 824 802 for (i = 0; i < n; i++) { 825 803 hwc = &cpuc->event_list[i]->hw; 826 - c = hwc->constraint; 804 + c = cpuc->event_constraint[i]; 827 805 828 806 /* never assigned */ 829 807 if (hwc->idx == -1) ··· 843 821 } 844 822 845 823 /* slow path */ 846 - if (i != n) 847 - unsched = perf_assign_events(cpuc->event_list, n, wmin, 848 - wmax, assign); 824 + if (i != n) { 825 + int gpmax = x86_pmu.num_counters; 826 + 827 + /* 828 + * Do not allow scheduling of more than half the available 829 + * generic counters. 830 + * 831 + * This helps avoid counter starvation of sibling thread by 832 + * ensuring at most half the counters cannot be in exclusive 833 + * mode. There is no designated counters for the limits. Any 834 + * N/2 counters can be used. This helps with events with 835 + * specific counter constraints. 836 + */ 837 + if (is_ht_workaround_enabled() && !cpuc->is_fake && 838 + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) 839 + gpmax /= 2; 840 + 841 + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, 842 + wmax, gpmax, assign); 843 + } 849 844 850 845 /* 851 846 * In case of success (unsched = 0), mark events as committed, ··· 879 840 e = cpuc->event_list[i]; 880 841 e->hw.flags |= PERF_X86_EVENT_COMMITTED; 881 842 if (x86_pmu.commit_scheduling) 882 - x86_pmu.commit_scheduling(cpuc, e, assign[i]); 843 + x86_pmu.commit_scheduling(cpuc, i, assign[i]); 883 844 } 884 845 } 885 846 ··· 1331 1292 x86_pmu.put_event_constraints(cpuc, event); 1332 1293 1333 1294 /* Delete the array entry. */ 1334 - while (++i < cpuc->n_events) 1295 + while (++i < cpuc->n_events) { 1335 1296 cpuc->event_list[i-1] = cpuc->event_list[i]; 1297 + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; 1298 + } 1336 1299 --cpuc->n_events; 1337 1300 1338 1301 perf_event_update_userpage(event);
+17 -7
arch/x86/kernel/cpu/perf_event.h
··· 74 74 #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ 75 75 #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ 76 76 #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ 77 + #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ 77 78 78 79 79 80 struct amd_nb { ··· 135 134 struct intel_excl_states { 136 135 enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; 137 136 enum intel_excl_state_type state[X86_PMC_IDX_MAX]; 138 - int num_alloc_cntrs;/* #counters allocated */ 139 - int max_alloc_cntrs;/* max #counters allowed */ 140 137 bool sched_started; /* true if scheduling has started */ 141 138 }; 142 139 ··· 142 143 raw_spinlock_t lock; 143 144 144 145 struct intel_excl_states states[2]; 146 + 147 + union { 148 + u16 has_exclusive[2]; 149 + u32 exclusive_present; 150 + }; 145 151 146 152 int refcnt; /* per-core: #HT threads */ 147 153 unsigned core_id; /* per-core: core id */ ··· 176 172 added in the current transaction */ 177 173 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 178 174 u64 tags[X86_PMC_IDX_MAX]; 175 + 179 176 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 177 + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; 178 + 179 + int n_excl; /* the number of exclusive events */ 180 180 181 181 unsigned int group_flag; 182 182 int is_fake; ··· 527 519 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 528 520 struct perf_event *event); 529 521 530 - void (*commit_scheduling)(struct cpu_hw_events *cpuc, 531 - struct perf_event *event, 532 - int cntr); 522 + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); 533 523 534 524 void (*start_scheduling)(struct cpu_hw_events *cpuc); 535 525 ··· 723 717 724 718 void x86_pmu_enable_all(int added); 725 719 726 - int perf_assign_events(struct perf_event **events, int n, 727 - int wmin, int wmax, int *assign); 720 + int perf_assign_events(struct event_constraint **constraints, int n, 721 + int wmin, int wmax, int gpmax, int *assign); 728 722 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); 729 723 730 724 void x86_pmu_stop(struct perf_event *event, int flags); ··· 935 929 return NULL; 936 930 } 937 931 932 + static inline int is_ht_workaround_enabled(void) 933 + { 934 + return 0; 935 + } 938 936 #endif /* CONFIG_CPU_SUP_INTEL */
+14 -31
arch/x86/kernel/cpu/perf_event_intel.c
··· 1923 1923 xl = &excl_cntrs->states[tid]; 1924 1924 1925 1925 xl->sched_started = true; 1926 - xl->num_alloc_cntrs = 0; 1927 1926 /* 1928 1927 * lock shared state until we are done scheduling 1929 1928 * in stop_event_scheduling() ··· 1999 2000 * across HT threads 2000 2001 */ 2001 2002 is_excl = c->flags & PERF_X86_EVENT_EXCL; 2003 + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { 2004 + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; 2005 + if (!cpuc->n_excl++) 2006 + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); 2007 + } 2002 2008 2003 2009 /* 2004 2010 * xl = state of current HT ··· 2011 2007 */ 2012 2008 xl = &excl_cntrs->states[tid]; 2013 2009 xlo = &excl_cntrs->states[o_tid]; 2014 - 2015 - /* 2016 - * do not allow scheduling of more than max_alloc_cntrs 2017 - * which is set to half the available generic counters. 2018 - * this helps avoid counter starvation of sibling thread 2019 - * by ensuring at most half the counters cannot be in 2020 - * exclusive mode. There is not designated counters for the 2021 - * limits. Any N/2 counters can be used. This helps with 2022 - * events with specifix counter constraints 2023 - */ 2024 - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) 2025 - return &emptyconstraint; 2026 2010 2027 2011 cx = c; 2028 2012 ··· 2098 2106 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, 2099 2107 struct perf_event *event) 2100 2108 { 2101 - struct event_constraint *c1 = event->hw.constraint; 2109 + struct event_constraint *c1 = cpuc->event_constraint[idx]; 2102 2110 struct event_constraint *c2; 2103 2111 2104 2112 /* ··· 2142 2150 2143 2151 xl = &excl_cntrs->states[tid]; 2144 2152 xlo = &excl_cntrs->states[o_tid]; 2153 + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { 2154 + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; 2155 + if (!--cpuc->n_excl) 2156 + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); 2157 + } 2145 2158 2146 2159 /* 2147 2160 * put_constraint may be called from x86_schedule_events() ··· 2185 2188 static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 2186 2189 struct perf_event *event) 2187 2190 { 2188 - struct event_constraint *c = event->hw.constraint; 2189 - 2190 2191 intel_put_shared_regs_event_constraints(cpuc, event); 2191 2192 2192 2193 /* ··· 2192 2197 * all events are subject to and must call the 2193 2198 * put_excl_constraints() routine 2194 2199 */ 2195 - if (c && cpuc->excl_cntrs) 2200 + if (cpuc->excl_cntrs) 2196 2201 intel_put_excl_constraints(cpuc, event); 2197 - 2198 - /* cleanup dynamic constraint */ 2199 - if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) 2200 - event->hw.constraint = NULL; 2201 2202 } 2202 2203 2203 - static void intel_commit_scheduling(struct cpu_hw_events *cpuc, 2204 - struct perf_event *event, int cntr) 2204 + static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) 2205 2205 { 2206 2206 struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; 2207 - struct event_constraint *c = event->hw.constraint; 2207 + struct event_constraint *c = cpuc->event_constraint[idx]; 2208 2208 struct intel_excl_states *xlo, *xl; 2209 2209 int tid = cpuc->excl_thread_id; 2210 2210 int o_tid = 1 - tid; ··· 2629 2639 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; 2630 2640 2631 2641 if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { 2632 - int h = x86_pmu.num_counters >> 1; 2633 - 2634 2642 for_each_cpu(i, topology_thread_cpumask(cpu)) { 2635 2643 struct intel_excl_cntrs *c; 2636 2644 ··· 2642 2654 } 2643 2655 cpuc->excl_cntrs->core_id = core_id; 2644 2656 cpuc->excl_cntrs->refcnt++; 2645 - /* 2646 - * set hard limit to half the number of generic counters 2647 - */ 2648 - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; 2649 - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; 2650 2657 } 2651 2658 } 2652 2659
+2 -2
arch/x86/kernel/cpu/perf_event_intel_ds.c
··· 706 706 707 707 cpuc->pebs_enabled &= ~(1ULL << hwc->idx); 708 708 709 - if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) 709 + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) 710 710 cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); 711 - else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) 711 + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) 712 712 cpuc->pebs_enabled &= ~(1ULL << 63); 713 713 714 714 if (cpuc->enabled)
+23 -13
arch/x86/kernel/cpu/perf_event_intel_pt.c
··· 151 151 152 152 de_attr->attr.attr.name = pt_caps[i].name; 153 153 154 - sysfs_attr_init(&de_attrs->attr.attr); 154 + sysfs_attr_init(&de_attr->attr.attr); 155 155 156 156 de_attr->attr.attr.mode = S_IRUGO; 157 157 de_attr->attr.show = pt_cap_show; ··· 615 615 struct perf_output_handle *handle) 616 616 617 617 { 618 - unsigned long idx, npages, end; 618 + unsigned long head = local64_read(&buf->head); 619 + unsigned long idx, npages, wakeup; 619 620 620 621 if (buf->snapshot) 621 622 return 0; ··· 635 634 buf->topa_index[buf->stop_pos]->stop = 0; 636 635 buf->topa_index[buf->intr_pos]->intr = 0; 637 636 638 - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { 639 - npages = (handle->size + 1) >> PAGE_SHIFT; 640 - end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; 641 - /*if (end > handle->wakeup >> PAGE_SHIFT) 642 - end = handle->wakeup >> PAGE_SHIFT;*/ 643 - idx = end & (buf->nr_pages - 1); 644 - buf->stop_pos = idx; 645 - idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; 646 - idx &= buf->nr_pages - 1; 647 - buf->intr_pos = idx; 648 - } 637 + /* how many pages till the STOP marker */ 638 + npages = handle->size >> PAGE_SHIFT; 639 + 640 + /* if it's on a page boundary, fill up one more page */ 641 + if (!offset_in_page(head + handle->size + 1)) 642 + npages++; 643 + 644 + idx = (head >> PAGE_SHIFT) + npages; 645 + idx &= buf->nr_pages - 1; 646 + buf->stop_pos = idx; 647 + 648 + wakeup = handle->wakeup >> PAGE_SHIFT; 649 + 650 + /* in the worst case, wake up the consumer one page before hard stop */ 651 + idx = (head >> PAGE_SHIFT) + npages - 1; 652 + if (idx > wakeup) 653 + idx = wakeup; 654 + 655 + idx &= buf->nr_pages - 1; 656 + buf->intr_pos = idx; 649 657 650 658 buf->topa_index[buf->stop_pos]->stop = 1; 651 659 buf->topa_index[buf->intr_pos]->intr = 1;
+4 -5
arch/x86/kernel/cpu/perf_event_intel_uncore.c
··· 365 365 bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); 366 366 367 367 for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { 368 - hwc = &box->event_list[i]->hw; 369 368 c = uncore_get_event_constraint(box, box->event_list[i]); 370 - hwc->constraint = c; 369 + box->event_constraint[i] = c; 371 370 wmin = min(wmin, c->weight); 372 371 wmax = max(wmax, c->weight); 373 372 } ··· 374 375 /* fastpath, try to reuse previous register */ 375 376 for (i = 0; i < n; i++) { 376 377 hwc = &box->event_list[i]->hw; 377 - c = hwc->constraint; 378 + c = box->event_constraint[i]; 378 379 379 380 /* never assigned */ 380 381 if (hwc->idx == -1) ··· 394 395 } 395 396 /* slow path */ 396 397 if (i != n) 397 - ret = perf_assign_events(box->event_list, n, 398 - wmin, wmax, assign); 398 + ret = perf_assign_events(box->event_constraint, n, 399 + wmin, wmax, n, assign); 399 400 400 401 if (!assign || ret) { 401 402 for (i = 0; i < n; i++)
+1
arch/x86/kernel/cpu/perf_event_intel_uncore.h
··· 97 97 atomic_t refcnt; 98 98 struct perf_event *events[UNCORE_PMC_IDX_MAX]; 99 99 struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; 100 + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; 100 101 unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; 101 102 u64 tags[UNCORE_PMC_IDX_MAX]; 102 103 struct pci_dev *pci_dev;
-4
include/linux/perf_event.h
··· 92 92 int idx; /* index in shared_regs->regs[] */ 93 93 }; 94 94 95 - struct event_constraint; 96 - 97 95 /** 98 96 * struct hw_perf_event - performance event hardware details: 99 97 */ ··· 110 112 111 113 struct hw_perf_event_extra extra_reg; 112 114 struct hw_perf_event_extra branch_reg; 113 - 114 - struct event_constraint *constraint; 115 115 }; 116 116 struct { /* software */ 117 117 struct hrtimer hrtimer;
+2 -1
kernel/events/core.c
··· 3442 3442 if (event->ns) 3443 3443 put_pid_ns(event->ns); 3444 3444 perf_event_free_filter(event); 3445 - perf_event_free_bpf_prog(event); 3446 3445 kfree(event); 3447 3446 } 3448 3447 ··· 3571 3572 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3572 3573 put_callchain_buffers(); 3573 3574 } 3575 + 3576 + perf_event_free_bpf_prog(event); 3574 3577 3575 3578 if (event->destroy) 3576 3579 event->destroy(event);
+14
kernel/events/ring_buffer.c
··· 493 493 rb->aux_pages[rb->aux_nr_pages] = page_address(page++); 494 494 } 495 495 496 + /* 497 + * In overwrite mode, PMUs that don't support SG may not handle more 498 + * than one contiguous allocation, since they rely on PMI to do double 499 + * buffering. In this case, the entire buffer has to be one contiguous 500 + * chunk. 501 + */ 502 + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && 503 + overwrite) { 504 + struct page *page = virt_to_page(rb->aux_pages[0]); 505 + 506 + if (page_private(page) != max_order) 507 + goto out; 508 + } 509 + 496 510 rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, 497 511 overwrite); 498 512 if (!rb->aux_priv)