Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-urgent-2025-02-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf event fixes from Ingo Molnar:
"Miscellaneous perf events fixes and a minor HW enablement change:

- Fix missing RCU protection in perf_iterate_ctx()

- Fix pmu_ctx_list ordering bug

- Reject the zero page in uprobes

- Fix a family of bugs related to low frequency sampling

- Add Intel Arrow Lake U CPUs to the generic Arrow Lake RAPL support
table

- Fix a lockdep-assert false positive in uretprobes"

* tag 'perf-urgent-2025-02-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
uprobes: Remove too strict lockdep_assert() condition in hprobe_expire()
perf/x86/rapl: Add support for Intel Arrow Lake U
perf/x86/intel: Use better start period for frequency mode
perf/core: Fix low freq setting via IOC_PERIOD
perf/x86: Fix low frequency setting issue
uprobes: Reject the shared zeropage in uprobe_write_opcode()
perf/core: Order the PMU list to fix warning about unordered pmu_ctx_list
perf/core: Add RCU read lock protection to perf_iterate_ctx()

+119 -15
+1 -1
arch/x86/events/core.c
··· 628 628 if (event->attr.type == event->pmu->type) 629 629 event->hw.config |= x86_pmu_get_event_config(event); 630 630 631 - if (event->attr.sample_period && x86_pmu.limit_period) { 631 + if (!event->attr.freq && x86_pmu.limit_period) { 632 632 s64 left = event->attr.sample_period; 633 633 x86_pmu.limit_period(event, &left); 634 634 if (left > event->attr.sample_period)
+85
arch/x86/events/intel/core.c
··· 3952 3952 return test_bit(idx, (unsigned long *)&intel_cap->capabilities); 3953 3953 } 3954 3954 3955 + static u64 intel_pmu_freq_start_period(struct perf_event *event) 3956 + { 3957 + int type = event->attr.type; 3958 + u64 config, factor; 3959 + s64 start; 3960 + 3961 + /* 3962 + * The 127 is the lowest possible recommended SAV (sample after value) 3963 + * for a 4000 freq (default freq), according to the event list JSON file. 3964 + * Also, assume the workload is idle 50% time. 3965 + */ 3966 + factor = 64 * 4000; 3967 + if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE) 3968 + goto end; 3969 + 3970 + /* 3971 + * The estimation of the start period in the freq mode is 3972 + * based on the below assumption. 3973 + * 3974 + * For a cycles or an instructions event, 1GHZ of the 3975 + * underlying platform, 1 IPC. The workload is idle 50% time. 3976 + * The start period = 1,000,000,000 * 1 / freq / 2. 3977 + * = 500,000,000 / freq 3978 + * 3979 + * Usually, the branch-related events occur less than the 3980 + * instructions event. According to the Intel event list JSON 3981 + * file, the SAV (sample after value) of a branch-related event 3982 + * is usually 1/4 of an instruction event. 3983 + * The start period of branch-related events = 125,000,000 / freq. 3984 + * 3985 + * The cache-related events occurs even less. The SAV is usually 3986 + * 1/20 of an instruction event. 3987 + * The start period of cache-related events = 25,000,000 / freq. 
3988 + */ 3989 + config = event->attr.config & PERF_HW_EVENT_MASK; 3990 + if (type == PERF_TYPE_HARDWARE) { 3991 + switch (config) { 3992 + case PERF_COUNT_HW_CPU_CYCLES: 3993 + case PERF_COUNT_HW_INSTRUCTIONS: 3994 + case PERF_COUNT_HW_BUS_CYCLES: 3995 + case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: 3996 + case PERF_COUNT_HW_STALLED_CYCLES_BACKEND: 3997 + case PERF_COUNT_HW_REF_CPU_CYCLES: 3998 + factor = 500000000; 3999 + break; 4000 + case PERF_COUNT_HW_BRANCH_INSTRUCTIONS: 4001 + case PERF_COUNT_HW_BRANCH_MISSES: 4002 + factor = 125000000; 4003 + break; 4004 + case PERF_COUNT_HW_CACHE_REFERENCES: 4005 + case PERF_COUNT_HW_CACHE_MISSES: 4006 + factor = 25000000; 4007 + break; 4008 + default: 4009 + goto end; 4010 + } 4011 + } 4012 + 4013 + if (type == PERF_TYPE_HW_CACHE) 4014 + factor = 25000000; 4015 + end: 4016 + /* 4017 + * Usually, a prime or a number with fewer factors (close to prime) 4018 + * is chosen as an SAV, which makes it less likely that the sampling 4019 + * period synchronizes with some periodic event in the workload. 4020 + * Minus 1 to make it at least avoid values near powers of two 4021 + * for the default freq. 
4022 + */ 4023 + start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1; 4024 + 4025 + if (start > x86_pmu.max_period) 4026 + start = x86_pmu.max_period; 4027 + 4028 + if (x86_pmu.limit_period) 4029 + x86_pmu.limit_period(event, &start); 4030 + 4031 + return start; 4032 + } 4033 + 3955 4034 static int intel_pmu_hw_config(struct perf_event *event) 3956 4035 { 3957 4036 int ret = x86_pmu_hw_config(event); ··· 4041 3962 ret = intel_pmu_bts_config(event); 4042 3963 if (ret) 4043 3964 return ret; 3965 + 3966 + if (event->attr.freq && event->attr.sample_freq) { 3967 + event->hw.sample_period = intel_pmu_freq_start_period(event); 3968 + event->hw.last_period = event->hw.sample_period; 3969 + local64_set(&event->hw.period_left, event->hw.sample_period); 3970 + } 4044 3971 4045 3972 if (event->attr.precise_ip) { 4046 3973 if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
+1
arch/x86/events/rapl.c
··· 879 879 X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), 880 880 X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), 881 881 X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), 882 + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl), 882 883 X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl), 883 884 {}, 884 885 };
+20 -11
kernel/events/core.c
··· 4950 4950 find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, 4951 4951 struct perf_event *event) 4952 4952 { 4953 - struct perf_event_pmu_context *new = NULL, *epc; 4953 + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; 4954 4954 void *task_ctx_data = NULL; 4955 4955 4956 4956 if (!ctx->task) { ··· 5007 5007 atomic_inc(&epc->refcount); 5008 5008 goto found_epc; 5009 5009 } 5010 + /* Make sure the pmu_ctx_list is sorted by PMU type: */ 5011 + if (!pos && epc->pmu->type > pmu->type) 5012 + pos = epc; 5010 5013 } 5011 5014 5012 5015 epc = new; 5013 5016 new = NULL; 5014 5017 5015 - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); 5018 + if (!pos) 5019 + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); 5020 + else 5021 + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); 5022 + 5016 5023 epc->ctx = ctx; 5017 5024 5018 5025 found_epc: ··· 5969 5962 if (!value) 5970 5963 return -EINVAL; 5971 5964 5972 - if (event->attr.freq && value > sysctl_perf_event_sample_rate) 5973 - return -EINVAL; 5974 - 5975 - if (perf_event_check_period(event, value)) 5976 - return -EINVAL; 5977 - 5978 - if (!event->attr.freq && (value & (1ULL << 63))) 5979 - return -EINVAL; 5965 + if (event->attr.freq) { 5966 + if (value > sysctl_perf_event_sample_rate) 5967 + return -EINVAL; 5968 + } else { 5969 + if (perf_event_check_period(event, value)) 5970 + return -EINVAL; 5971 + if (value & (1ULL << 63)) 5972 + return -EINVAL; 5973 + } 5980 5974 5981 5975 event_function_call(event, __perf_event_period, &value); 5982 5976 ··· 8329 8321 8330 8322 perf_event_enable_on_exec(ctx); 8331 8323 perf_event_remove_on_exec(ctx); 8332 - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); 8324 + scoped_guard(rcu) 8325 + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); 8333 8326 8334 8327 perf_unpin_context(ctx); 8335 8328 put_ctx(ctx);
+12 -3
kernel/events/uprobes.c
··· 495 495 if (ret <= 0) 496 496 goto put_old; 497 497 498 + if (is_zero_page(old_page)) { 499 + ret = -EINVAL; 500 + goto put_old; 501 + } 502 + 498 503 if (WARN(!is_register && PageCompound(old_page), 499 504 "uprobe unregister should never work on compound page\n")) { 500 505 ret = -EINVAL; ··· 767 762 enum hprobe_state hstate; 768 763 769 764 /* 770 - * return_instance's hprobe is protected by RCU. 771 - * Underlying uprobe is itself protected from reuse by SRCU. 765 + * Caller should guarantee that return_instance is not going to be 766 + * freed from under us. This can be achieved either through holding 767 + * rcu_read_lock() or by owning return_instance in the first place. 768 + * 769 + * Underlying uprobe is itself protected from reuse by SRCU, so ensure 770 + * SRCU lock is held properly. 772 771 */ 773 - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); 772 + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); 774 773 775 774 hstate = READ_ONCE(hprobe->state); 776 775 switch (hstate) {