Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Borislav Petkov:

- Avoid a crash on a heterogeneous machine where not all cores support
the same hw event features

- Avoid a deadlock when throttling events (see the userspace sketch after
the commit list below)

- Document the perf event states more

- Make sure a number of perf paths switching off or rescheduling events
call perf_cgroup_event_disable()

- Make sure perf does task sampling before its userspace mapping is
torn down, and not after

* tag 'perf_urgent_for_v6.16_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/intel: Fix crash in icl_update_topdown_event()
perf: Fix the throttle error of some clock events
perf: Add comment to enum perf_event_state
perf/core: Fix WARN in perf_cgroup_switch()
perf: Fix dangling cgroup pointer in cpuctx
perf: Fix cgroup state vs ERROR
perf: Fix sample vs do_exit()
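
The throttle-deadlock fix in this pull (and the HRTIMER_NORESTART comment added in kernel/events/core.c below) comes down to one rule: a timer callback must not synchronously cancel its own timer, because hrtimer_cancel() waits for the running handler to finish; the handler should instead decline to re-arm. The following userspace sketch is only an analogue of that rule under pthreads, not kernel code, and the toy_timer names are invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct toy_timer {
        pthread_t thread;
        _Atomic bool stop;
        bool (*fn)(struct toy_timer *);   /* return false == "do not re-arm" */
        int fired;
};

static void *toy_timer_loop(void *arg)
{
        struct toy_timer *t = arg;

        while (!t->stop) {
                usleep(100 * 1000);       /* 100 ms period */
                if (!t->fn(t))            /* NORESTART-style self stop */
                        break;
        }
        return NULL;
}

static void toy_timer_start(struct toy_timer *t)
{
        pthread_create(&t->thread, NULL, toy_timer_loop, t);
}

/*
 * Synchronous cancel, in the spirit of hrtimer_cancel(): waits until the
 * handler thread is done.  Calling this from inside the callback would mean
 * waiting for the very thread we are running on, which can never complete;
 * that is the shape of the deadlock the throttle fix avoids.
 */
static void toy_timer_cancel_sync(struct toy_timer *t)
{
        t->stop = true;
        pthread_join(t->thread, NULL);
}

static bool tick(struct toy_timer *t)
{
        printf("tick %d\n", ++t->fired);
        /*
         * If this callback wanted to stop the timer (say, on throttling),
         * the safe move is "return false"; it must not call
         * toy_timer_cancel_sync(t) on itself.
         */
        return true;
}

int main(void)
{
        struct toy_timer t = { .fn = tick };

        toy_timer_start(&t);
        usleep(350 * 1000);
        toy_timer_cancel_sync(&t);        /* safe: we are not the callback */
        return 0;
}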

Diffstat: 4 files changed, 124 insertions(+), 53 deletions(-)

arch/x86/events/intel/core.c  +1 -1

···
 	 * If the PEBS counters snapshotting is enabled,
 	 * the topdown event is available in PEBS records.
 	 */
-	if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
+	if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
 		static_call(intel_pmu_update_topdown_event)(event, NULL);
 	else
 		intel_pmu_drain_pebs_buffer();
include/linux/perf_event.h  +40 -2

···
 	unsigned long			size;
 };
 
-/**
- * enum perf_event_state - the states of an event:
+/*
+ * The normal states are:
+ *
+ *            ACTIVE    --.
+ *               ^        |
+ *               |        |
+ *      sched_{in,out}()  |
+ *               |        |
+ *               v        |
+ *     ,---> INACTIVE   --+   <-.
+ *     |                  |     |
+ *     |    {dis,en}able()
+ *  sched_in()            |     |
+ *     |          OFF  <--'   --+
+ *     |                        |
+ *     `---> ERROR  -----------'
+ *
+ * That is:
+ *
+ * sched_in:  INACTIVE          -> {ACTIVE,ERROR}
+ * sched_out: ACTIVE            -> INACTIVE
+ * disable:   {ACTIVE,INACTIVE} -> OFF
+ * enable:    {OFF,ERROR}       -> INACTIVE
+ *
+ * Where {OFF,ERROR} are disabled states.
+ *
+ * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
+ * defunct events:
+ *
+ * - EXIT means task that the event was assigned to died, but child events
+ *   still live, and further children can still be created. But the event
+ *   itself will never be active again. It can only transition to
+ *   {REVOKED,DEAD};
+ *
+ * - REVOKED means the PMU the event was associated with is gone; all
+ *   functionality is stopped but the event is still alive. Can only
+ *   transition to DEAD;
+ *
+ * - DEAD event really is DYING tearing down state and freeing bits.
+ *
  */
 enum perf_event_state {
 	PERF_EVENT_STATE_DEAD		= -5,
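
The transition table in that comment maps cleanly onto a small state machine. The sketch below is purely illustrative (the toy_* names and the bare enum are invented for this example and are not kernel code); it only encodes the four documented transitions:

#include <stdbool.h>
#include <stdio.h>

enum toy_state { TOY_ERROR, TOY_OFF, TOY_INACTIVE, TOY_ACTIVE };

/* sched_in: INACTIVE -> {ACTIVE,ERROR}; pmu_ok models whether the PMU accepts the event */
static enum toy_state toy_sched_in(enum toy_state s, bool pmu_ok)
{
        if (s != TOY_INACTIVE)
                return s;
        return pmu_ok ? TOY_ACTIVE : TOY_ERROR;
}

/* sched_out: ACTIVE -> INACTIVE */
static enum toy_state toy_sched_out(enum toy_state s)
{
        return s == TOY_ACTIVE ? TOY_INACTIVE : s;
}

/* disable: {ACTIVE,INACTIVE} -> OFF */
static enum toy_state toy_disable(enum toy_state s)
{
        return (s == TOY_ACTIVE || s == TOY_INACTIVE) ? TOY_OFF : s;
}

/* enable: {OFF,ERROR} -> INACTIVE (never straight back to ACTIVE) */
static enum toy_state toy_enable(enum toy_state s)
{
        return (s == TOY_OFF || s == TOY_ERROR) ? TOY_INACTIVE : s;
}

int main(void)
{
        enum toy_state s = TOY_INACTIVE;

        s = toy_sched_in(s, false);     /* PMU refuses: INACTIVE -> ERROR */
        s = toy_enable(s);              /* ERROR -> INACTIVE              */
        s = toy_sched_in(s, true);      /* INACTIVE -> ACTIVE             */
        s = toy_sched_out(s);           /* ACTIVE -> INACTIVE             */
        s = toy_disable(s);             /* INACTIVE -> OFF                */
        printf("final state: %d\n", s); /* prints 1 (TOY_OFF)             */
        return 0;
}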
kernel/events/core.c  +74 -42

···
 	__perf_ctx_unlock(&cpuctx->ctx);
 }
 
+typedef struct {
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+				struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
 #define TASK_TOMBSTONE ((void *)-1L)
 
 static bool is_kernel_event(struct perf_event *event)
···
 	if (READ_ONCE(cpuctx->cgrp) == cgrp)
 		return;
 
-	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+	guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+	/*
+	 * Re-check, could've raced vs perf_remove_from_context().
+	 */
+	if (READ_ONCE(cpuctx->cgrp) == NULL)
+		return;
+
 	perf_ctx_disable(&cpuctx->ctx, true);
 
 	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
···
 	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
 	perf_ctx_enable(&cpuctx->ctx, true);
-	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 static int perf_cgroup_ensure_storage(struct perf_event *event,
···
 	if (event->group_leader == event)
 		del_event_from_groups(event, ctx);
 
-	/*
-	 * If event was in error state, then keep it
-	 * that way, otherwise bogus counts will be
-	 * returned on read(). The only way to get out
-	 * of error state is by explicit re-enabling
-	 * of the event
-	 */
-	if (event->state > PERF_EVENT_STATE_OFF) {
-		perf_cgroup_event_disable(event, ctx);
-		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
-	}
-
 	ctx->generation++;
 	event->pmu_ctx->nr_events--;
 }
···
 }
 
 static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
-			    struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+			    struct perf_event_context *ctx,
+			    enum perf_event_state state);
 
 static void perf_put_aux_event(struct perf_event *event)
 {
···
 	 * state so that we don't try to schedule it again. Note
 	 * that perf_event_enable() will clear the ERROR status.
 	 */
-		event_sched_out(iter, ctx);
-		perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+		__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
 	}
 }
···
 		&event->pmu_ctx->flexible_active;
 }
 
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
-	event_sched_out(event, event->ctx);
-	perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
 static void perf_group_detach(struct perf_event *event)
 {
 	struct perf_event *leader = event->group_leader;
···
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 
+		/*
+		 * Events that have PERF_EV_CAP_SIBLING require being part of
+		 * a group and cannot exist on their own, schedule them out
+		 * and move them into the ERROR state. Also see
+		 * _perf_event_enable(), it will not be able to recover this
+		 * ERROR state.
+		 */
 		if (sibling->event_caps & PERF_EV_CAP_SIBLING)
-			perf_remove_sibling_event(sibling);
+			__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
 
 		sibling->group_leader = sibling;
 		list_del_init(&sibling->sibling_list);
···
 		state = PERF_EVENT_STATE_EXIT;
 	if (flags & DETACH_REVOKE)
 		state = PERF_EVENT_STATE_REVOKED;
-	if (flags & DETACH_DEAD) {
-		event->pending_disable = 1;
+	if (flags & DETACH_DEAD)
 		state = PERF_EVENT_STATE_DEAD;
-	}
+
 	event_sched_out(event, ctx);
+
+	if (event->state > PERF_EVENT_STATE_OFF)
+		perf_cgroup_event_disable(event, ctx);
+
 	perf_event_set_state(event, min(event->state, state));
 
 	if (flags & DETACH_GROUP)
···
 	event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
+static void __event_disable(struct perf_event *event,
+			    struct perf_event_context *ctx,
+			    enum perf_event_state state)
+{
+	event_sched_out(event, ctx);
+	perf_cgroup_event_disable(event, ctx);
+	perf_event_set_state(event, state);
+}
+
 /*
  * Cross CPU call to disable a performance event
  */
···
 	perf_pmu_disable(event->pmu_ctx->pmu);
 	ctx_time_update_event(ctx, event);
 
+	/*
+	 * When disabling a group leader, the whole group becomes ineligible
+	 * to run, so schedule out the full group.
+	 */
 	if (event == event->group_leader)
 		group_sched_out(event, ctx);
-	else
-		event_sched_out(event, ctx);
 
-	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
-	perf_cgroup_event_disable(event, ctx);
+	/*
+	 * But only mark the leader OFF; the siblings will remain
+	 * INACTIVE.
+	 */
+	__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
 
 	perf_pmu_enable(event->pmu_ctx->pmu);
 }
···
 
 static void perf_event_throttle(struct perf_event *event)
 {
-	event->pmu->stop(event, 0);
 	event->hw.interrupts = MAX_INTERRUPTS;
+	event->pmu->stop(event, 0);
 	if (event == event->group_leader)
 		perf_log_throttle(event, 0);
 }
···
 	if (!regs)
 		return 0;
 
+	/* No mm, no stack, no dump. */
+	if (!current->mm)
+		return 0;
+
 	/*
 	 * Check if we fit in with the requested stack size into the:
 	 * - TASK_SIZE
···
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	const u32 max_stack = event->attr.sample_max_stack;
 	struct perf_callchain_entry *callchain;
+
+	if (!current->mm)
+		user = false;
 
 	if (!kernel && !user)
 		return &__empty_callchain;
···
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (is_sampling_event(event)) {
+	/*
+	 * The throttle can be triggered in the hrtimer handler.
+	 * The HRTIMER_NORESTART should be used to stop the timer,
+	 * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+	 */
+	if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 
···
 static void cpu_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
-	cpu_clock_event_update(event);
+	if (flags & PERF_EF_UPDATE)
+		cpu_clock_event_update(event);
 }
 
 static int cpu_clock_event_add(struct perf_event *event, int flags)
···
 static void task_clock_event_stop(struct perf_event *event, int flags)
 {
 	perf_swevent_cancel_hrtimer(event);
-	task_clock_event_update(event, event->ctx->time);
+	if (flags & PERF_EF_UPDATE)
+		task_clock_event_update(event, event->ctx->time);
 }
 
 static int task_clock_event_add(struct perf_event *event, int flags)
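
The class_perf_ctx_lock_t constructor/destructor pair and the guard(perf_ctx_lock)(...) call in the perf_cgroup_switch() hunk above rely on the kernel's scope-based cleanup helpers (include/linux/cleanup.h), which are built on the compiler's cleanup attribute: the destructor runs automatically when the guard variable leaves scope, so the new early return after the cgrp re-check cannot leak the context lock. Below is a rough userspace sketch of that idea, assuming GCC or Clang and a plain pthread mutex; the mutex_guard_* names and the scoped_mutex_guard() macro are invented for the example:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

typedef struct { pthread_mutex_t *lock; } mutex_guard_t;

/* Destructor: the compiler calls this when the guard goes out of scope. */
static inline void mutex_guard_release(mutex_guard_t *g)
{
        pthread_mutex_unlock(g->lock);
}

/* Constructor: take the lock and remember it for the destructor. */
static inline mutex_guard_t mutex_guard_acquire(pthread_mutex_t *lock)
{
        pthread_mutex_lock(lock);
        return (mutex_guard_t){ lock };
}

/* The cleanup attribute is the GCC/Clang extension the kernel's guard() builds on. */
#define scoped_mutex_guard(lockp)                                       \
        __attribute__((cleanup(mutex_guard_release), unused))           \
        mutex_guard_t __guard = mutex_guard_acquire(lockp)

static int update_under_lock(int cgrp_present)
{
        scoped_mutex_guard(&ctx_lock);

        /*
         * Early return, as in the re-check added to perf_cgroup_switch():
         * the unlock still runs because the guard goes out of scope.
         */
        if (!cgrp_present)
                return -1;

        printf("updating while holding ctx_lock\n");
        return 0;
}

int main(void)
{
        update_under_lock(0);
        return update_under_lock(1);
}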
kernel/exit.c  +9 -8

···
 	taskstats_exit(tsk, group_dead);
 	trace_sched_process_exit(tsk, group_dead);
 
+	/*
+	 * Since sampling can touch ->mm, make sure to stop everything before we
+	 * tear it down.
+	 *
+	 * Also flushes inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_event_exit_task(tsk);
+
 	exit_mm();
 
 	if (group_dead)
···
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread(tsk);
-
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 *
-	 * because of cgroup mode, must be called before cgroup_exit()
-	 */
-	perf_event_exit_task(tsk);
 
 	sched_autogroup_exit_task(tsk);
 	cgroup_exit(tsk);