sched/psi: Optimize psi_group_change() cpu_clock() usage

Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
pressure spikes from aggregation race") caused a regression for him on
a high context switch rate benchmark (schbench) due to the now
repeating cpu_clock() calls.

In particular the problem is that get_recent_times() will extrapolate
the current state to 'now'. But if an update uses a timestamp from
before the start of the update, it is possible to get two reads
with inconsistent results. It is effectively back-dating an update.

(note that this all hard-relies on the clock being synchronized across
CPUs -- if this is not the case, all bets are off).

Combine this problem with the fact that there are per-group-per-cpu
seqcounts, the commit in question pushed the clock read into the group
iteration, causing tree-depth cpu_clock() calls. On architectures
where cpu_clock() has appreciable overhead, this hurts.

Instead move to a per-cpu seqcount, which allows us to have a single
clock read for all group updates, increasing internal consistency and
lowering update overhead. This comes at the cost of a longer update
side (proportional to the tree depth) which can cause the read side to
retry more often.

Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>,
Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net

Peter Zijlstra 11 months ago 570c8efd 155213a2

+68 -59

2 changed files

expand all

include

linux

psi_types.h

kernel

sched

psi.c

+2 -4

include/linux/psi_types.h

··· 84 84 struct psi_group_cpu { 85 85 /* 1st cacheline updated by the scheduler */ 86 86 87 - /* Aggregator needs to know of concurrent changes */ 88 - seqcount_t seq ____cacheline_aligned_in_smp; 89 - 90 87 /* States of the tasks belonging to this group */ 91 - unsigned int tasks[NR_PSI_TASK_COUNTS]; 88 + unsigned int tasks[NR_PSI_TASK_COUNTS] 89 + ____cacheline_aligned_in_smp; 92 90 93 91 /* Aggregate pressure state derived from the tasks */ 94 92 u32 state_mask;

+66 -55

kernel/sched/psi.c

··· 176 176 .pcpu = &system_group_pcpu, 177 177 }; 178 178 179 + static DEFINE_PER_CPU(seqcount_t, psi_seq); 180 + 181 + static inline void psi_write_begin(int cpu) 182 + { 183 + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); 184 + } 185 + 186 + static inline void psi_write_end(int cpu) 187 + { 188 + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); 189 + } 190 + 191 + static inline u32 psi_read_begin(int cpu) 192 + { 193 + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); 194 + } 195 + 196 + static inline bool psi_read_retry(int cpu, u32 seq) 197 + { 198 + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); 199 + } 200 + 179 201 static void psi_avgs_work(struct work_struct *work); 180 202 181 203 static void poll_timer_fn(struct timer_list *t); ··· 208 186 209 187 group->enabled = true; 210 188 for_each_possible_cpu(cpu) 211 - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); 189 + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); 212 190 group->avg_last_update = sched_clock(); 213 191 group->avg_next_update = group->avg_last_update + psi_period; 214 192 mutex_init(&group->avgs_lock); ··· 288 266 289 267 /* Snapshot a coherent view of the CPU state */ 290 268 do { 291 - seq = read_seqcount_begin(&groupc->seq); 269 + seq = psi_read_begin(cpu); 292 270 now = cpu_clock(cpu); 293 271 memcpy(times, groupc->times, sizeof(groupc->times)); 294 272 state_mask = groupc->state_mask; 295 273 state_start = groupc->state_start; 296 274 if (cpu == current_cpu) 297 275 memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); 298 - } while (read_seqcount_retry(&groupc->seq, seq)); 276 + } while (psi_read_retry(cpu, seq)); 299 277 300 278 /* Calculate state time deltas against the previous snapshot */ 301 279 for (s = 0; s < NR_PSI_STATES; s++) { ··· 794 772 groupc->times[PSI_NONIDLE] += delta; 795 773 } 796 774 775 + #define for_each_group(iter, group) \ 776 + for (typeof(group) iter = group; iter; iter = iter->parent) 777 + 797 778 static void psi_group_change(struct psi_group *group, int cpu, 798 779 unsigned int clear, unsigned int set, 799 - bool wake_clock) 780 + u64 now, bool wake_clock) 800 781 { 801 782 struct psi_group_cpu *groupc; 802 783 unsigned int t, m; 803 784 u32 state_mask; 804 - u64 now; 805 785 806 786 lockdep_assert_rq_held(cpu_rq(cpu)); 807 787 groupc = per_cpu_ptr(group->pcpu, cpu); 808 - 809 - /* 810 - * First we update the task counts according to the state 811 - * change requested through the @clear and @set bits. 812 - * 813 - * Then if the cgroup PSI stats accounting enabled, we 814 - * assess the aggregate resource states this CPU's tasks 815 - * have been in since the last change, and account any 816 - * SOME and FULL time these may have resulted in. 817 - */ 818 - write_seqcount_begin(&groupc->seq); 819 - now = cpu_clock(cpu); 820 788 821 789 /* 822 790 * Start with TSK_ONCPU, which doesn't have a corresponding ··· 859 847 860 848 groupc->state_mask = state_mask; 861 849 862 - write_seqcount_end(&groupc->seq); 863 850 return; 864 851 } 865 852 ··· 878 867 record_times(groupc, now); 879 868 880 869 groupc->state_mask = state_mask; 881 - 882 - write_seqcount_end(&groupc->seq); 883 870 884 871 if (state_mask & group->rtpoll_states) 885 872 psi_schedule_rtpoll_work(group, 1, false); ··· 913 904 void psi_task_change(struct task_struct *task, int clear, int set) 914 905 { 915 906 int cpu = task_cpu(task); 916 - struct psi_group *group; 907 + u64 now; 917 908 918 909 if (!task->pid) 919 910 return; 920 911 921 912 psi_flags_change(task, clear, set); 922 913 923 - group = task_psi_group(task); 924 - do { 925 - psi_group_change(group, cpu, clear, set, true); 926 - } while ((group = group->parent)); 914 + psi_write_begin(cpu); 915 + now = cpu_clock(cpu); 916 + for_each_group(group, task_psi_group(task)) 917 + psi_group_change(group, cpu, clear, set, now, true); 918 + psi_write_end(cpu); 927 919 } 928 920 929 921 void psi_task_switch(struct task_struct *prev, struct task_struct *next, 930 922 bool sleep) 931 923 { 932 - struct psi_group *group, *common = NULL; 924 + struct psi_group *common = NULL; 933 925 int cpu = task_cpu(prev); 926 + u64 now; 927 + 928 + psi_write_begin(cpu); 929 + now = cpu_clock(cpu); 934 930 935 931 if (next->pid) { 936 932 psi_flags_change(next, 0, TSK_ONCPU); ··· 944 930 * ancestors with @prev, those will already have @prev's 945 931 * TSK_ONCPU bit set, and we can stop the iteration there. 946 932 */ 947 - group = task_psi_group(next); 948 - do { 949 - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & 950 - PSI_ONCPU) { 933 + for_each_group(group, task_psi_group(next)) { 934 + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); 935 + 936 + if (groupc->state_mask & PSI_ONCPU) { 951 937 common = group; 952 938 break; 953 939 } 954 - 955 - psi_group_change(group, cpu, 0, TSK_ONCPU, true); 956 - } while ((group = group->parent)); 940 + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); 941 + } 957 942 } 958 943 959 944 if (prev->pid) { ··· 985 972 986 973 psi_flags_change(prev, clear, set); 987 974 988 - group = task_psi_group(prev); 989 - do { 975 + for_each_group(group, task_psi_group(prev)) { 990 976 if (group == common) 991 977 break; 992 - psi_group_change(group, cpu, clear, set, wake_clock); 993 - } while ((group = group->parent)); 978 + psi_group_change(group, cpu, clear, set, now, wake_clock); 979 + } 994 980 995 981 /* 996 982 * TSK_ONCPU is handled up to the common ancestor. If there are ··· 999 987 */ 1000 988 if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { 1001 989 clear &= ~TSK_ONCPU; 1002 - for (; group; group = group->parent) 1003 - psi_group_change(group, cpu, clear, set, wake_clock); 990 + for_each_group(group, common) 991 + psi_group_change(group, cpu, clear, set, now, wake_clock); 1004 992 } 1005 993 } 994 + psi_write_end(cpu); 1006 995 } 1007 996 1008 997 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1009 998 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) 1010 999 { 1011 1000 int cpu = task_cpu(curr); 1012 - struct psi_group *group; 1013 1001 struct psi_group_cpu *groupc; 1014 1002 s64 delta; 1015 1003 u64 irq; 1004 + u64 now; 1016 1005 1017 1006 if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) 1018 1007 return; ··· 1022 1009 return; 1023 1010 1024 1011 lockdep_assert_rq_held(rq); 1025 - group = task_psi_group(curr); 1026 - if (prev && task_psi_group(prev) == group) 1012 + if (prev && task_psi_group(prev) == task_psi_group(curr)) 1027 1013 return; 1028 1014 1029 1015 irq = irq_time_read(cpu); ··· 1031 1019 return; 1032 1020 rq->psi_irq_time = irq; 1033 1021 1034 - do { 1035 - u64 now; 1022 + psi_write_begin(cpu); 1023 + now = cpu_clock(cpu); 1036 1024 1025 + for_each_group(group, task_psi_group(curr)) { 1037 1026 if (!group->enabled) 1038 1027 continue; 1039 1028 1040 1029 groupc = per_cpu_ptr(group->pcpu, cpu); 1041 1030 1042 - write_seqcount_begin(&groupc->seq); 1043 - now = cpu_clock(cpu); 1044 - 1045 1031 record_times(groupc, now); 1046 1032 groupc->times[PSI_IRQ_FULL] += delta; 1047 1033 1048 - write_seqcount_end(&groupc->seq); 1049 - 1050 1034 if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) 1051 1035 psi_schedule_rtpoll_work(group, 1, false); 1052 - } while ((group = group->parent)); 1036 + } 1037 + psi_write_end(cpu); 1053 1038 } 1054 1039 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1055 1040 ··· 1234 1225 return; 1235 1226 1236 1227 for_each_possible_cpu(cpu) { 1237 - struct rq *rq = cpu_rq(cpu); 1238 - struct rq_flags rf; 1228 + u64 now; 1239 1229 1240 - rq_lock_irq(rq, &rf); 1241 - psi_group_change(group, cpu, 0, 0, true); 1242 - rq_unlock_irq(rq, &rf); 1230 + guard(rq_lock_irq)(cpu_rq(cpu)); 1231 + 1232 + psi_write_begin(cpu); 1233 + now = cpu_clock(cpu); 1234 + psi_group_change(group, cpu, 0, 0, now, true); 1235 + psi_write_end(cpu); 1243 1236 } 1244 1237 } 1245 1238 #endif /* CONFIG_CGROUPS */

Configure Feed

Configure Feed