Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Core & fair scheduler changes:

- Tweak wait_task_inactive() to force dequeue sched_delayed tasks
(John Stultz)

- Adhere to place_entity() constraints (Peter Zijlstra)

- Allow decaying util_est when util_avg > CPU capacity (Pierre
Gondois)

- Fix up wake_up_sync() vs DELAYED_DEQUEUE (Xuewen Yan)

Energy management:

- Introduce sched_update_asym_prefer_cpu() (K Prateek Nayak)

- cpufreq/amd-pstate: Update asym_prefer_cpu when core rankings
change (K Prateek Nayak)

- Align uclamp and util_est and call before freq update (Xuewen Yan)

CPU isolation:

- Make use of more than one housekeeping CPU (Phil Auld)

RT scheduler:

- Fix race in push_rt_task() (Harshit Agarwal)

- Add kernel cmdline option for rt_group_sched (Michal Koutný)

Scheduler topology support:

- Improve topology_span_sane speed (Steve Wahl)

Scheduler debugging:

- Move and extend the sched_process_exit() tracepoint (Andrii
Nakryiko)

- Add RT_GROUP WARN checks for non-root task_groups (Michal Koutný)

- Fix trace_sched_switch(.prev_state) (Peter Zijlstra)

- Untangle cond_resched() and live-patching (Peter Zijlstra)

Fixes and cleanups:

- Misc fixes and cleanups (K Prateek Nayak, Michal Koutný, Peter
Zijlstra, Xuewen Yan)"

* tag 'sched-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
sched/uclamp: Align uclamp and util_est and call before freq update
sched/util_est: Simplify condition for util_est_{en,de}queue()
sched/fair: Fixup wake_up_sync() vs DELAYED_DEQUEUE
sched,livepatch: Untangle cond_resched() and live-patching
sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks
sched/fair: Adhere to place_entity() constraints
sched/debug: Print the local group's asym_prefer_cpu
cpufreq/amd-pstate: Update asym_prefer_cpu when core rankings change
sched/topology: Introduce sched_update_asym_prefer_cpu()
sched/fair: Use READ_ONCE() to read sg->asym_prefer_cpu
sched/isolation: Make use of more than one housekeeping cpu
sched/rt: Fix race in push_rt_task
sched: Add annotations to RT_GROUP_SCHED fields
sched: Add RT_GROUP WARN checks for non-root task_groups
sched: Do not construct nor expose RT_GROUP_SCHED structures if disabled
sched: Bypass bandwidth checks with runtime disabled RT_GROUP_SCHED
sched: Skip non-root task_groups with disabled RT_GROUP_SCHED
sched: Add command line option for RT_GROUP_SCHED toggling
sched: Always initialize rt_rq's task_group
sched: Remove unneeded macro wrap
...

+378 -215
+5
Documentation/admin-guide/kernel-parameters.txt
··· 6320 6320 Memory area to be used by remote processor image, 6321 6321 managed by CMA. 6322 6322 6323 + rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling 6324 + when CONFIG_RT_GROUP_SCHED=y. Defaults to 6325 + !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED. 6326 + Format: <bool> 6327 + 6323 6328 rw [KNL] Mount root device read-write on boot 6324 6329 6325 6330 S [KNL] Run init in single mode
+3 -1
drivers/cpufreq/amd-pstate.c
··· 831 831 if (highest_perf_changed) { 832 832 WRITE_ONCE(cpudata->prefcore_ranking, cur_high); 833 833 834 - if (cur_high < CPPC_MAX_PERF) 834 + if (cur_high < CPPC_MAX_PERF) { 835 835 sched_set_itmt_core_prio((int)cur_high, cpu); 836 + sched_update_asym_prefer_cpu(cpu, prev_high, cur_high); 837 + } 836 838 } 837 839 } 838 840
+5 -9
include/linux/livepatch_sched.h
··· 3 3 #define _LINUX_LIVEPATCH_SCHED_H_ 4 4 5 5 #include <linux/jump_label.h> 6 - #include <linux/static_call_types.h> 6 + #include <linux/sched.h> 7 7 8 8 #ifdef CONFIG_LIVEPATCH 9 9 10 10 void __klp_sched_try_switch(void); 11 11 12 - #if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 13 - 14 12 DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); 15 13 16 - static __always_inline void klp_sched_try_switch(void) 14 + static __always_inline void klp_sched_try_switch(struct task_struct *curr) 17 15 { 18 - if (static_branch_unlikely(&klp_sched_try_switch_key)) 16 + if (static_branch_unlikely(&klp_sched_try_switch_key) && 17 + READ_ONCE(curr->__state) & TASK_FREEZABLE) 19 18 __klp_sched_try_switch(); 20 19 } 21 20 22 - #endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 23 - 24 21 #else /* !CONFIG_LIVEPATCH */ 25 - static inline void klp_sched_try_switch(void) {} 26 - static inline void __klp_sched_try_switch(void) {} 22 + static inline void klp_sched_try_switch(struct task_struct *curr) {} 27 23 #endif /* CONFIG_LIVEPATCH */ 28 24 29 25 #endif /* _LINUX_LIVEPATCH_SCHED_H_ */
-6
include/linux/sched.h
··· 44 44 #include <linux/seqlock_types.h> 45 45 #include <linux/kcsan.h> 46 46 #include <linux/rv.h> 47 - #include <linux/livepatch_sched.h> 48 47 #include <linux/uidgid_types.h> 49 48 #include <linux/tracepoint-defs.h> 50 49 #include <asm/kmap_size.h> ··· 2088 2089 2089 2090 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 2090 2091 2091 - void sched_dynamic_klp_enable(void); 2092 - void sched_dynamic_klp_disable(void); 2093 - 2094 2092 DECLARE_STATIC_CALL(cond_resched, __cond_resched); 2095 2093 2096 2094 static __always_inline int _cond_resched(void) ··· 2108 2112 2109 2113 static inline int _cond_resched(void) 2110 2114 { 2111 - klp_sched_try_switch(); 2112 2115 return __cond_resched(); 2113 2116 } 2114 2117 ··· 2117 2122 2118 2123 static inline int _cond_resched(void) 2119 2124 { 2120 - klp_sched_try_switch(); 2121 2125 return 0; 2122 2126 } 2123 2127
+6
include/linux/sched/topology.h
··· 195 195 }; 196 196 197 197 extern void __init set_sched_topology(struct sched_domain_topology_level *tl); 198 + extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); 199 + 198 200 199 201 # define SD_INIT_NAME(type) .name = #type 200 202 ··· 223 221 static inline bool cpus_share_resources(int this_cpu, int that_cpu) 224 222 { 225 223 return true; 224 + } 225 + 226 + static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) 227 + { 226 228 } 227 229 228 230 #endif /* !CONFIG_SMP */
+30 -4
include/trace/events/sched.h
··· 326 326 TP_ARGS(p)); 327 327 328 328 /* 329 - * Tracepoint for a task exiting: 329 + * Tracepoint for a task exiting. 330 + * Note, it's a superset of sched_process_template and should be kept 331 + * compatible as much as possible. sched_process_exits has an extra 332 + * `group_dead` argument, so sched_process_template can't be used, 333 + * unfortunately, just like sched_migrate_task above. 330 334 */ 331 - DEFINE_EVENT(sched_process_template, sched_process_exit, 332 - TP_PROTO(struct task_struct *p), 333 - TP_ARGS(p)); 335 + TRACE_EVENT(sched_process_exit, 336 + 337 + TP_PROTO(struct task_struct *p, bool group_dead), 338 + 339 + TP_ARGS(p, group_dead), 340 + 341 + TP_STRUCT__entry( 342 + __array( char, comm, TASK_COMM_LEN ) 343 + __field( pid_t, pid ) 344 + __field( int, prio ) 345 + __field( bool, group_dead ) 346 + ), 347 + 348 + TP_fast_assign( 349 + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); 350 + __entry->pid = p->pid; 351 + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ 352 + __entry->group_dead = group_dead; 353 + ), 354 + 355 + TP_printk("comm=%s pid=%d prio=%d group_dead=%s", 356 + __entry->comm, __entry->pid, __entry->prio, 357 + __entry->group_dead ? "true" : "false" 358 + ) 359 + ); 334 360 335 361 /* 336 362 * Tracepoint for waiting on task to unschedule:
+11
init/Kconfig
··· 1075 1075 realtime bandwidth for them. 1076 1076 See Documentation/scheduler/sched-rt-group.rst for more information. 1077 1077 1078 + config RT_GROUP_SCHED_DEFAULT_DISABLED 1079 + bool "Require boot parameter to enable group scheduling for SCHED_RR/FIFO" 1080 + depends on RT_GROUP_SCHED 1081 + default n 1082 + help 1083 + When set, the RT group scheduling is disabled by default. The option 1084 + is in inverted form so that mere RT_GROUP_SCHED enables the group 1085 + scheduling. 1086 + 1087 + Say N if unsure. 1088 + 1078 1089 config EXT_GROUP_SCHED 1079 1090 bool 1080 1091 depends on SCHED_CLASS_EXT && CGROUP_SCHED
+1 -1
kernel/exit.c
··· 942 942 943 943 tsk->exit_code = code; 944 944 taskstats_exit(tsk, group_dead); 945 + trace_sched_process_exit(tsk, group_dead); 945 946 946 947 exit_mm(); 947 948 948 949 if (group_dead) 949 950 acct_process(); 950 - trace_sched_process_exit(tsk); 951 951 952 952 exit_sem(tsk); 953 953 exit_shm(tsk);
+15 -36
kernel/livepatch/transition.c
··· 29 29 30 30 /* 31 31 * When a livepatch is in progress, enable klp stack checking in 32 - * cond_resched(). This helps CPU-bound kthreads get patched. 32 + * schedule(). This helps CPU-bound kthreads get patched. 33 33 */ 34 - #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 35 - 36 - #define klp_cond_resched_enable() sched_dynamic_klp_enable() 37 - #define klp_cond_resched_disable() sched_dynamic_klp_disable() 38 - 39 - #else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 40 34 41 35 DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); 42 - EXPORT_SYMBOL(klp_sched_try_switch_key); 43 36 44 - #define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) 45 - #define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) 46 - 47 - #endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 37 + #define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key) 38 + #define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key) 48 39 49 40 /* 50 41 * This work can be performed periodically to finish patching or unpatching any ··· 356 365 357 366 void __klp_sched_try_switch(void) 358 367 { 368 + /* 369 + * This function is called from __schedule() while a context switch is 370 + * about to happen. Preemption is already disabled and klp_mutex 371 + * can't be acquired. 372 + * Disabled preemption is used to prevent racing with other callers of 373 + * klp_try_switch_task(). Thanks to task_call_func() they won't be 374 + * able to switch to this task while it's running. 375 + */ 376 + lockdep_assert_preemption_disabled(); 377 + 359 378 if (likely(!klp_patch_pending(current))) 360 379 return; 361 - 362 - /* 363 - * This function is called from cond_resched() which is called in many 364 - * places throughout the kernel. Using the klp_mutex here might 365 - * deadlock. 
366 - * 367 - * Instead, disable preemption to prevent racing with other callers of 368 - * klp_try_switch_task(). Thanks to task_call_func() they won't be 369 - * able to switch this task while it's running. 370 - */ 371 - preempt_disable(); 372 - 373 - /* 374 - * Make sure current didn't get patched between the above check and 375 - * preempt_disable(). 376 - */ 377 - if (unlikely(!klp_patch_pending(current))) 378 - goto out; 379 380 380 381 /* 381 382 * Enforce the order of the TIF_PATCH_PENDING read above and the ··· 378 395 smp_rmb(); 379 396 380 397 klp_try_switch_task(current); 381 - 382 - out: 383 - preempt_enable(); 384 398 } 385 - EXPORT_SYMBOL(__klp_sched_try_switch); 386 399 387 400 /* 388 401 * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. ··· 487 508 } 488 509 489 510 /* Done! Now cleanup the data structures. */ 490 - klp_cond_resched_disable(); 511 + klp_resched_disable(); 491 512 patch = klp_transition_patch; 492 513 klp_complete_transition(); 493 514 ··· 539 560 set_tsk_thread_flag(task, TIF_PATCH_PENDING); 540 561 } 541 562 542 - klp_cond_resched_enable(); 563 + klp_resched_enable(); 543 564 544 565 klp_signals_cnt = 0; 545 566 }
+83 -65
kernel/sched/core.c
··· 66 66 #include <linux/vtime.h> 67 67 #include <linux/wait_api.h> 68 68 #include <linux/workqueue_api.h> 69 + #include <linux/livepatch_sched.h> 69 70 70 71 #ifdef CONFIG_PREEMPT_DYNAMIC 71 72 # ifdef CONFIG_GENERIC_ENTRY ··· 1753 1752 } 1754 1753 } 1755 1754 1756 - static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) 1755 + static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) 1757 1756 { 1758 1757 enum uclamp_id clamp_id; 1759 1758 ··· 1769 1768 if (unlikely(!p->sched_class->uclamp_enabled)) 1770 1769 return; 1771 1770 1772 - if (p->se.sched_delayed) 1771 + /* Only inc the delayed task which being woken up. */ 1772 + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) 1773 1773 return; 1774 1774 1775 1775 for_each_clamp_id(clamp_id) ··· 2038 2036 } 2039 2037 2040 2038 #else /* !CONFIG_UCLAMP_TASK */ 2041 - static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } 2039 + static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } 2042 2040 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } 2043 2041 static inline void uclamp_fork(struct task_struct *p) { } 2044 2042 static inline void uclamp_post_fork(struct task_struct *p) { } ··· 2074 2072 if (!(flags & ENQUEUE_NOCLOCK)) 2075 2073 update_rq_clock(rq); 2076 2074 2077 - p->sched_class->enqueue_task(rq, p, flags); 2078 2075 /* 2079 - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear 2080 - * ->sched_delayed. 2076 + * Can be before ->enqueue_task() because uclamp considers the 2077 + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared 2078 + * in ->enqueue_task(). 2081 2079 */ 2082 - uclamp_rq_inc(rq, p); 2080 + uclamp_rq_inc(rq, p, flags); 2081 + 2082 + p->sched_class->enqueue_task(rq, p, flags); 2083 2083 2084 2084 psi_enqueue(p, flags); 2085 2085 ··· 2287 2283 * just go back and repeat. 
2288 2284 */ 2289 2285 rq = task_rq_lock(p, &rf); 2286 + /* 2287 + * If task is sched_delayed, force dequeue it, to avoid always 2288 + * hitting the tick timeout in the queued case 2289 + */ 2290 + if (p->se.sched_delayed) 2291 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 2290 2292 trace_sched_wait_task(p); 2291 2293 running = task_on_cpu(rq, p); 2292 2294 queued = task_on_rq_queued(p); ··· 6581 6571 * Otherwise marks the task's __state as RUNNING 6582 6572 */ 6583 6573 static bool try_to_block_task(struct rq *rq, struct task_struct *p, 6584 - unsigned long task_state) 6574 + unsigned long *task_state_p) 6585 6575 { 6576 + unsigned long task_state = *task_state_p; 6586 6577 int flags = DEQUEUE_NOCLOCK; 6587 6578 6588 6579 if (signal_pending_state(task_state, p)) { 6589 6580 WRITE_ONCE(p->__state, TASK_RUNNING); 6581 + *task_state_p = TASK_RUNNING; 6590 6582 return false; 6591 6583 } 6592 6584 ··· 6680 6668 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) 6681 6669 hrtick_clear(rq); 6682 6670 6671 + klp_sched_try_switch(prev); 6672 + 6683 6673 local_irq_disable(); 6684 6674 rcu_note_context_switch(preempt); 6685 6675 ··· 6727 6713 goto picked; 6728 6714 } 6729 6715 } else if (!preempt && prev_state) { 6730 - try_to_block_task(rq, prev, prev_state); 6716 + try_to_block_task(rq, prev, &prev_state); 6731 6717 switch_count = &prev->nvcsw; 6732 6718 } 6733 6719 ··· 7342 7328 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); 7343 7329 int __sched dynamic_cond_resched(void) 7344 7330 { 7345 - klp_sched_try_switch(); 7346 7331 if (!static_branch_unlikely(&sk_dynamic_cond_resched)) 7347 7332 return 0; 7348 7333 return __cond_resched(); ··· 7513 7500 #endif 7514 7501 7515 7502 static DEFINE_MUTEX(sched_dynamic_mutex); 7516 - static bool klp_override; 7517 7503 7518 7504 static void __sched_dynamic_update(int mode) 7519 7505 { ··· 7520 7508 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in 7521 7509 * the ZERO state, which is invalid. 
7522 7510 */ 7523 - if (!klp_override) 7524 - preempt_dynamic_enable(cond_resched); 7511 + preempt_dynamic_enable(cond_resched); 7525 7512 preempt_dynamic_enable(might_resched); 7526 7513 preempt_dynamic_enable(preempt_schedule); 7527 7514 preempt_dynamic_enable(preempt_schedule_notrace); ··· 7529 7518 7530 7519 switch (mode) { 7531 7520 case preempt_dynamic_none: 7532 - if (!klp_override) 7533 - preempt_dynamic_enable(cond_resched); 7521 + preempt_dynamic_enable(cond_resched); 7534 7522 preempt_dynamic_disable(might_resched); 7535 7523 preempt_dynamic_disable(preempt_schedule); 7536 7524 preempt_dynamic_disable(preempt_schedule_notrace); ··· 7540 7530 break; 7541 7531 7542 7532 case preempt_dynamic_voluntary: 7543 - if (!klp_override) 7544 - preempt_dynamic_enable(cond_resched); 7533 + preempt_dynamic_enable(cond_resched); 7545 7534 preempt_dynamic_enable(might_resched); 7546 7535 preempt_dynamic_disable(preempt_schedule); 7547 7536 preempt_dynamic_disable(preempt_schedule_notrace); ··· 7551 7542 break; 7552 7543 7553 7544 case preempt_dynamic_full: 7554 - if (!klp_override) 7555 - preempt_dynamic_disable(cond_resched); 7545 + preempt_dynamic_disable(cond_resched); 7556 7546 preempt_dynamic_disable(might_resched); 7557 7547 preempt_dynamic_enable(preempt_schedule); 7558 7548 preempt_dynamic_enable(preempt_schedule_notrace); ··· 7562 7554 break; 7563 7555 7564 7556 case preempt_dynamic_lazy: 7565 - if (!klp_override) 7566 - preempt_dynamic_disable(cond_resched); 7557 + preempt_dynamic_disable(cond_resched); 7567 7558 preempt_dynamic_disable(might_resched); 7568 7559 preempt_dynamic_enable(preempt_schedule); 7569 7560 preempt_dynamic_enable(preempt_schedule_notrace); ··· 7582 7575 __sched_dynamic_update(mode); 7583 7576 mutex_unlock(&sched_dynamic_mutex); 7584 7577 } 7585 - 7586 - #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 7587 - 7588 - static int klp_cond_resched(void) 7589 - { 7590 - __klp_sched_try_switch(); 7591 - return __cond_resched(); 7592 - } 7593 - 7594 - 
void sched_dynamic_klp_enable(void) 7595 - { 7596 - mutex_lock(&sched_dynamic_mutex); 7597 - 7598 - klp_override = true; 7599 - static_call_update(cond_resched, klp_cond_resched); 7600 - 7601 - mutex_unlock(&sched_dynamic_mutex); 7602 - } 7603 - 7604 - void sched_dynamic_klp_disable(void) 7605 - { 7606 - mutex_lock(&sched_dynamic_mutex); 7607 - 7608 - klp_override = false; 7609 - __sched_dynamic_update(preempt_dynamic_mode); 7610 - 7611 - mutex_unlock(&sched_dynamic_mutex); 7612 - } 7613 - 7614 - #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ 7615 7578 7616 7579 static int __init setup_preempt_mode(char *str) 7617 7580 { ··· 8995 9018 unsigned long flags; 8996 9019 8997 9020 spin_lock_irqsave(&task_group_lock, flags); 8998 - list_add_rcu(&tg->list, &task_groups); 9021 + list_add_tail_rcu(&tg->list, &task_groups); 8999 9022 9000 9023 /* Root should already exist: */ 9001 9024 WARN_ON(!parent); ··· 9181 9204 struct task_struct *task; 9182 9205 struct cgroup_subsys_state *css; 9183 9206 9207 + if (!rt_group_sched_enabled()) 9208 + goto scx_check; 9209 + 9184 9210 cgroup_taskset_for_each(task, css, tset) { 9185 9211 if (!sched_rt_can_attach(css_tg(css), task)) 9186 9212 return -EINVAL; 9187 9213 } 9188 - #endif 9214 + scx_check: 9215 + #endif /* CONFIG_RT_GROUP_SCHED */ 9189 9216 return scx_cgroup_can_attach(tset); 9190 9217 } 9191 9218 ··· 9842 9861 .seq_show = cpu_cfs_local_stat_show, 9843 9862 }, 9844 9863 #endif 9845 - #ifdef CONFIG_RT_GROUP_SCHED 9846 - { 9847 - .name = "rt_runtime_us", 9848 - .read_s64 = cpu_rt_runtime_read, 9849 - .write_s64 = cpu_rt_runtime_write, 9850 - }, 9851 - { 9852 - .name = "rt_period_us", 9853 - .read_u64 = cpu_rt_period_read_uint, 9854 - .write_u64 = cpu_rt_period_write_uint, 9855 - }, 9856 - #endif 9857 9864 #ifdef CONFIG_UCLAMP_TASK_GROUP 9858 9865 { 9859 9866 .name = "uclamp.min", ··· 9858 9889 #endif 9859 9890 { } /* Terminate */ 9860 9891 }; 9892 + 9893 + #ifdef CONFIG_RT_GROUP_SCHED 9894 + static struct cftype rt_group_files[] = { 
9895 + { 9896 + .name = "rt_runtime_us", 9897 + .read_s64 = cpu_rt_runtime_read, 9898 + .write_s64 = cpu_rt_runtime_write, 9899 + }, 9900 + { 9901 + .name = "rt_period_us", 9902 + .read_u64 = cpu_rt_period_read_uint, 9903 + .write_u64 = cpu_rt_period_write_uint, 9904 + }, 9905 + { } /* Terminate */ 9906 + }; 9907 + 9908 + # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED 9909 + DEFINE_STATIC_KEY_FALSE(rt_group_sched); 9910 + # else 9911 + DEFINE_STATIC_KEY_TRUE(rt_group_sched); 9912 + # endif 9913 + 9914 + static int __init setup_rt_group_sched(char *str) 9915 + { 9916 + long val; 9917 + 9918 + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { 9919 + pr_warn("Unable to set rt_group_sched\n"); 9920 + return 1; 9921 + } 9922 + if (val) 9923 + static_branch_enable(&rt_group_sched); 9924 + else 9925 + static_branch_disable(&rt_group_sched); 9926 + 9927 + return 1; 9928 + } 9929 + __setup("rt_group_sched=", setup_rt_group_sched); 9930 + 9931 + static int __init cpu_rt_group_init(void) 9932 + { 9933 + if (!rt_group_sched_enabled()) 9934 + return 0; 9935 + 9936 + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); 9937 + return 0; 9938 + } 9939 + subsys_initcall(cpu_rt_group_init); 9940 + #endif /* CONFIG_RT_GROUP_SCHED */ 9861 9941 9862 9942 static int cpu_extra_stat_show(struct seq_file *sf, 9863 9943 struct cgroup_subsys_state *css)
+4
kernel/sched/debug.c
··· 588 588 debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); 589 589 debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); 590 590 debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); 591 + 592 + if (sd->flags & SD_ASYM_PACKING) 593 + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, 594 + (u32 *)&sd->groups->asym_prefer_cpu); 591 595 } 592 596 593 597 void update_sched_domain_debugfs(void)
+19 -14
kernel/sched/fair.c
··· 3795 3795 update_entity_lag(cfs_rq, se); 3796 3796 se->deadline -= se->vruntime; 3797 3797 se->rel_deadline = 1; 3798 + cfs_rq->nr_queued--; 3798 3799 if (!curr) 3799 3800 __dequeue_entity(cfs_rq, se); 3800 3801 update_load_sub(&cfs_rq->load, se->load.weight); ··· 3822 3821 3823 3822 enqueue_load_avg(cfs_rq, se); 3824 3823 if (se->on_rq) { 3825 - update_load_add(&cfs_rq->load, se->load.weight); 3826 3824 place_entity(cfs_rq, se, 0); 3825 + update_load_add(&cfs_rq->load, se->load.weight); 3827 3826 if (!curr) 3828 3827 __enqueue_entity(cfs_rq, se); 3828 + cfs_rq->nr_queued++; 3829 3829 3830 3830 /* 3831 3831 * The entity's vruntime has been adjusted, so let's check ··· 4933 4931 last_ewma_diff = ewma - dequeued; 4934 4932 if (last_ewma_diff < UTIL_EST_MARGIN) 4935 4933 goto done; 4936 - 4937 - /* 4938 - * To avoid overestimation of actual task utilization, skip updates if 4939 - * we cannot grant there is idle time in this CPU. 4940 - */ 4941 - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) 4942 - return; 4943 4934 4944 4935 /* 4945 4936 * To avoid underestimate of task utilization, skip updates of EWMA if ··· 6936 6941 * Let's add the task's estimated utilization to the cfs_rq's 6937 6942 * estimated utilization, before we update schedutil. 
6938 6943 */ 6939 - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) 6944 + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) 6940 6945 util_est_enqueue(&rq->cfs, p); 6941 6946 6942 6947 if (flags & ENQUEUE_DELAYED) { ··· 7176 7181 */ 7177 7182 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 7178 7183 { 7179 - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) 7184 + if (!p->se.sched_delayed) 7180 7185 util_est_dequeue(&rq->cfs, p); 7181 7186 7182 7187 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ··· 7189 7194 7190 7195 hrtick_update(rq); 7191 7196 return true; 7197 + } 7198 + 7199 + static inline unsigned int cfs_h_nr_delayed(struct rq *rq) 7200 + { 7201 + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); 7192 7202 } 7193 7203 7194 7204 #ifdef CONFIG_SMP ··· 7357 7357 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) 7358 7358 return available_idle_cpu(prev_cpu) ? 
prev_cpu : this_cpu; 7359 7359 7360 - if (sync && cpu_rq(this_cpu)->nr_running == 1) 7361 - return this_cpu; 7360 + if (sync) { 7361 + struct rq *rq = cpu_rq(this_cpu); 7362 + 7363 + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) 7364 + return this_cpu; 7365 + } 7362 7366 7363 7367 if (available_idle_cpu(prev_cpu)) 7364 7368 return prev_cpu; ··· 10260 10256 (sgs->group_weight - sgs->idle_cpus != 1)) 10261 10257 return false; 10262 10258 10263 - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); 10259 + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); 10264 10260 } 10265 10261 10266 10262 /* One group has more than one SMT CPU while the other group does not */ ··· 10497 10493 10498 10494 case group_asym_packing: 10499 10495 /* Prefer to move from lowest priority CPU's work */ 10500 - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); 10496 + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), 10497 + READ_ONCE(sg->asym_prefer_cpu)); 10501 10498 10502 10499 case group_misfit_task: 10503 10500 /*
+1 -1
kernel/sched/isolation.c
··· 40 40 if (cpu < nr_cpu_ids) 41 41 return cpu; 42 42 43 - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); 43 + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); 44 44 if (likely(cpu < nr_cpu_ids)) 45 45 return cpu; 46 46 /*
+59 -46
kernel/sched/rt.c
··· 89 89 rt_rq->rt_throttled = 0; 90 90 rt_rq->rt_runtime = 0; 91 91 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 92 + rt_rq->tg = &root_task_group; 92 93 #endif 93 94 } 94 95 ··· 176 175 177 176 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 178 177 { 178 + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ 179 + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); 179 180 return rt_rq->rq; 180 181 } 181 182 182 183 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 183 184 { 185 + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); 184 186 return rt_se->rt_rq; 185 187 } 186 188 ··· 191 187 { 192 188 struct rt_rq *rt_rq = rt_se->rt_rq; 193 189 190 + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); 194 191 return rt_rq->rq; 195 192 } 196 193 197 194 void unregister_rt_sched_group(struct task_group *tg) 198 195 { 196 + if (!rt_group_sched_enabled()) 197 + return; 198 + 199 199 if (tg->rt_se) 200 200 destroy_rt_bandwidth(&tg->rt_bandwidth); 201 201 } ··· 207 199 void free_rt_sched_group(struct task_group *tg) 208 200 { 209 201 int i; 202 + 203 + if (!rt_group_sched_enabled()) 204 + return; 210 205 211 206 for_each_possible_cpu(i) { 212 207 if (tg->rt_rq) ··· 254 243 struct rt_rq *rt_rq; 255 244 struct sched_rt_entity *rt_se; 256 245 int i; 246 + 247 + if (!rt_group_sched_enabled()) 248 + return 1; 257 249 258 250 tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); 259 251 if (!tg->rt_rq) ··· 496 482 497 483 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 498 484 { 499 - if (!rt_rq->tg) 500 - return RUNTIME_INF; 501 - 502 485 return rt_rq->rt_runtime; 503 486 } 504 487 ··· 508 497 509 498 static inline struct task_group *next_task_group(struct task_group *tg) 510 499 { 500 + if (!rt_group_sched_enabled()) { 501 + WARN_ON(tg != &root_task_group); 502 + return NULL; 503 + } 504 + 511 505 do { 512 506 tg = list_entry_rcu(tg->list.next, 513 507 
typeof(struct task_group), list); ··· 525 509 } 526 510 527 511 #define for_each_rt_rq(rt_rq, iter, rq) \ 528 - for (iter = container_of(&task_groups, typeof(*iter), list); \ 529 - (iter = next_task_group(iter)) && \ 530 - (rt_rq = iter->rt_rq[cpu_of(rq)]);) 512 + for (iter = &root_task_group; \ 513 + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 514 + iter = next_task_group(iter)) 531 515 532 516 #define for_each_sched_rt_entity(rt_se) \ 533 517 for (; rt_se; rt_se = rt_se->parent) ··· 1082 1066 { 1083 1067 struct rq *rq = rq_of_rt_rq(rt_rq); 1084 1068 1085 - #ifdef CONFIG_RT_GROUP_SCHED 1086 1069 /* 1087 1070 * Change rq's cpupri only if rt_rq is the top queue. 1088 1071 */ 1089 - if (&rq->rt != rt_rq) 1072 + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) 1090 1073 return; 1091 - #endif 1074 + 1092 1075 if (rq->online && prio < prev_prio) 1093 1076 cpupri_set(&rq->rd->cpupri, rq->cpu, prio); 1094 1077 } ··· 1097 1082 { 1098 1083 struct rq *rq = rq_of_rt_rq(rt_rq); 1099 1084 1100 - #ifdef CONFIG_RT_GROUP_SCHED 1101 1085 /* 1102 1086 * Change rq's cpupri only if rt_rq is the top queue. 1103 1087 */ 1104 - if (&rq->rt != rt_rq) 1088 + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) 1105 1089 return; 1106 - #endif 1090 + 1107 1091 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 1108 1092 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 1109 1093 } ··· 1170 1156 if (rt_se_boosted(rt_se)) 1171 1157 rt_rq->rt_nr_boosted++; 1172 1158 1173 - if (rt_rq->tg) 1174 - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 1159 + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 1175 1160 } 1176 1161 1177 1162 static void ··· 1270 1257 static inline struct sched_statistics * 1271 1258 __schedstats_from_rt_se(struct sched_rt_entity *rt_se) 1272 1259 { 1273 - #ifdef CONFIG_RT_GROUP_SCHED 1274 1260 /* schedstats is not supported for rt group. 
*/ 1275 1261 if (!rt_entity_is_task(rt_se)) 1276 1262 return NULL; 1277 - #endif 1278 1263 1279 1264 return &rt_task_of(rt_se)->stats; 1280 1265 } ··· 1894 1883 return -1; 1895 1884 } 1896 1885 1886 + static struct task_struct *pick_next_pushable_task(struct rq *rq) 1887 + { 1888 + struct task_struct *p; 1889 + 1890 + if (!has_pushable_tasks(rq)) 1891 + return NULL; 1892 + 1893 + p = plist_first_entry(&rq->rt.pushable_tasks, 1894 + struct task_struct, pushable_tasks); 1895 + 1896 + BUG_ON(rq->cpu != task_cpu(p)); 1897 + BUG_ON(task_current(rq, p)); 1898 + BUG_ON(task_current_donor(rq, p)); 1899 + BUG_ON(p->nr_cpus_allowed <= 1); 1900 + 1901 + BUG_ON(!task_on_rq_queued(p)); 1902 + BUG_ON(!rt_task(p)); 1903 + 1904 + return p; 1905 + } 1906 + 1897 1907 /* Will lock the rq it finds */ 1898 1908 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) 1899 1909 { ··· 1945 1913 /* 1946 1914 * We had to unlock the run queue. In 1947 1915 * the mean time, task could have 1948 - * migrated already or had its affinity changed. 1949 - * Also make sure that it wasn't scheduled on its rq. 1916 + * migrated already or had its affinity changed, 1917 + * therefore check if the task is still at the 1918 + * head of the pushable tasks list. 1950 1919 * It is possible the task was scheduled, set 1951 1920 * "migrate_disabled" and then got preempted, so we must 1952 1921 * check the task migration disable flag here too. 
1953 1922 */ 1954 - if (unlikely(task_rq(task) != rq || 1923 + if (unlikely(is_migration_disabled(task) || 1955 1924 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 1956 - task_on_cpu(rq, task) || 1957 - !rt_task(task) || 1958 - is_migration_disabled(task) || 1959 - !task_on_rq_queued(task))) { 1925 + task != pick_next_pushable_task(rq))) { 1960 1926 1961 1927 double_unlock_balance(rq, lowest_rq); 1962 1928 lowest_rq = NULL; ··· 1972 1942 } 1973 1943 1974 1944 return lowest_rq; 1975 - } 1976 - 1977 - static struct task_struct *pick_next_pushable_task(struct rq *rq) 1978 - { 1979 - struct task_struct *p; 1980 - 1981 - if (!has_pushable_tasks(rq)) 1982 - return NULL; 1983 - 1984 - p = plist_first_entry(&rq->rt.pushable_tasks, 1985 - struct task_struct, pushable_tasks); 1986 - 1987 - BUG_ON(rq->cpu != task_cpu(p)); 1988 - BUG_ON(task_current(rq, p)); 1989 - BUG_ON(task_current_donor(rq, p)); 1990 - BUG_ON(p->nr_cpus_allowed <= 1); 1991 - 1992 - BUG_ON(!task_on_rq_queued(p)); 1993 - BUG_ON(!rt_task(p)); 1994 - 1995 - return p; 1996 1945 } 1997 1946 1998 1947 /* ··· 2611 2602 { 2612 2603 struct rt_rq *rt_rq; 2613 2604 2614 - #ifdef CONFIG_RT_GROUP_SCHED 2605 + #ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq 2615 2606 rt_rq = task_group(p)->rt_rq[cpu]; 2607 + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); 2616 2608 #else 2617 2609 rt_rq = &cpu_rq(cpu)->rt; 2618 2610 #endif ··· 2721 2711 */ 2722 2712 if (rt_bandwidth_enabled() && !runtime && 2723 2713 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) 2714 + return -EBUSY; 2715 + 2716 + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) 2724 2717 return -EBUSY; 2725 2718 2726 2719 total = to_ratio(period, runtime); ··· 2881 2868 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 2882 2869 { 2883 2870 /* Don't accept real-time tasks when there is no way for them to run */ 2884 - if (rt_task(tsk) && 
tg->rt_bandwidth.rt_runtime == 0) 2871 + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 2885 2872 return 0; 2886 2873 2887 2874 return 1;
+30 -4
kernel/sched/sched.h
··· 813 813 814 814 #ifdef CONFIG_RT_GROUP_SCHED 815 815 int rt_throttled; 816 - u64 rt_time; 817 - u64 rt_runtime; 816 + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ 817 + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ 818 818 /* Nests inside the rq lock: */ 819 819 raw_spinlock_t rt_runtime_lock; 820 820 821 821 unsigned int rt_nr_boosted; 822 822 823 - struct rq *rq; 824 - struct task_group *tg; 823 + struct rq *rq; /* this is always top-level rq, cache? */ 824 + #endif 825 + #ifdef CONFIG_CGROUP_SCHED 826 + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ 825 827 #endif 826 828 }; 827 829 ··· 1500 1498 } 1501 1499 1502 1500 #endif /* !CONFIG_SCHED_CORE */ 1501 + #ifdef CONFIG_RT_GROUP_SCHED 1502 + # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED 1503 + DECLARE_STATIC_KEY_FALSE(rt_group_sched); 1504 + static inline bool rt_group_sched_enabled(void) 1505 + { 1506 + return static_branch_unlikely(&rt_group_sched); 1507 + } 1508 + # else 1509 + DECLARE_STATIC_KEY_TRUE(rt_group_sched); 1510 + static inline bool rt_group_sched_enabled(void) 1511 + { 1512 + return static_branch_likely(&rt_group_sched); 1513 + } 1514 + # endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ 1515 + #else 1516 + # define rt_group_sched_enabled() false 1517 + #endif /* CONFIG_RT_GROUP_SCHED */ 1503 1518 1504 1519 static inline void lockdep_assert_rq_held(struct rq *rq) 1505 1520 { ··· 2165 2146 #endif 2166 2147 2167 2148 #ifdef CONFIG_RT_GROUP_SCHED 2149 + /* 2150 + * p->rt.rt_rq is NULL initially and it is easier to assign 2151 + * root_task_group's rt_rq than switching in rt_rq_of_se() 2152 + * Clobbers tg(!) 2153 + */ 2154 + if (!rt_group_sched_enabled()) 2155 + tg = &root_task_group; 2168 2156 p->rt.rt_rq = tg->rt_rq[cpu]; 2169 2157 p->rt.parent = tg->rt_se[cpu]; 2170 2158 #endif
+3 -2
kernel/sched/syscalls.c
··· 634 634 * Do not allow real-time tasks into groups that have no runtime 635 635 * assigned. 636 636 */ 637 - if (rt_bandwidth_enabled() && rt_policy(policy) && 637 + if (rt_group_sched_enabled() && 638 + rt_bandwidth_enabled() && rt_policy(policy) && 638 639 task_group(p)->rt_bandwidth.rt_runtime == 0 && 639 640 !task_group_is_autogroup(task_group(p))) { 640 641 retval = -EPERM; 641 642 goto unlock; 642 643 } 643 - #endif 644 + #endif /* CONFIG_RT_GROUP_SCHED */ 644 645 #ifdef CONFIG_SMP 645 646 if (dl_bandwidth_enabled() && dl_policy(policy) && 646 647 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+103 -26
kernel/sched/topology.c
··· 1333 1333 update_group_capacity(sd, cpu); 1334 1334 } 1335 1335 1336 + #ifdef CONFIG_SMP 1337 + 1338 + /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ 1339 + void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) 1340 + { 1341 + int asym_prefer_cpu = cpu; 1342 + struct sched_domain *sd; 1343 + 1344 + guard(rcu)(); 1345 + 1346 + for_each_domain(cpu, sd) { 1347 + struct sched_group *sg; 1348 + int group_cpu; 1349 + 1350 + if (!(sd->flags & SD_ASYM_PACKING)) 1351 + continue; 1352 + 1353 + /* 1354 + * Groups of overlapping domain are replicated per NUMA 1355 + * node and will require updating "asym_prefer_cpu" on 1356 + * each local copy. 1357 + * 1358 + * If you are hitting this warning, consider moving 1359 + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" 1360 + * which is shared by all the overlapping groups. 1361 + */ 1362 + WARN_ON_ONCE(sd->flags & SD_OVERLAP); 1363 + 1364 + sg = sd->groups; 1365 + if (cpu != sg->asym_prefer_cpu) { 1366 + /* 1367 + * Since the parent is a superset of the current group, 1368 + * if the cpu is not the "asym_prefer_cpu" at the 1369 + * current level, it cannot be the preferred CPU at a 1370 + * higher levels either. 1371 + */ 1372 + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) 1373 + return; 1374 + 1375 + WRITE_ONCE(sg->asym_prefer_cpu, cpu); 1376 + continue; 1377 + } 1378 + 1379 + /* Ranking has improved; CPU is still the preferred one. 
*/ 1380 + if (new_prio >= old_prio) 1381 + continue; 1382 + 1383 + for_each_cpu(group_cpu, sched_group_span(sg)) { 1384 + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) 1385 + asym_prefer_cpu = group_cpu; 1386 + } 1387 + 1388 + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); 1389 + } 1390 + } 1391 + 1392 + #endif /* CONFIG_SMP */ 1393 + 1336 1394 /* 1337 1395 * Set of available CPUs grouped by their corresponding capacities 1338 1396 * Each list entry contains a CPU mask reflecting CPUs that share the same ··· 2156 2098 for (i = 0; i < sched_domains_numa_levels; i++) { 2157 2099 if (!masks[i][j]) 2158 2100 break; 2159 - cpu = cpumask_any_and(cpus, masks[i][j]); 2101 + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); 2160 2102 if (cpu < nr_cpu_ids) { 2161 2103 found = cpu; 2162 2104 break; ··· 2405 2347 2406 2348 /* 2407 2349 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for 2408 - * any two given CPUs at this (non-NUMA) topology level. 2350 + * any two given CPUs on non-NUMA topology levels. 2409 2351 */ 2410 - static bool topology_span_sane(struct sched_domain_topology_level *tl, 2411 - const struct cpumask *cpu_map, int cpu) 2352 + static bool topology_span_sane(const struct cpumask *cpu_map) 2412 2353 { 2413 - int i = cpu + 1; 2354 + struct sched_domain_topology_level *tl; 2355 + struct cpumask *covered, *id_seen; 2356 + int cpu; 2414 2357 2415 - /* NUMA levels are allowed to overlap */ 2416 - if (tl->flags & SDTL_OVERLAP) 2417 - return true; 2358 + lockdep_assert_held(&sched_domains_mutex); 2359 + covered = sched_domains_tmpmask; 2360 + id_seen = sched_domains_tmpmask2; 2418 2361 2419 - /* 2420 - * Non-NUMA levels cannot partially overlap - they must be either 2421 - * completely equal or completely disjoint. Otherwise we can end up 2422 - * breaking the sched_group lists - i.e. a later get_group() pass 2423 - * breaks the linking done for an earlier span. 
2424 - */ 2425 - for_each_cpu_from(i, cpu_map) { 2362 + for_each_sd_topology(tl) { 2363 + 2364 + /* NUMA levels are allowed to overlap */ 2365 + if (tl->flags & SDTL_OVERLAP) 2366 + continue; 2367 + 2368 + cpumask_clear(covered); 2369 + cpumask_clear(id_seen); 2370 + 2426 2371 /* 2427 - * We should 'and' all those masks with 'cpu_map' to exactly 2428 - * match the topology we're about to build, but that can only 2429 - * remove CPUs, which only lessens our ability to detect 2430 - * overlaps 2372 + * Non-NUMA levels cannot partially overlap - they must be either 2373 + * completely equal or completely disjoint. Otherwise we can end up 2374 + * breaking the sched_group lists - i.e. a later get_group() pass 2375 + * breaks the linking done for an earlier span. 2431 2376 */ 2432 - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && 2433 - cpumask_intersects(tl->mask(cpu), tl->mask(i))) 2434 - return false; 2435 - } 2377 + for_each_cpu(cpu, cpu_map) { 2378 + const struct cpumask *tl_cpu_mask = tl->mask(cpu); 2379 + int id; 2436 2380 2381 + /* lowest bit set in this mask is used as a unique id */ 2382 + id = cpumask_first(tl_cpu_mask); 2383 + 2384 + if (cpumask_test_cpu(id, id_seen)) { 2385 + /* First CPU has already been seen, ensure identical spans */ 2386 + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) 2387 + return false; 2388 + } else { 2389 + /* First CPU hasn't been seen before, ensure it's a completely new span */ 2390 + if (cpumask_intersects(tl_cpu_mask, covered)) 2391 + return false; 2392 + 2393 + cpumask_or(covered, covered, tl_cpu_mask); 2394 + cpumask_set_cpu(id, id_seen); 2395 + } 2396 + } 2397 + } 2437 2398 return true; 2438 2399 } 2439 2400 ··· 2485 2408 sd = NULL; 2486 2409 for_each_sd_topology(tl) { 2487 2410 2488 - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) 2489 - goto error; 2490 - 2491 2411 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 2492 2412 2493 2413 has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; ··· 2497 2423 break; 2498 2424 } 2499 
2425 } 2426 + 2427 + if (WARN_ON(!topology_span_sane(cpu_map))) 2428 + goto error; 2500 2429 2501 2430 /* Build the groups for the domains */ 2502 2431 for_each_cpu(i, cpu_map) {