Merge tag 'sched-core-2026-04-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+1

MAINTAINERS

··· 23708 23708 R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH) 23709 23709 R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING) 23710 23710 R: Valentin Schneider <vschneid@redhat.com> (TOPOLOGY) 23711 + R: K Prateek Nayak <kprateek.nayak@amd.com> 23711 23712 L: linux-kernel@vger.kernel.org 23712 23713 S: Maintained 23713 23714 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core

-3

arch/x86/include/asm/mmu_context.h

··· 136 136 } 137 137 #endif 138 138 139 - #define enter_lazy_tlb enter_lazy_tlb 140 - extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); 141 - 142 139 extern void mm_init_global_asid(struct mm_struct *mm); 143 140 extern void mm_free_global_asid(struct mm_struct *mm); 144 141

+26

arch/x86/include/asm/tlbflush.h

··· 172 172 }; 173 173 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); 174 174 175 + /* 176 + * Please ignore the name of this function. It should be called 177 + * switch_to_kernel_thread(). 178 + * 179 + * enter_lazy_tlb() is a hint from the scheduler that we are entering a 180 + * kernel thread or other context without an mm. Acceptable implementations 181 + * include doing nothing whatsoever, switching to init_mm, or various clever 182 + * lazy tricks to try to minimize TLB flushes. 183 + * 184 + * The scheduler reserves the right to call enter_lazy_tlb() several times 185 + * in a row. It will notify us that we're going back to a real mm by 186 + * calling switch_mm_irqs_off(). 187 + */ 188 + #define enter_lazy_tlb enter_lazy_tlb 189 + static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 190 + { 191 + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 192 + return; 193 + 194 + this_cpu_write(cpu_tlbstate_shared.is_lazy, true); 195 + } 196 + 175 197 bool nmi_uaccess_okay(void); 176 198 #define nmi_uaccess_okay nmi_uaccess_okay 177 199 ··· 502 480 { 503 481 } 504 482 #endif 483 + #else /* !MODULE */ 484 + #define enter_lazy_tlb enter_lazy_tlb 485 + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 486 + __compiletime_error("enter_lazy_tlb() should not be used in modules"); 505 487 #endif /* !MODULE */ 506 488 507 489 static inline void __native_tlb_flush_global(unsigned long cr4)

-21

arch/x86/mm/tlb.c

··· 972 972 } 973 973 974 974 /* 975 - * Please ignore the name of this function. It should be called 976 - * switch_to_kernel_thread(). 977 - * 978 - * enter_lazy_tlb() is a hint from the scheduler that we are entering a 979 - * kernel thread or other context without an mm. Acceptable implementations 980 - * include doing nothing whatsoever, switching to init_mm, or various clever 981 - * lazy tricks to try to minimize TLB flushes. 982 - * 983 - * The scheduler reserves the right to call enter_lazy_tlb() several times 984 - * in a row. It will notify us that we're going back to a real mm by 985 - * calling switch_mm_irqs_off(). 986 - */ 987 - void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 988 - { 989 - if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 990 - return; 991 - 992 - this_cpu_write(cpu_tlbstate_shared.is_lazy, true); 993 - } 994 - 995 - /* 996 975 * Using a temporary mm allows to set temporary mappings that are not accessible 997 976 * by other CPUs. Such mappings are needed to perform sensitive memory writes 998 977 * that override the kernel memory protections (e.g., W^X), without exposing the

+2 -2

include/linux/energy_model.h

··· 248 248 struct em_perf_state *ps; 249 249 int i; 250 250 251 - WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); 251 + lockdep_assert(rcu_read_lock_any_held()); 252 252 253 253 if (!sum_util) 254 254 return 0; ··· 267 267 * Find the lowest performance state of the Energy Model above the 268 268 * requested performance. 269 269 */ 270 - em_table = rcu_dereference(pd->em_table); 270 + em_table = rcu_dereference_all(pd->em_table); 271 271 i = em_pd_get_efficient_state(em_table->state, pd, max_util); 272 272 ps = &em_table->state[i]; 273 273

+64 -31

include/linux/sched.h

··· 1239 1239 #endif 1240 1240 1241 1241 struct mutex *blocked_on; /* lock we're blocked on */ 1242 + raw_spinlock_t blocked_lock; 1242 1243 1243 1244 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER 1244 1245 /* ··· 2181 2180 }) 2182 2181 2183 2182 #ifndef CONFIG_PREEMPT_RT 2183 + 2184 + /* 2185 + * With proxy exec, if a task has been proxy-migrated, it may be a donor 2186 + * on a cpu that it can't actually run on. Thus we need a special state 2187 + * to denote that the task is being woken, but that it needs to be 2188 + * evaluated for return-migration before it is run. So if the task is 2189 + * blocked_on PROXY_WAKING, return migrate it before running it. 2190 + */ 2191 + #define PROXY_WAKING ((struct mutex *)(-1L)) 2192 + 2184 2193 static inline struct mutex *__get_task_blocked_on(struct task_struct *p) 2185 2194 { 2186 - struct mutex *m = p->blocked_on; 2187 - 2188 - if (m) 2189 - lockdep_assert_held_once(&m->wait_lock); 2190 - return m; 2195 + lockdep_assert_held_once(&p->blocked_lock); 2196 + return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on; 2191 2197 } 2192 2198 2193 2199 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) 2194 2200 { 2195 - struct mutex *blocked_on = READ_ONCE(p->blocked_on); 2196 - 2197 2201 WARN_ON_ONCE(!m); 2198 2202 /* The task should only be setting itself as blocked */ 2199 2203 WARN_ON_ONCE(p != current); 2200 - /* Currently we serialize blocked_on under the mutex::wait_lock */ 2201 - lockdep_assert_held_once(&m->wait_lock); 2204 + /* Currently we serialize blocked_on under the task::blocked_lock */ 2205 + lockdep_assert_held_once(&p->blocked_lock); 2202 2206 /* 2203 2207 * Check ensure we don't overwrite existing mutex value 2204 2208 * with a different mutex. Note, setting it to the same 2205 2209 * lock repeatedly is ok. 
2206 2210 */ 2207 - WARN_ON_ONCE(blocked_on && blocked_on != m); 2208 - WRITE_ONCE(p->blocked_on, m); 2209 - } 2210 - 2211 - static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) 2212 - { 2213 - guard(raw_spinlock_irqsave)(&m->wait_lock); 2214 - __set_task_blocked_on(p, m); 2211 + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); 2212 + p->blocked_on = m; 2215 2213 } 2216 2214 2217 2215 static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) 2218 2216 { 2219 - if (m) { 2220 - struct mutex *blocked_on = READ_ONCE(p->blocked_on); 2221 - 2222 - /* Currently we serialize blocked_on under the mutex::wait_lock */ 2223 - lockdep_assert_held_once(&m->wait_lock); 2224 - /* 2225 - * There may be cases where we re-clear already cleared 2226 - * blocked_on relationships, but make sure we are not 2227 - * clearing the relationship with a different lock. 2228 - */ 2229 - WARN_ON_ONCE(blocked_on && blocked_on != m); 2230 - } 2231 - WRITE_ONCE(p->blocked_on, NULL); 2217 + /* Currently we serialize blocked_on under the task::blocked_lock */ 2218 + lockdep_assert_held_once(&p->blocked_lock); 2219 + /* 2220 + * There may be cases where we re-clear already cleared 2221 + * blocked_on relationships, but make sure we are not 2222 + * clearing the relationship with a different lock. 
2223 + */ 2224 + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); 2225 + p->blocked_on = NULL; 2232 2226 } 2233 2227 2234 2228 static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) 2235 2229 { 2236 - guard(raw_spinlock_irqsave)(&m->wait_lock); 2230 + guard(raw_spinlock_irqsave)(&p->blocked_lock); 2237 2231 __clear_task_blocked_on(p, m); 2238 2232 } 2233 + 2234 + static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) 2235 + { 2236 + /* Currently we serialize blocked_on under the task::blocked_lock */ 2237 + lockdep_assert_held_once(&p->blocked_lock); 2238 + 2239 + if (!sched_proxy_exec()) { 2240 + __clear_task_blocked_on(p, m); 2241 + return; 2242 + } 2243 + 2244 + /* Don't set PROXY_WAKING if blocked_on was already cleared */ 2245 + if (!p->blocked_on) 2246 + return; 2247 + /* 2248 + * There may be cases where we set PROXY_WAKING on tasks that were 2249 + * already set to waking, but make sure we are not changing 2250 + * the relationship with a different lock. 2251 + */ 2252 + WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); 2253 + p->blocked_on = PROXY_WAKING; 2254 + } 2255 + 2256 + static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) 2257 + { 2258 + guard(raw_spinlock_irqsave)(&p->blocked_lock); 2259 + __set_task_blocked_on_waking(p, m); 2260 + } 2261 + 2239 2262 #else 2240 2263 static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) 2241 2264 { 2242 2265 } 2243 2266 2244 2267 static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) 2268 + { 2269 + } 2270 + 2271 + static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) 2272 + { 2273 + } 2274 + 2275 + static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) 2245 2276 { 2246 2277 } 2247 2278 #endif /* !CONFIG_PREEMPT_RT */

+19 -7

include/linux/sched/topology.h

··· 95 95 unsigned int newidle_call; 96 96 unsigned int newidle_success; 97 97 unsigned int newidle_ratio; 98 + u64 newidle_stamp; 98 99 u64 max_newidle_lb_cost; 99 100 unsigned long last_decay_max_lb_cost; 100 101 ··· 142 141 143 142 unsigned int span_weight; 144 143 /* 145 - * Span of all CPUs in this domain. 144 + * See sched_domain_span(), on why flex arrays are broken. 146 145 * 147 - * NOTE: this field is variable length. (Allocated dynamically 148 - * by attaching extra space to the end of the structure, 149 - * depending on how many CPUs the kernel has booted up with) 150 - */ 151 146 unsigned long span[]; 147 + */ 152 148 }; 153 149 154 150 static inline struct cpumask *sched_domain_span(struct sched_domain *sd) 155 151 { 156 - return to_cpumask(sd->span); 152 + /* 153 + * Turns out that C flexible arrays are fundamentally broken since it 154 + * is allowed for offsetof(*sd, span) < sizeof(*sd), this means that 155 + * structure initialization *sd = { ... }; which writes every byte 156 + * inside sizeof(*type), will over-write the start of the flexible 157 + * array. 158 + * 159 + * Luckily, the way we allocate sched_domain is by: 160 + * 161 + * sizeof(*sd) + cpumask_size() 162 + * 163 + * this means that we have sufficient space for the whole flex array 164 + * *outside* of sizeof(*sd). So use that, and avoid using sd->span. 165 + */ 166 + unsigned long *bitmap = (void *)sd + sizeof(*sd); 167 + return to_cpumask(bitmap); 157 168 } 158 169 159 170 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ··· 184 171 185 172 struct sd_data { 186 173 struct sched_domain *__percpu *sd; 187 - struct sched_domain_shared *__percpu *sds; 188 174 struct sched_group *__percpu *sg; 189 175 struct sched_group_capacity *__percpu *sgc; 190 176 };

+2 -2

include/linux/wait_bit.h

··· 406 406 schedule()) 407 407 408 408 /** 409 - * wait_var_event_killable - wait for a variable to be updated and notified 409 + * wait_var_event_interruptible - wait for a variable to be updated and notified 410 410 * @var: the address of the variable being waited on 411 411 * @condition: the condition to wait for 412 412 * ··· 492 492 * wait_var_event_mutex - wait for a variable to be updated under a mutex 493 493 * @var: the address of the variable being waited on 494 494 * @condition: condition to wait for 495 - * @mutex: the mutex which protects updates to the variable 495 + * @lock: the mutex which protects updates to the variable 496 496 * 497 497 * Wait for a condition which can only be reliably tested while holding 498 498 * a mutex. The variables assessed in the condition will normally be

+3

include/uapi/linux/sched.h

··· 149 149 SCHED_FLAG_KEEP_ALL | \ 150 150 SCHED_FLAG_UTIL_CLAMP) 151 151 152 + /* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */ 153 + #define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01 154 + 152 155 #endif /* _UAPI_LINUX_SCHED_H */

+1

init/init_task.c

··· 169 169 .journal_info = NULL, 170 170 INIT_CPU_TIMERS(init_task) 171 171 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), 172 + .blocked_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock), 172 173 .timer_slack_ns = 50000, /* 50 usec default slack */ 173 174 .thread_pid = &init_struct_pid, 174 175 .thread_node = LIST_HEAD_INIT(init_signals.thread_head),

+1

kernel/fork.c

··· 2113 2113 ftrace_graph_init_task(p); 2114 2114 2115 2115 rt_mutex_init_task(p); 2116 + raw_spin_lock_init(&p->blocked_lock); 2116 2117 2117 2118 lockdep_assert_irqs_enabled(); 2118 2119 #ifdef CONFIG_PROVE_LOCKING

+2 -2

kernel/locking/mutex-debug.c

··· 53 53 lockdep_assert_held(&lock->wait_lock); 54 54 55 55 /* Current thread can't be already blocked (since it's executing!) */ 56 - DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); 56 + DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task)); 57 57 } 58 58 59 59 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 60 60 struct task_struct *task) 61 61 { 62 - struct mutex *blocked_on = __get_task_blocked_on(task); 62 + struct mutex *blocked_on = get_task_blocked_on(task); 63 63 64 64 DEBUG_LOCKS_WARN_ON(waiter->task != task); 65 65 DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);

+26 -14

kernel/locking/mutex.c

··· 674 674 goto err_early_kill; 675 675 } 676 676 677 + raw_spin_lock(&current->blocked_lock); 677 678 __set_task_blocked_on(current, lock); 678 679 set_current_state(state); 679 680 trace_contention_begin(lock, LCB_F_MUTEX); ··· 688 687 * the handoff. 689 688 */ 690 689 if (__mutex_trylock(lock)) 691 - goto acquired; 690 + break; 692 691 692 + raw_spin_unlock(&current->blocked_lock); 693 693 /* 694 694 * Check for signals and kill conditions while holding 695 695 * wait_lock. This ensures the lock cancellation is ordered ··· 713 711 714 712 first = lock->first_waiter == &waiter; 715 713 714 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 715 + raw_spin_lock(&current->blocked_lock); 716 716 /* 717 717 * As we likely have been woken up by task 718 718 * that has cleared our blocked_on state, re-set 719 719 * it to the lock we are trying to acquire. 720 720 */ 721 - set_task_blocked_on(current, lock); 721 + __set_task_blocked_on(current, lock); 722 722 set_current_state(state); 723 723 /* 724 724 * Here we order against unlock; we must either see it change ··· 731 727 break; 732 728 733 729 if (first) { 734 - trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); 730 + bool opt_acquired; 731 + 735 732 /* 736 733 * mutex_optimistic_spin() can call schedule(), so 737 - * clear blocked on so we don't become unselectable 734 + * we need to release these locks before calling it, 735 + * and clear blocked on so we don't become unselectable 738 736 * to run. 
739 737 */ 740 - clear_task_blocked_on(current, lock); 741 - if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) 738 + __clear_task_blocked_on(current, lock); 739 + raw_spin_unlock(&current->blocked_lock); 740 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 741 + 742 + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); 743 + opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter); 744 + 745 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 746 + raw_spin_lock(&current->blocked_lock); 747 + __set_task_blocked_on(current, lock); 748 + 749 + if (opt_acquired) 742 750 break; 743 - set_task_blocked_on(current, lock); 744 751 trace_contention_begin(lock, LCB_F_MUTEX); 745 752 } 746 - 747 - raw_spin_lock_irqsave(&lock->wait_lock, flags); 748 753 } 749 - raw_spin_lock_irqsave(&lock->wait_lock, flags); 750 - acquired: 751 754 __clear_task_blocked_on(current, lock); 752 755 __set_current_state(TASK_RUNNING); 756 + raw_spin_unlock(&current->blocked_lock); 753 757 754 758 if (ww_ctx) { 755 759 /* ··· 785 773 return 0; 786 774 787 775 err: 788 - __clear_task_blocked_on(current, lock); 776 + clear_task_blocked_on(current, lock); 789 777 __set_current_state(TASK_RUNNING); 790 778 __mutex_remove_waiter(lock, &waiter); 791 779 err_early_kill: 792 - WARN_ON(__get_task_blocked_on(current)); 780 + WARN_ON(get_task_blocked_on(current)); 793 781 trace_contention_end(lock, ret); 794 782 raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); 795 783 debug_mutex_free_waiter(&waiter); ··· 1005 993 next = waiter->task; 1006 994 1007 995 debug_mutex_wake_waiter(lock, waiter); 1008 - __clear_task_blocked_on(next, lock); 996 + set_task_blocked_on_waking(next, lock); 1009 997 wake_q_add(&wake_q, next); 1010 998 } 1011 999

+6

kernel/locking/mutex.h

··· 48 48 return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); 49 49 } 50 50 51 + static inline struct mutex *get_task_blocked_on(struct task_struct *p) 52 + { 53 + guard(raw_spinlock_irqsave)(&p->blocked_lock); 54 + return __get_task_blocked_on(p); 55 + } 56 + 51 57 #ifdef CONFIG_DEBUG_MUTEXES 52 58 extern void debug_mutex_lock_common(struct mutex *lock, 53 59 struct mutex_waiter *waiter);

+8 -8

kernel/locking/ww_mutex.h

··· 290 290 debug_mutex_wake_waiter(lock, waiter); 291 291 #endif 292 292 /* 293 - * When waking up the task to die, be sure to clear the 294 - * blocked_on pointer. Otherwise we can see circular 295 - * blocked_on relationships that can't resolve. 293 + * When waking up the task to die, be sure to set the 294 + * blocked_on to PROXY_WAKING. Otherwise we can see 295 + * circular blocked_on relationships that can't resolve. 296 296 */ 297 - __clear_task_blocked_on(waiter->task, lock); 297 + set_task_blocked_on_waking(waiter->task, lock); 298 298 wake_q_add(wake_q, waiter->task); 299 299 } 300 300 ··· 345 345 */ 346 346 if (owner != current) { 347 347 /* 348 - * When waking up the task to wound, be sure to clear the 349 - * blocked_on pointer. Otherwise we can see circular 350 - * blocked_on relationships that can't resolve. 348 + * When waking up the task to wound, be sure to set the 349 + * blocked_on to PROXY_WAKING. Otherwise we can see 350 + * circular blocked_on relationships that can't resolve. 351 351 * 352 352 * NOTE: We pass NULL here instead of lock, because we 353 353 * are waking the mutex owner, who may be currently 354 354 * blocked on a different mutex. 355 355 */ 356 - __clear_task_blocked_on(owner, NULL); 356 + set_task_blocked_on_waking(owner, NULL); 357 357 wake_q_add(wake_q, owner); 358 358 } 359 359 return true;

+272 -62

kernel/sched/core.c

··· 687 687 } 688 688 } 689 689 690 - void raw_spin_rq_unlock(struct rq *rq) 691 - { 692 - raw_spin_unlock(rq_lockp(rq)); 693 - } 694 - 695 690 /* 696 691 * double_rq_lock - safely lock two runqueues 697 692 */ ··· 3900 3905 3901 3906 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) 3902 3907 { 3908 + int this_cpu = smp_processor_id(); 3909 + 3903 3910 /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ 3904 3911 if (!scx_allow_ttwu_queue(p)) 3905 3912 return false; ··· 3926 3929 * If the CPU does not share cache, then queue the task on the 3927 3930 * remote rqs wakelist to avoid accessing remote data. 3928 3931 */ 3929 - if (!cpus_share_cache(smp_processor_id(), cpu)) 3932 + if (!cpus_share_cache(this_cpu, cpu)) 3930 3933 return true; 3931 3934 3932 - if (cpu == smp_processor_id()) 3935 + if (cpu == this_cpu) 3933 3936 return false; 3934 3937 3935 3938 /* ··· 4793 4796 scx_post_fork(p); 4794 4797 } 4795 4798 4796 - unsigned long to_ratio(u64 period, u64 runtime) 4799 + u64 to_ratio(u64 period, u64 runtime) 4797 4800 { 4798 4801 if (runtime == RUNTIME_INF) 4799 4802 return BW_UNIT; ··· 4966 4969 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 4967 4970 */ 4968 4971 smp_store_release(&prev->on_cpu, 0); 4972 + } 4973 + 4974 + /* 4975 + * Only called from __schedule context 4976 + * 4977 + * There are some cases where we are going to re-do the action 4978 + * that added the balance callbacks. We may not be in a state 4979 + * where we can run them, so just zap them so they can be 4980 + * properly re-added on the next time around. This is similar 4981 + * handling to running the callbacks, except we just don't call 4982 + * them. 
4983 + */ 4984 + static void zap_balance_callbacks(struct rq *rq) 4985 + { 4986 + struct balance_callback *next, *head; 4987 + bool found = false; 4988 + 4989 + lockdep_assert_rq_held(rq); 4990 + 4991 + head = rq->balance_callback; 4992 + while (head) { 4993 + if (head == &balance_push_callback) 4994 + found = true; 4995 + next = head->next; 4996 + head->next = NULL; 4997 + head = next; 4998 + } 4999 + rq->balance_callback = found ? &balance_push_callback : NULL; 4969 5000 } 4970 5001 4971 5002 static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) ··· 5765 5740 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); 5766 5741 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); 5767 5742 if (os == TICK_SCHED_REMOTE_RUNNING) 5768 - queue_delayed_work(system_unbound_wq, dwork, HZ); 5743 + queue_delayed_work(system_dfl_wq, dwork, HZ); 5769 5744 } 5770 5745 5771 5746 static void sched_tick_start(int cpu) ··· 5784 5759 if (os == TICK_SCHED_REMOTE_OFFLINE) { 5785 5760 twork->cpu = cpu; 5786 5761 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 5787 - queue_delayed_work(system_unbound_wq, &twork->work, HZ); 5762 + queue_delayed_work(system_dfl_wq, &twork->work, HZ); 5788 5763 } 5789 5764 } 5790 5765 ··· 6582 6557 if (signal_pending_state(task_state, p)) { 6583 6558 WRITE_ONCE(p->__state, TASK_RUNNING); 6584 6559 *task_state_p = TASK_RUNNING; 6560 + set_task_blocked_on_waking(p, NULL); 6561 + 6585 6562 return false; 6586 6563 } 6587 6564 ··· 6621 6594 } 6622 6595 6623 6596 #ifdef CONFIG_SCHED_PROXY_EXEC 6597 + static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) 6598 + { 6599 + unsigned int wake_cpu; 6600 + 6601 + /* 6602 + * Since we are enqueuing a blocked task on a cpu it may 6603 + * not be able to run on, preserve wake_cpu when we 6604 + * __set_task_cpu so we can return the task to where it 6605 + * was previously runnable. 
6606 + */ 6607 + wake_cpu = p->wake_cpu; 6608 + __set_task_cpu(p, cpu); 6609 + p->wake_cpu = wake_cpu; 6610 + } 6611 + 6624 6612 static inline struct task_struct *proxy_resched_idle(struct rq *rq) 6625 6613 { 6626 6614 put_prev_set_next_task(rq, rq->donor, rq->idle); ··· 6644 6602 return rq->idle; 6645 6603 } 6646 6604 6647 - static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) 6605 + static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) 6648 6606 { 6649 6607 unsigned long state = READ_ONCE(donor->__state); 6650 6608 ··· 6664 6622 return try_to_block_task(rq, donor, &state, true); 6665 6623 } 6666 6624 6667 - static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) 6625 + static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) 6626 + __releases(__rq_lockp(rq)) 6668 6627 { 6669 - if (!__proxy_deactivate(rq, donor)) { 6628 + /* 6629 + * The class scheduler may have queued a balance callback 6630 + * from pick_next_task() called earlier. 6631 + * 6632 + * So here we have to zap callbacks before unlocking the rq 6633 + * as another CPU may jump in and call sched_balance_rq 6634 + * which can trip the warning in rq_pin_lock() if we 6635 + * leave callbacks set. 6636 + * 6637 + * After we later reacquire the rq lock, we will force __schedule() 6638 + * to pick_again, so the callbacks will get re-established. 6639 + */ 6640 + zap_balance_callbacks(rq); 6641 + rq_unpin_lock(rq, rf); 6642 + raw_spin_rq_unlock(rq); 6643 + } 6644 + 6645 + static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf) 6646 + __acquires(__rq_lockp(rq)) 6647 + { 6648 + raw_spin_rq_lock(rq); 6649 + rq_repin_lock(rq, rf); 6650 + update_rq_clock(rq); 6651 + } 6652 + 6653 + /* 6654 + * If the blocked-on relationship crosses CPUs, migrate @p to the 6655 + * owner's CPU. 
6656 + * 6657 + * This is because we must respect the CPU affinity of execution 6658 + * contexts (owner) but we can ignore affinity for scheduling 6659 + * contexts (@p). So we have to move scheduling contexts towards 6660 + * potential execution contexts. 6661 + * 6662 + * Note: The owner can disappear, but simply migrate to @target_cpu 6663 + * and leave that CPU to sort things out. 6664 + */ 6665 + static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, 6666 + struct task_struct *p, int target_cpu) 6667 + __must_hold(__rq_lockp(rq)) 6668 + { 6669 + struct rq *target_rq = cpu_rq(target_cpu); 6670 + 6671 + lockdep_assert_rq_held(rq); 6672 + WARN_ON(p == rq->curr); 6673 + /* 6674 + * Since we are migrating a blocked donor, it could be rq->donor, 6675 + * and we want to make sure there aren't any references from this 6676 + * rq to it before we drop the lock. This avoids another cpu 6677 + * jumping in and grabbing the rq lock and referencing rq->donor 6678 + * or cfs_rq->curr, etc after we have migrated it to another cpu, 6679 + * and before we pick_again in __schedule. 6680 + * 6681 + * So call proxy_resched_idle() to drop the rq->donor references 6682 + * before we release the lock. 
6683 + */ 6684 + proxy_resched_idle(rq); 6685 + 6686 + deactivate_task(rq, p, DEQUEUE_NOCLOCK); 6687 + proxy_set_task_cpu(p, target_cpu); 6688 + 6689 + proxy_release_rq_lock(rq, rf); 6690 + 6691 + attach_one_task(target_rq, p); 6692 + 6693 + proxy_reacquire_rq_lock(rq, rf); 6694 + } 6695 + 6696 + static void proxy_force_return(struct rq *rq, struct rq_flags *rf, 6697 + struct task_struct *p) 6698 + __must_hold(__rq_lockp(rq)) 6699 + { 6700 + struct rq *task_rq, *target_rq = NULL; 6701 + int cpu, wake_flag = WF_TTWU; 6702 + 6703 + lockdep_assert_rq_held(rq); 6704 + WARN_ON(p == rq->curr); 6705 + 6706 + if (p == rq->donor) 6707 + proxy_resched_idle(rq); 6708 + 6709 + proxy_release_rq_lock(rq, rf); 6710 + /* 6711 + * We drop the rq lock, and re-grab task_rq_lock to get 6712 + * the pi_lock (needed for select_task_rq) as well. 6713 + */ 6714 + scoped_guard (task_rq_lock, p) { 6715 + task_rq = scope.rq; 6716 + 6670 6717 /* 6671 - * XXX: For now, if deactivation failed, set donor 6672 - * as unblocked, as we aren't doing proxy-migrations 6673 - * yet (more logic will be needed then). 6718 + * Since we let go of the rq lock, the task may have been 6719 + * woken or migrated to another rq before we got the 6720 + * task_rq_lock. So re-check we're on the same RQ. If 6721 + * not, the task has already been migrated and that CPU 6722 + * will handle any further migrations. 6674 6723 */ 6675 - donor->blocked_on = NULL; 6724 + if (task_rq != rq) 6725 + break; 6726 + 6727 + /* 6728 + * Similarly, if we've been dequeued, someone else will 6729 + * wake us 6730 + */ 6731 + if (!task_on_rq_queued(p)) 6732 + break; 6733 + 6734 + /* 6735 + * Since we should only be calling here from __schedule() 6736 + * -> find_proxy_task(), no one else should have 6737 + * assigned current out from under us. But check and warn 6738 + * if we see this, then bail. 
6739 + */ 6740 + if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { 6741 + WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", 6742 + __func__, cpu_of(task_rq), 6743 + p->comm, p->pid, p->on_cpu); 6744 + break; 6745 + } 6746 + 6747 + update_rq_clock(task_rq); 6748 + deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); 6749 + cpu = select_task_rq(p, p->wake_cpu, &wake_flag); 6750 + set_task_cpu(p, cpu); 6751 + target_rq = cpu_rq(cpu); 6752 + clear_task_blocked_on(p, NULL); 6676 6753 } 6677 - return NULL; 6754 + 6755 + if (target_rq) 6756 + attach_one_task(target_rq, p); 6757 + 6758 + proxy_reacquire_rq_lock(rq, rf); 6678 6759 } 6679 6760 6680 6761 /* ··· 6811 6646 * p->pi_lock 6812 6647 * rq->lock 6813 6648 * mutex->wait_lock 6649 + * p->blocked_lock 6814 6650 * 6815 6651 * Returns the task that is going to be used as execution context (the one 6816 6652 * that is actually going to be run on cpu_of(rq)). 6817 6653 */ 6818 6654 static struct task_struct * 6819 6655 find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) 6656 + __must_hold(__rq_lockp(rq)) 6820 6657 { 6821 6658 struct task_struct *owner = NULL; 6659 + bool curr_in_chain = false; 6822 6660 int this_cpu = cpu_of(rq); 6823 6661 struct task_struct *p; 6824 6662 struct mutex *mutex; 6663 + int owner_cpu; 6825 6664 6826 6665 /* Follow blocked_on chain. 
*/ 6827 - for (p = donor; task_is_blocked(p); p = owner) { 6828 - mutex = p->blocked_on; 6829 - /* Something changed in the chain, so pick again */ 6830 - if (!mutex) 6831 - return NULL; 6666 + for (p = donor; (mutex = p->blocked_on); p = owner) { 6667 + /* if its PROXY_WAKING, do return migration or run if current */ 6668 + if (mutex == PROXY_WAKING) { 6669 + if (task_current(rq, p)) { 6670 + clear_task_blocked_on(p, PROXY_WAKING); 6671 + return p; 6672 + } 6673 + goto force_return; 6674 + } 6675 + 6832 6676 /* 6833 6677 * By taking mutex->wait_lock we hold off concurrent mutex_unlock() 6834 6678 * and ensure @owner sticks around. 6835 6679 */ 6836 6680 guard(raw_spinlock)(&mutex->wait_lock); 6681 + guard(raw_spinlock)(&p->blocked_lock); 6837 6682 6838 - /* Check again that p is blocked with wait_lock held */ 6683 + /* Check again that p is blocked with blocked_lock held */ 6839 6684 if (mutex != __get_task_blocked_on(p)) { 6840 6685 /* 6841 6686 * Something changed in the blocked_on chain and ··· 6856 6681 return NULL; 6857 6682 } 6858 6683 6684 + if (task_current(rq, p)) 6685 + curr_in_chain = true; 6686 + 6859 6687 owner = __mutex_owner(mutex); 6860 6688 if (!owner) { 6861 - __clear_task_blocked_on(p, mutex); 6862 - return p; 6689 + /* 6690 + * If there is no owner, either clear blocked_on 6691 + * and return p (if it is current and safe to 6692 + * just run on this rq), or return-migrate the task. 
6693 + */ 6694 + if (task_current(rq, p)) { 6695 + __clear_task_blocked_on(p, NULL); 6696 + return p; 6697 + } 6698 + goto force_return; 6863 6699 } 6864 6700 6865 6701 if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { 6866 6702 /* XXX Don't handle blocked owners/delayed dequeue yet */ 6867 - return proxy_deactivate(rq, donor); 6703 + if (curr_in_chain) 6704 + return proxy_resched_idle(rq); 6705 + goto deactivate; 6868 6706 } 6869 6707 6870 - if (task_cpu(owner) != this_cpu) { 6871 - /* XXX Don't handle migrations yet */ 6872 - return proxy_deactivate(rq, donor); 6708 + owner_cpu = task_cpu(owner); 6709 + if (owner_cpu != this_cpu) { 6710 + /* 6711 + * @owner can disappear, simply migrate to @owner_cpu 6712 + * and leave that CPU to sort things out. 6713 + */ 6714 + if (curr_in_chain) 6715 + return proxy_resched_idle(rq); 6716 + goto migrate_task; 6873 6717 } 6874 6718 6875 6719 if (task_on_rq_migrating(owner)) { ··· 6945 6751 * guarantee its existence, as per ttwu_remote(). 6946 6752 */ 6947 6753 } 6948 - 6949 6754 WARN_ON_ONCE(owner && !owner->on_rq); 6950 6755 return owner; 6756 + 6757 + deactivate: 6758 + if (proxy_deactivate(rq, donor)) 6759 + return NULL; 6760 + /* If deactivate fails, force return */ 6761 + p = donor; 6762 + force_return: 6763 + proxy_force_return(rq, rf, p); 6764 + return NULL; 6765 + migrate_task: 6766 + proxy_migrate_task(rq, rf, p, owner_cpu); 6767 + return NULL; 6951 6768 } 6952 6769 #else /* SCHED_PROXY_EXEC */ 6953 6770 static struct task_struct * ··· 6968 6763 return donor; 6969 6764 } 6970 6765 #endif /* SCHED_PROXY_EXEC */ 6971 - 6972 - static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) 6973 - { 6974 - if (!sched_proxy_exec()) 6975 - return; 6976 - /* 6977 - * pick_next_task() calls set_next_task() on the chosen task 6978 - * at some point, which ensures it is not push/pullable. 6979 - * However, the chosen/donor task *and* the mutex owner form an 6980 - * atomic pair wrt push/pull. 
6981 - * 6982 - * Make sure owner we run is not pushable. Unfortunately we can 6983 - * only deal with that by means of a dequeue/enqueue cycle. :-/ 6984 - */ 6985 - dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); 6986 - enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); 6987 - } 6988 6766 6989 6767 /* 6990 6768 * __schedule() is the main scheduler function. ··· 7095 6907 } 7096 6908 7097 6909 pick_again: 6910 + assert_balance_callbacks_empty(rq); 7098 6911 next = pick_next_task(rq, rq->donor, &rf); 7099 - rq_set_donor(rq, next); 7100 6912 rq->next_class = next->sched_class; 7101 - if (unlikely(task_is_blocked(next))) { 7102 - next = find_proxy_task(rq, next, &rf); 7103 - if (!next) 7104 - goto pick_again; 7105 - if (next == rq->idle) 7106 - goto keep_resched; 6913 + if (sched_proxy_exec()) { 6914 + struct task_struct *prev_donor = rq->donor; 6915 + 6916 + rq_set_donor(rq, next); 6917 + if (unlikely(next->blocked_on)) { 6918 + next = find_proxy_task(rq, next, &rf); 6919 + if (!next) { 6920 + zap_balance_callbacks(rq); 6921 + goto pick_again; 6922 + } 6923 + if (next == rq->idle) { 6924 + zap_balance_callbacks(rq); 6925 + goto keep_resched; 6926 + } 6927 + } 6928 + if (rq->donor == prev_donor && prev != next) { 6929 + struct task_struct *donor = rq->donor; 6930 + /* 6931 + * When transitioning like: 6932 + * 6933 + * prev next 6934 + * donor: B B 6935 + * curr: A B or C 6936 + * 6937 + * then put_prev_set_next_task() will not have done 6938 + * anything, since B == B. However, A might have 6939 + * missed a RT/DL balance opportunity due to being 6940 + * on_cpu. 6941 + */ 6942 + donor->sched_class->put_prev_task(rq, donor, donor); 6943 + donor->sched_class->set_next_task(rq, donor, true); 6944 + } 6945 + } else { 6946 + rq_set_donor(rq, next); 7107 6947 } 6948 + 7108 6949 picked: 7109 6950 clear_tsk_need_resched(prev); 7110 6951 clear_preempt_need_resched(); ··· 7148 6931 * changes to task_struct made by pick_next_task(). 
7149 6932 */ 7150 6933 RCU_INIT_POINTER(rq->curr, next); 7151 - 7152 - if (!task_current_donor(rq, next)) 7153 - proxy_tag_curr(rq, next); 7154 6934 7155 6935 /* 7156 6936 * The membarrier system call requires each architecture ··· 7182 6968 /* Also unlocks the rq: */ 7183 6969 rq = context_switch(rq, prev, next, &rf); 7184 6970 } else { 7185 - /* In case next was already curr but just got blocked_donor */ 7186 - if (!task_current_donor(rq, next)) 7187 - proxy_tag_curr(rq, next); 7188 - 7189 6971 rq_unpin_lock(rq, &rf); 7190 6972 __balance_callbacks(rq, NULL); 7191 6973 hrtick_schedule_exit(rq);

+36 -5

kernel/sched/deadline.c

··· 2142 2142 int flags) 2143 2143 { 2144 2144 struct task_struct *p = dl_task_of(dl_se); 2145 + struct rq *rq = rq_of_dl_rq(dl_rq); 2145 2146 2146 2147 if (!schedstat_enabled()) 2147 2148 return; 2149 + 2150 + if (p != rq->curr) 2151 + update_stats_wait_end_dl(dl_rq, dl_se); 2148 2152 2149 2153 if ((flags & DEQUEUE_SLEEP)) { 2150 2154 unsigned int state; ··· 2805 2801 2806 2802 static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) 2807 2803 { 2808 - struct task_struct *p; 2804 + struct task_struct *i, *p = NULL; 2805 + struct rb_node *next_node; 2809 2806 2810 2807 if (!has_pushable_dl_tasks(rq)) 2811 2808 return NULL; 2812 2809 2813 - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); 2810 + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); 2811 + while (next_node) { 2812 + i = __node_2_pdl(next_node); 2813 + /* make sure task isn't on_cpu (possible with proxy-exec) */ 2814 + if (!task_on_cpu(rq, i)) { 2815 + p = i; 2816 + break; 2817 + } 2818 + 2819 + next_node = rb_next(next_node); 2820 + } 2821 + 2822 + if (!p) 2823 + return NULL; 2814 2824 2815 2825 WARN_ON_ONCE(rq->cpu != task_cpu(p)); 2816 2826 WARN_ON_ONCE(task_current(rq, p)); ··· 3631 3613 dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); 3632 3614 } 3633 3615 3634 - void __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3616 + void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) 3635 3617 { 3636 3618 struct sched_dl_entity *dl_se = &p->dl; 3619 + struct rq *rq = task_rq(p); 3620 + u64 adj_deadline; 3637 3621 3638 3622 attr->sched_priority = p->rt_priority; 3639 - attr->sched_runtime = dl_se->dl_runtime; 3640 - attr->sched_deadline = dl_se->dl_deadline; 3623 + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { 3624 + guard(raw_spinlock_irq)(&rq->__lock); 3625 + update_rq_clock(rq); 3626 + if (task_current(rq, p)) 3627 + update_curr_dl(rq); 3628 + 3629 + attr->sched_runtime = dl_se->runtime; 3630 + 
adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); 3631 + attr->sched_deadline = adj_deadline; 3632 + } else { 3633 + attr->sched_runtime = dl_se->dl_runtime; 3634 + attr->sched_deadline = dl_se->dl_deadline; 3635 + } 3641 3636 attr->sched_period = dl_se->dl_period; 3642 3637 attr->sched_flags &= ~SCHED_DL_FLAGS; 3643 3638 attr->sched_flags |= dl_se->flags;

+13 -1

kernel/sched/debug.c

··· 8 8 */ 9 9 #include <linux/debugfs.h> 10 10 #include <linux/nmi.h> 11 + #include <linux/log2.h> 11 12 #include "sched.h" 12 13 13 14 /* ··· 902 901 903 902 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 904 903 { 905 - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; 904 + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; 905 + s64 zero_vruntime = -1, sum_w_vruntime = -1; 906 906 u64 avruntime; 907 907 struct sched_entity *last, *first, *root; 908 908 struct rq *rq = cpu_rq(cpu); 909 + unsigned int sum_shift; 909 910 unsigned long flags; 911 + u64 sum_weight; 910 912 911 913 #ifdef CONFIG_FAIR_GROUP_SCHED 912 914 SEQ_printf(m, "\n"); ··· 930 926 if (last) 931 927 right_vruntime = last->vruntime; 932 928 zero_vruntime = cfs_rq->zero_vruntime; 929 + sum_w_vruntime = cfs_rq->sum_w_vruntime; 930 + sum_weight = cfs_rq->sum_weight; 931 + sum_shift = cfs_rq->sum_shift; 933 932 avruntime = avg_vruntime(cfs_rq); 934 933 raw_spin_rq_unlock_irqrestore(rq, flags); 935 934 ··· 942 935 SPLIT_NS(left_vruntime)); 943 936 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", 944 937 SPLIT_NS(zero_vruntime)); 938 + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", 939 + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); 940 + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", 941 + sum_weight); 942 + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); 945 943 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", 946 944 SPLIT_NS(avruntime)); 947 945 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",

+2 -2

kernel/sched/ext.c

··· 2837 2837 2838 2838 cond_resched(); 2839 2839 } 2840 - queue_delayed_work(system_unbound_wq, to_delayed_work(work), 2840 + queue_delayed_work(system_dfl_wq, to_delayed_work(work), 2841 2841 READ_ONCE(scx_watchdog_timeout) / 2); 2842 2842 } 2843 2843 ··· 5164 5164 5165 5165 WRITE_ONCE(scx_watchdog_timeout, timeout); 5166 5166 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5167 - queue_delayed_work(system_unbound_wq, &scx_watchdog_work, 5167 + queue_delayed_work(system_dfl_wq, &scx_watchdog_work, 5168 5168 READ_ONCE(scx_watchdog_timeout) / 2); 5169 5169 5170 5170 /*

+363 -152

kernel/sched/fair.c

··· 225 225 update_sysctl(); 226 226 } 227 227 228 + #ifndef CONFIG_64BIT 228 229 #define WMULT_CONST (~0U) 229 230 #define WMULT_SHIFT 32 230 231 ··· 284 283 285 284 return mul_u64_u32_shr(delta_exec, fact, shift); 286 285 } 286 + #else 287 + static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 288 + { 289 + return (delta_exec * weight) / lw->weight; 290 + } 291 + #endif 287 292 288 293 /* 289 294 * delta /= w ··· 672 665 * Since zero_vruntime closely tracks the per-task service, these 673 666 * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag 674 667 * induced in the system due to quantisation. 675 - * 676 - * Also, we use scale_load_down() to reduce the size. 677 - * 678 - * As measured, the max (key * weight) value was ~44 bits for a kernel build. 679 668 */ 669 + static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) 670 + { 671 + #ifdef CONFIG_64BIT 672 + if (cfs_rq->sum_shift) 673 + w = max(2UL, w >> cfs_rq->sum_shift); 674 + #endif 675 + return w; 676 + } 677 + 678 + static inline void 679 + __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 680 + { 681 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 682 + s64 w_vruntime, key = entity_key(cfs_rq, se); 683 + 684 + w_vruntime = key * weight; 685 + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); 686 + 687 + cfs_rq->sum_w_vruntime += w_vruntime; 688 + cfs_rq->sum_weight += weight; 689 + } 690 + 691 + static void 692 + sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) 693 + { 694 + unsigned long weight; 695 + s64 key, tmp; 696 + 697 + again: 698 + weight = avg_vruntime_weight(cfs_rq, se->load.weight); 699 + key = entity_key(cfs_rq, se); 700 + 701 + if (check_mul_overflow(key, weight, &key)) 702 + goto overflow; 703 + 704 + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) 705 + goto overflow; 706 + 707 + cfs_rq->sum_w_vruntime = tmp; 708 + 
cfs_rq->sum_weight += weight; 709 + return; 710 + 711 + overflow: 712 + /* 713 + * There's gotta be a limit -- if we're still failing at this point 714 + * there's really nothing much to be done about things. 715 + */ 716 + BUG_ON(cfs_rq->sum_shift >= 10); 717 + cfs_rq->sum_shift++; 718 + 719 + /* 720 + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 721 + */ 722 + cfs_rq->sum_w_vruntime = 0; 723 + cfs_rq->sum_weight = 0; 724 + 725 + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; 726 + node; node = rb_next(node)) 727 + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); 728 + 729 + goto again; 730 + } 731 + 680 732 static void 681 733 sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 682 734 { 683 - unsigned long weight = scale_load_down(se->load.weight); 684 - s64 key = entity_key(cfs_rq, se); 735 + if (sched_feat(PARANOID_AVG)) 736 + return sum_w_vruntime_add_paranoid(cfs_rq, se); 685 737 686 - cfs_rq->sum_w_vruntime += key * weight; 687 - cfs_rq->sum_weight += weight; 738 + __sum_w_vruntime_add(cfs_rq, se); 688 739 } 689 740 690 741 static void 691 742 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) 692 743 { 693 - unsigned long weight = scale_load_down(se->load.weight); 744 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 694 745 s64 key = entity_key(cfs_rq, se); 695 746 696 747 cfs_rq->sum_w_vruntime -= key * weight; ··· 790 725 s64 runtime = cfs_rq->sum_w_vruntime; 791 726 792 727 if (curr) { 793 - unsigned long w = scale_load_down(curr->load.weight); 728 + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); 794 729 795 730 runtime += entity_key(cfs_rq, curr) * w; 796 731 weight += w; ··· 800 735 if (runtime < 0) 801 736 runtime -= (weight - 1); 802 737 803 - delta = div_s64(runtime, weight); 738 + delta = div64_long(runtime, weight); 804 739 } else if (curr) { 805 740 /* 806 741 * When there is but one element, it is the average. 
··· 829 764 * 830 765 * -r_max < lag < max(r_max, q) 831 766 */ 832 - static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 767 + static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime) 833 768 { 834 769 u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; 835 770 s64 vlag, limit; 836 771 837 - WARN_ON_ONCE(!se->on_rq); 838 - 839 - vlag = avg_vruntime(cfs_rq) - se->vruntime; 772 + vlag = avruntime - se->vruntime; 840 773 limit = calc_delta_fair(max_slice, se); 841 774 842 - se->vlag = clamp(vlag, -limit, limit); 775 + return clamp(vlag, -limit, limit); 776 + } 777 + 778 + /* 779 + * Delayed dequeue aims to reduce the negative lag of a dequeued task. While 780 + * updating the lag of an entity, check that negative lag didn't increase 781 + * during the delayed dequeue period which would be unfair. 782 + * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO 783 + * is set. 784 + * 785 + * Return true if the lag has been adjusted. 786 + */ 787 + static __always_inline 788 + bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 789 + { 790 + s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); 791 + bool ret; 792 + 793 + WARN_ON_ONCE(!se->on_rq); 794 + 795 + if (se->sched_delayed) { 796 + /* previous vlag < 0 otherwise se would not be delayed */ 797 + vlag = max(vlag, se->vlag); 798 + if (sched_feat(DELAY_ZERO)) 799 + vlag = min(vlag, 0); 800 + } 801 + ret = (vlag == se->vlag); 802 + se->vlag = vlag; 803 + 804 + return ret; 843 805 } 844 806 845 807 /* ··· 893 801 long load = cfs_rq->sum_weight; 894 802 895 803 if (curr && curr->on_rq) { 896 - unsigned long weight = scale_load_down(curr->load.weight); 804 + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); 897 805 898 806 avg += entity_key(cfs_rq, curr) * weight; 899 807 load += weight; ··· 1116 1024 /* 1117 1025 * Picking the ->next buddy will affect latency but not fairness. 
1118 1026 */ 1119 - if (sched_feat(PICK_BUDDY) && 1027 + if (sched_feat(PICK_BUDDY) && protect && 1120 1028 cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 1121 1029 /* ->next will never be delayed */ 1122 1030 WARN_ON_ONCE(cfs_rq->next->sched_delayed); ··· 3933 3841 se_weight(se) * -se->avg.load_sum); 3934 3842 } 3935 3843 3936 - static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); 3844 + static void 3845 + rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot) 3846 + { 3847 + unsigned long old_weight = se->load.weight; 3848 + 3849 + /* 3850 + * VRUNTIME 3851 + * -------- 3852 + * 3853 + * COROLLARY #1: The virtual runtime of the entity needs to be 3854 + * adjusted if re-weight at !0-lag point. 3855 + * 3856 + * Proof: For contradiction assume this is not true, so we can 3857 + * re-weight without changing vruntime at !0-lag point. 3858 + * 3859 + * Weight VRuntime Avg-VRuntime 3860 + * before w v V 3861 + * after w' v' V' 3862 + * 3863 + * Since lag needs to be preserved through re-weight: 3864 + * 3865 + * lag = (V - v)*w = (V'- v')*w', where v = v' 3866 + * ==> V' = (V - v)*w/w' + v (1) 3867 + * 3868 + * Let W be the total weight of the entities before reweight, 3869 + * since V' is the new weighted average of entities: 3870 + * 3871 + * V' = (WV + w'v - wv) / (W + w' - w) (2) 3872 + * 3873 + * by using (1) & (2) we obtain: 3874 + * 3875 + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v 3876 + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v 3877 + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v 3878 + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) 3879 + * 3880 + * Since we are doing at !0-lag point which means V != v, we 3881 + * can simplify (3): 3882 + * 3883 + * ==> W / (W + w' - w) = w / w' 3884 + * ==> Ww' = Ww + ww' - ww 3885 + * ==> W * (w' - w) = w * (w' - w) 3886 + * ==> W = w (re-weight indicates w' != w) 3887 + * 3888 + * So the cfs_rq contains only one entity, hence 
vruntime of 3889 + * the entity @v should always equal to the cfs_rq's weighted 3890 + * average vruntime @V, which means we will always re-weight 3891 + * at 0-lag point, thus breach assumption. Proof completed. 3892 + * 3893 + * 3894 + * COROLLARY #2: Re-weight does NOT affect weighted average 3895 + * vruntime of all the entities. 3896 + * 3897 + * Proof: According to corollary #1, Eq. (1) should be: 3898 + * 3899 + * (V - v)*w = (V' - v')*w' 3900 + * ==> v' = V' - (V - v)*w/w' (4) 3901 + * 3902 + * According to the weighted average formula, we have: 3903 + * 3904 + * V' = (WV - wv + w'v') / (W - w + w') 3905 + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') 3906 + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') 3907 + * = (WV + w'V' - Vw) / (W - w + w') 3908 + * 3909 + * ==> V'*(W - w + w') = WV + w'V' - Vw 3910 + * ==> V' * (W - w) = (W - w) * V (5) 3911 + * 3912 + * If the entity is the only one in the cfs_rq, then reweight 3913 + * always occurs at 0-lag point, so V won't change. Or else 3914 + * there are other entities, hence W != w, then Eq. (5) turns 3915 + * into V' = V. So V won't change in either case, proof done. 3916 + * 3917 + * 3918 + * So according to corollary #1 & #2, the effect of re-weight 3919 + * on vruntime should be: 3920 + * 3921 + * v' = V' - (V - v) * w / w' (4) 3922 + * = V - (V - v) * w / w' 3923 + * = V - vl * w / w' 3924 + * = V - vl' 3925 + */ 3926 + se->vlag = div64_long(se->vlag * old_weight, weight); 3927 + 3928 + /* 3929 + * DEADLINE 3930 + * -------- 3931 + * 3932 + * When the weight changes, the virtual time slope changes and 3933 + * we should adjust the relative virtual deadline accordingly. 
3934 + * 3935 + * d' = v' + (d - v)*w/w' 3936 + * = V' - (V - v)*w/w' + (d - v)*w/w' 3937 + * = V - (V - v)*w/w' + (d - v)*w/w' 3938 + * = V + (d - V)*w/w' 3939 + */ 3940 + if (se->rel_deadline) 3941 + se->deadline = div64_long(se->deadline * old_weight, weight); 3942 + 3943 + if (rel_vprot) 3944 + se->vprot = div64_long(se->vprot * old_weight, weight); 3945 + } 3937 3946 3938 3947 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 3939 3948 unsigned long weight) 3940 3949 { 3941 3950 bool curr = cfs_rq->curr == se; 3942 3951 bool rel_vprot = false; 3943 - u64 vprot; 3952 + u64 avruntime = 0; 3944 3953 3945 3954 if (se->on_rq) { 3946 3955 /* commit outstanding execution time */ 3947 3956 update_curr(cfs_rq); 3948 - update_entity_lag(cfs_rq, se); 3949 - se->deadline -= se->vruntime; 3957 + avruntime = avg_vruntime(cfs_rq); 3958 + se->vlag = entity_lag(cfs_rq, se, avruntime); 3959 + se->deadline -= avruntime; 3950 3960 se->rel_deadline = 1; 3951 3961 if (curr && protect_slice(se)) { 3952 - vprot = se->vprot - se->vruntime; 3962 + se->vprot -= avruntime; 3953 3963 rel_vprot = true; 3954 3964 } 3955 3965 ··· 4062 3868 } 4063 3869 dequeue_load_avg(cfs_rq, se); 4064 3870 4065 - /* 4066 - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), 4067 - * we need to scale se->vlag when w_i changes. 
4068 - */ 4069 - se->vlag = div_s64(se->vlag * se->load.weight, weight); 4070 - if (se->rel_deadline) 4071 - se->deadline = div_s64(se->deadline * se->load.weight, weight); 4072 - 4073 - if (rel_vprot) 4074 - vprot = div_s64(vprot * se->load.weight, weight); 3871 + rescale_entity(se, weight, rel_vprot); 4075 3872 4076 3873 update_load_set(&se->load, weight); 4077 3874 4078 3875 do { 4079 3876 u32 divider = get_pelt_divider(&se->avg); 4080 - 4081 3877 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 4082 3878 } while (0); 4083 3879 4084 3880 enqueue_load_avg(cfs_rq, se); 4085 3881 if (se->on_rq) { 4086 - place_entity(cfs_rq, se, 0); 4087 3882 if (rel_vprot) 4088 - se->vprot = se->vruntime + vprot; 3883 + se->vprot += avruntime; 3884 + se->deadline += avruntime; 3885 + se->rel_deadline = 0; 3886 + se->vruntime = avruntime - se->vlag; 3887 + 4089 3888 update_load_add(&cfs_rq->load, se->load.weight); 4090 3889 if (!curr) 4091 3890 __enqueue_entity(cfs_rq, se); ··· 5352 5165 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5353 5166 { 5354 5167 u64 vslice, vruntime = avg_vruntime(cfs_rq); 5168 + bool update_zero = false; 5355 5169 s64 lag = 0; 5356 5170 5357 5171 if (!se->custom_slice) ··· 5369 5181 */ 5370 5182 if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { 5371 5183 struct sched_entity *curr = cfs_rq->curr; 5372 - unsigned long load; 5184 + long load, weight; 5373 5185 5374 5186 lag = se->vlag; 5375 5187 ··· 5427 5239 */ 5428 5240 load = cfs_rq->sum_weight; 5429 5241 if (curr && curr->on_rq) 5430 - load += scale_load_down(curr->load.weight); 5242 + load += avg_vruntime_weight(cfs_rq, curr->load.weight); 5431 5243 5432 - lag *= load + scale_load_down(se->load.weight); 5244 + weight = avg_vruntime_weight(cfs_rq, se->load.weight); 5245 + lag *= load + weight; 5433 5246 if (WARN_ON_ONCE(!load)) 5434 5247 load = 1; 5435 - lag = div_s64(lag, load); 5248 + lag = div64_long(lag, load); 5249 + 5250 + /* 5251 + * A 
heavy entity (relative to the tree) will pull the 5252 + * avg_vruntime close to its vruntime position on enqueue. But 5253 + * the zero_vruntime point is only updated at the next 5254 + * update_deadline()/place_entity()/update_entity_lag(). 5255 + * 5256 + * Specifically (see the comment near avg_vruntime_weight()): 5257 + * 5258 + * sum_w_vruntime = \Sum (v_i - v0) * w_i 5259 + * 5260 + * Note that if v0 is near a light entity, both terms will be 5261 + * small for the light entity, while in that case both terms 5262 + * are large for the heavy entity, leading to risk of 5263 + * overflow. 5264 + * 5265 + * OTOH if v0 is near the heavy entity, then the difference is 5266 + * larger for the light entity, but the factor is small, while 5267 + * for the heavy entity the difference is small but the factor 5268 + * is large. Avoiding the multiplication overflow. 5269 + */ 5270 + if (weight > load) 5271 + update_zero = true; 5436 5272 } 5437 5273 5438 5274 se->vruntime = vruntime - lag; 5439 5275 5440 - if (se->rel_deadline) { 5276 + if (update_zero) 5277 + update_zero_vruntime(cfs_rq, -lag); 5278 + 5279 + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { 5441 5280 se->deadline += se->vruntime; 5442 5281 se->rel_deadline = 0; 5443 5282 return; ··· 5614 5399 } 5615 5400 } 5616 5401 5617 - static inline void finish_delayed_dequeue_entity(struct sched_entity *se) 5618 - { 5619 - clear_delayed(se); 5620 - if (sched_feat(DELAY_ZERO) && se->vlag > 0) 5621 - se->vlag = 0; 5622 - } 5623 - 5624 5402 static bool 5625 5403 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5626 5404 { ··· 5639 5431 if (sched_feat(DELAY_DEQUEUE) && delay && 5640 5432 !entity_eligible(cfs_rq, se)) { 5641 5433 update_load_avg(cfs_rq, se, 0); 5434 + update_entity_lag(cfs_rq, se); 5642 5435 set_delayed(se); 5643 5436 return false; 5644 5437 } ··· 5679 5470 update_cfs_group(se); 5680 5471 5681 5472 if (flags & DEQUEUE_DELAYED) 5682 - finish_delayed_dequeue_entity(se); 
5473 + clear_delayed(se); 5683 5474 5684 5475 if (cfs_rq->nr_queued == 0) { 5685 5476 update_idle_cfs_rq_clock_pelt(cfs_rq); ··· 7075 6866 7076 6867 static inline bool cpu_overutilized(int cpu) 7077 6868 { 7078 - unsigned long rq_util_min, rq_util_max; 6869 + unsigned long rq_util_max; 7079 6870 7080 6871 if (!sched_energy_enabled()) 7081 6872 return false; 7082 6873 7083 - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 7084 6874 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 7085 6875 7086 6876 /* Return true only if the utilization doesn't fit CPU's capacity */ 7087 - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 6877 + return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu); 7088 6878 } 7089 6879 7090 6880 /* ··· 7121 6913 rq->nr_running); 7122 6914 } 7123 6915 7124 - static int sched_idle_cpu(int cpu) 6916 + static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) 7125 6917 { 7126 - return sched_idle_rq(cpu_rq(cpu)); 6918 + return sched_idle_rq(rq) && !task_has_idle_policy(p); 6919 + } 6920 + 6921 + static int choose_idle_cpu(int cpu, struct task_struct *p) 6922 + { 6923 + return available_idle_cpu(cpu) || 6924 + choose_sched_idle_rq(cpu_rq(cpu), p); 7127 6925 } 7128 6926 7129 6927 static void ··· 7145 6931 WARN_ON_ONCE(!se->sched_delayed); 7146 6932 WARN_ON_ONCE(!se->on_rq); 7147 6933 7148 - if (sched_feat(DELAY_ZERO)) { 7149 - update_entity_lag(cfs_rq, se); 7150 - if (se->vlag > 0) { 7151 - cfs_rq->nr_queued--; 7152 - if (se != cfs_rq->curr) 7153 - __dequeue_entity(cfs_rq, se); 7154 - se->vlag = 0; 7155 - place_entity(cfs_rq, se, 0); 7156 - if (se != cfs_rq->curr) 7157 - __enqueue_entity(cfs_rq, se); 7158 - cfs_rq->nr_queued++; 7159 - } 6934 + if (update_entity_lag(cfs_rq, se)) { 6935 + cfs_rq->nr_queued--; 6936 + if (se != cfs_rq->curr) 6937 + __dequeue_entity(cfs_rq, se); 6938 + place_entity(cfs_rq, se, 0); 6939 + if (se != cfs_rq->curr) 6940 + __enqueue_entity(cfs_rq, se); 6941 + 
cfs_rq->nr_queued++; 7160 6942 } 7161 6943 7162 6944 update_load_avg(cfs_rq, se, 0); ··· 7685 7475 if (!sched_core_cookie_match(rq, p)) 7686 7476 continue; 7687 7477 7688 - if (sched_idle_cpu(i)) 7478 + if (choose_sched_idle_rq(rq, p)) 7689 7479 return i; 7690 7480 7691 7481 if (available_idle_cpu(i)) { ··· 7776 7566 7777 7567 static inline int __select_idle_cpu(int cpu, struct task_struct *p) 7778 7568 { 7779 - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && 7780 - sched_cpu_cookie_match(cpu_rq(cpu), p)) 7569 + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) 7781 7570 return cpu; 7782 7571 7783 7572 return -1; ··· 7849 7640 if (!available_idle_cpu(cpu)) { 7850 7641 idle = false; 7851 7642 if (*idle_cpu == -1) { 7852 - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { 7643 + if (choose_sched_idle_rq(cpu_rq(cpu), p) && 7644 + cpumask_test_cpu(cpu, cpus)) { 7853 7645 *idle_cpu = cpu; 7854 7646 break; 7855 7647 } ··· 7885 7675 */ 7886 7676 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 7887 7677 continue; 7888 - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) 7678 + if (choose_idle_cpu(cpu, p)) 7889 7679 return cpu; 7890 7680 } 7891 7681 ··· 7924 7714 { 7925 7715 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); 7926 7716 int i, cpu, idle_cpu = -1, nr = INT_MAX; 7927 - struct sched_domain_shared *sd_share; 7928 - 7929 - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 7930 7717 7931 7718 if (sched_feat(SIS_UTIL)) { 7932 - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); 7933 - if (sd_share) { 7934 - /* because !--nr is the condition to stop scan */ 7935 - nr = READ_ONCE(sd_share->nr_idle_scan) + 1; 7936 - /* overloaded LLC is unlikely to have idle cpu/core */ 7937 - if (nr == 1) 7938 - return -1; 7939 - } 7719 + /* 7720 + * Increment because !--nr is the condition to stop scan. 
7721 + * 7722 + * Since "sd" is "sd_llc" for target CPU dereferenced in the 7723 + * caller, it is safe to directly dereference "sd->shared". 7724 + * Topology bits always ensure it is assigned for "sd_llc" and it 7725 + * cannot disappear as long as we have a RCU protected 7726 + * reference to one of the associated "sd" here. 7727 + */ 7728 + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; 7729 + /* overloaded LLC is unlikely to have idle cpu/core */ 7730 + if (nr == 1) 7731 + return -1; 7940 7732 } 7733 + 7734 + if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr)) 7735 + return -1; 7941 7736 7942 7737 if (static_branch_unlikely(&sched_cluster_active)) { 7943 7738 struct sched_group *sg = sd->groups; ··· 8012 7797 for_each_cpu_wrap(cpu, cpus, target) { 8013 7798 unsigned long cpu_cap = capacity_of(cpu); 8014 7799 8015 - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) 7800 + if (!choose_idle_cpu(cpu, p)) 8016 7801 continue; 8017 7802 8018 7803 fits = util_fits_cpu(task_util, util_min, util_max, cpu); ··· 8083 7868 */ 8084 7869 lockdep_assert_irqs_disabled(); 8085 7870 8086 - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && 7871 + if (choose_idle_cpu(target, p) && 8087 7872 asym_fits_cpu(task_util, util_min, util_max, target)) 8088 7873 return target; 8089 7874 ··· 8091 7876 * If the previous CPU is cache affine and idle, don't be stupid: 8092 7877 */ 8093 7878 if (prev != target && cpus_share_cache(prev, target) && 8094 - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && 7879 + choose_idle_cpu(prev, p) && 8095 7880 asym_fits_cpu(task_util, util_min, util_max, prev)) { 8096 7881 8097 7882 if (!static_branch_unlikely(&sched_cluster_active) || ··· 8123 7908 if (recent_used_cpu != prev && 8124 7909 recent_used_cpu != target && 8125 7910 cpus_share_cache(recent_used_cpu, target) && 8126 - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && 7911 + choose_idle_cpu(recent_used_cpu, p) && 8127 7912 
cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && 8128 7913 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { 8129 7914 ··· 8623 8408 struct perf_domain *pd; 8624 8409 struct energy_env eenv; 8625 8410 8626 - rcu_read_lock(); 8627 8411 pd = rcu_dereference_all(rd->pd); 8628 8412 if (!pd) 8629 - goto unlock; 8413 + return target; 8630 8414 8631 8415 /* 8632 8416 * Energy-aware wake-up happens on the lowest sched_domain starting ··· 8635 8421 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 8636 8422 sd = sd->parent; 8637 8423 if (!sd) 8638 - goto unlock; 8424 + return target; 8639 8425 8640 8426 target = prev_cpu; 8641 8427 8642 8428 sync_entity_load_avg(&p->se); 8643 8429 if (!task_util_est(p) && p_util_min == 0) 8644 - goto unlock; 8430 + return target; 8645 8431 8646 8432 eenv_task_busy_time(&eenv, p, prev_cpu); 8647 8433 ··· 8736 8522 prev_cpu); 8737 8523 /* CPU utilization has changed */ 8738 8524 if (prev_delta < base_energy) 8739 - goto unlock; 8525 + return target; 8740 8526 prev_delta -= base_energy; 8741 8527 prev_actual_cap = cpu_actual_cap; 8742 8528 best_delta = min(best_delta, prev_delta); ··· 8760 8546 max_spare_cap_cpu); 8761 8547 /* CPU utilization has changed */ 8762 8548 if (cur_delta < base_energy) 8763 - goto unlock; 8549 + return target; 8764 8550 cur_delta -= base_energy; 8765 8551 8766 8552 /* ··· 8777 8563 best_actual_cap = cpu_actual_cap; 8778 8564 } 8779 8565 } 8780 - rcu_read_unlock(); 8781 8566 8782 8567 if ((best_fits > prev_fits) || 8783 8568 ((best_fits > 0) && (best_delta < prev_delta)) || 8784 8569 ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) 8785 8570 target = best_energy_cpu; 8786 - 8787 - return target; 8788 - 8789 - unlock: 8790 - rcu_read_unlock(); 8791 8571 8792 8572 return target; 8793 8573 } ··· 8828 8620 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 8829 8621 } 8830 8622 8831 - rcu_read_lock(); 8832 8623 for_each_domain(cpu, tmp) { 8833 8624 /* 8834 8625 * If 
both 'cpu' and 'prev_cpu' are part of this domain, ··· 8853 8646 break; 8854 8647 } 8855 8648 8856 - if (unlikely(sd)) { 8857 - /* Slow path */ 8858 - new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); 8859 - } else if (wake_flags & WF_TTWU) { /* XXX always ? */ 8860 - /* Fast path */ 8861 - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 8862 - } 8863 - rcu_read_unlock(); 8649 + /* Slow path */ 8650 + if (unlikely(sd)) 8651 + return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); 8652 + 8653 + /* Fast path */ 8654 + if (wake_flags & WF_TTWU) 8655 + return select_idle_sibling(p, prev_cpu, new_cpu); 8864 8656 8865 8657 return new_cpu; 8866 8658 } ··· 9150 8944 return; 9151 8945 9152 8946 preempt: 9153 - if (preempt_action == PREEMPT_WAKEUP_SHORT) 8947 + if (preempt_action == PREEMPT_WAKEUP_SHORT) { 9154 8948 cancel_protect_slice(se); 8949 + clear_buddies(cfs_rq, se); 8950 + } 9155 8951 9156 8952 resched_curr_lazy(rq); 9157 8953 } ··· 10001 9793 } 10002 9794 10003 9795 /* 10004 - * attach_task() -- attach the task detached by detach_task() to its new rq. 10005 - */ 10006 - static void attach_task(struct rq *rq, struct task_struct *p) 10007 - { 10008 - lockdep_assert_rq_held(rq); 10009 - 10010 - WARN_ON_ONCE(task_rq(p) != rq); 10011 - activate_task(rq, p, ENQUEUE_NOCLOCK); 10012 - wakeup_preempt(rq, p, 0); 10013 - } 10014 - 10015 - /* 10016 - * attach_one_task() -- attaches the task returned from detach_one_task() to 10017 - * its new rq. 10018 - */ 10019 - static void attach_one_task(struct rq *rq, struct task_struct *p) 10020 - { 10021 - struct rq_flags rf; 10022 - 10023 - rq_lock(rq, &rf); 10024 - update_rq_clock(rq); 10025 - attach_task(rq, p); 10026 - rq_unlock(rq, &rf); 10027 - } 10028 - 10029 - /* 10030 9796 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their 10031 9797 * new rq. 
10032 9798 */ ··· 10237 10055 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ 10238 10056 unsigned int group_smt_balance; /* Task on busy SMT be moved */ 10239 10057 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ 10058 + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ 10240 10059 #ifdef CONFIG_NUMA_BALANCING 10241 10060 unsigned int nr_numa_running; 10242 10061 unsigned int nr_preferred_running; ··· 10470 10287 static inline bool 10471 10288 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) 10472 10289 { 10290 + /* 10291 + * With EAS and uclamp, 1 CPU in the group must be overutilized to 10292 + * consider the group overloaded. 10293 + */ 10294 + if (sched_energy_enabled() && !sgs->group_overutilized) 10295 + return false; 10296 + 10473 10297 if (sgs->sum_nr_running <= sgs->group_weight) 10474 10298 return false; 10475 10299 ··· 10660 10470 * @group: sched_group whose statistics are to be updated. 10661 10471 * @sgs: variable to hold the statistics for this group. 
10662 10472 * @sg_overloaded: sched_group is overloaded 10663 - * @sg_overutilized: sched_group is overutilized 10664 10473 */ 10665 10474 static inline void update_sg_lb_stats(struct lb_env *env, 10666 10475 struct sd_lb_stats *sds, 10667 10476 struct sched_group *group, 10668 10477 struct sg_lb_stats *sgs, 10669 - bool *sg_overloaded, 10670 - bool *sg_overutilized) 10478 + bool *sg_overloaded) 10671 10479 { 10672 10480 int i, nr_running, local_group, sd_flags = env->sd->flags; 10673 10481 bool balancing_at_rd = !env->sd->parent; ··· 10687 10499 sgs->sum_nr_running += nr_running; 10688 10500 10689 10501 if (cpu_overutilized(i)) 10690 - *sg_overutilized = 1; 10502 + sgs->group_overutilized = 1; 10691 10503 10692 10504 /* 10693 10505 * No need to call idle_cpu() if nr_running is not 0 ··· 11263 11075 unsigned long sum_util) 11264 11076 { 11265 11077 struct sched_domain_shared *sd_share; 11078 + struct sched_domain *sd = env->sd; 11266 11079 int llc_weight, pct; 11267 11080 u64 x, y, tmp; 11268 11081 /* ··· 11277 11088 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) 11278 11089 return; 11279 11090 11280 - llc_weight = per_cpu(sd_llc_size, env->dst_cpu); 11281 - if (env->sd->span_weight != llc_weight) 11282 - return; 11283 - 11284 - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); 11091 + sd_share = sd->shared; 11285 11092 if (!sd_share) 11286 11093 return; 11287 11094 ··· 11311 11126 */ 11312 11127 /* equation [3] */ 11313 11128 x = sum_util; 11129 + llc_weight = sd->span_weight; 11314 11130 do_div(x, llc_weight); 11315 11131 11316 11132 /* equation [4] */ 11317 - pct = env->sd->imbalance_pct; 11133 + pct = sd->imbalance_pct; 11318 11134 tmp = x * x * pct * pct; 11319 11135 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); 11320 11136 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); ··· 11356 11170 update_group_capacity(env->sd, env->dst_cpu); 11357 11171 } 11358 11172 11359 - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, 
&sg_overutilized); 11173 + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded); 11360 11174 11361 11175 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { 11362 11176 sds->busiest = sg; 11363 11177 sds->busiest_stat = *sgs; 11364 11178 } 11179 + 11180 + sg_overutilized |= sgs->group_overutilized; 11365 11181 11366 11182 /* Now, start updating sd_lb_stats */ 11367 11183 sds->total_load += sgs->group_load; ··· 12485 12297 sd->newidle_success += success; 12486 12298 12487 12299 if (sd->newidle_call >= 1024) { 12488 - sd->newidle_ratio = sd->newidle_success; 12300 + u64 now = sched_clock(); 12301 + s64 delta = now - sd->newidle_stamp; 12302 + sd->newidle_stamp = now; 12303 + int ratio = 0; 12304 + 12305 + if (delta < 0) 12306 + delta = 0; 12307 + 12308 + if (sched_feat(NI_RATE)) { 12309 + /* 12310 + * ratio delta freq 12311 + * 12312 + * 1024 - 4 s - 128 Hz 12313 + * 512 - 2 s - 256 Hz 12314 + * 256 - 1 s - 512 Hz 12315 + * 128 - .5 s - 1024 Hz 12316 + * 64 - .25 s - 2048 Hz 12317 + */ 12318 + ratio = delta >> 22; 12319 + } 12320 + 12321 + ratio += sd->newidle_success; 12322 + 12323 + sd->newidle_ratio = min(1024, ratio); 12489 12324 sd->newidle_call /= 2; 12490 12325 sd->newidle_success /= 2; 12491 12326 } ··· 12555 12344 { 12556 12345 int continue_balancing = 1; 12557 12346 int cpu = rq->cpu; 12558 - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); 12347 + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); 12559 12348 unsigned long interval; 12560 12349 struct sched_domain *sd; 12561 12350 /* Earliest time when we have to do rebalance again */ ··· 12593 12382 * state even if we migrated tasks. Update it. 
12594 12383 */ 12595 12384 idle = idle_cpu(cpu); 12596 - busy = !idle && !sched_idle_cpu(cpu); 12385 + busy = !idle && !sched_idle_rq(rq); 12597 12386 } 12598 12387 sd->last_balance = jiffies; 12599 12388 interval = get_sd_balance_interval(sd, busy); ··· 12638 12427 */ 12639 12428 static inline int find_new_ilb(void) 12640 12429 { 12430 + int this_cpu = smp_processor_id(); 12641 12431 const struct cpumask *hk_mask; 12642 12432 int ilb_cpu; 12643 12433 12644 12434 hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); 12645 12435 12646 12436 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { 12647 - 12648 - if (ilb_cpu == smp_processor_id()) 12437 + if (ilb_cpu == this_cpu) 12649 12438 continue; 12650 12439 12651 12440 if (idle_cpu(ilb_cpu)) ··· 13215 13004 if (sd->flags & SD_BALANCE_NEWIDLE) { 13216 13005 unsigned int weight = 1; 13217 13006 13218 - if (sched_feat(NI_RANDOM)) { 13007 + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { 13219 13008 /* 13220 13009 * Throw a 1k sided dice; and only run 13221 13010 * newidle_balance according to the success ··· 14241 14030 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; 14242 14031 } 14243 14032 if (ng) { 14244 - gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], 14033 + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; 14245 14034 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; 14246 14035 } 14247 14036 print_numa_stats(m, node, tsf, tpf, gsf, gpf);

+3

kernel/sched/features.h

··· 58 58 SCHED_FEAT(DELAY_DEQUEUE, true) 59 59 SCHED_FEAT(DELAY_ZERO, true) 60 60 61 + SCHED_FEAT(PARANOID_AVG, false) 62 + 61 63 /* 62 64 * Allow wakeup-time preemption of the current task: 63 65 */ ··· 133 131 * Do newidle balancing proportional to its success rate using randomization. 134 132 */ 135 133 SCHED_FEAT(NI_RANDOM, true) 134 + SCHED_FEAT(NI_RATE, true)

+26 -38

kernel/sched/rt.c

··· 1302 1302 int flags) 1303 1303 { 1304 1304 struct task_struct *p = NULL; 1305 + struct rq *rq = rq_of_rt_rq(rt_rq); 1305 1306 1306 1307 if (!schedstat_enabled()) 1307 1308 return; 1308 1309 1309 - if (rt_entity_is_task(rt_se)) 1310 + if (rt_entity_is_task(rt_se)) { 1310 1311 p = rt_task_of(rt_se); 1312 + 1313 + if (p != rq->curr) 1314 + update_stats_wait_end_rt(rt_rq, rt_se); 1315 + } 1311 1316 1312 1317 if ((flags & DEQUEUE_SLEEP) && p) { 1313 1318 unsigned int state; ··· 1858 1853 1859 1854 static struct task_struct *pick_next_pushable_task(struct rq *rq) 1860 1855 { 1861 - struct task_struct *p; 1856 + struct plist_head *head = &rq->rt.pushable_tasks; 1857 + struct task_struct *i, *p = NULL; 1862 1858 1863 1859 if (!has_pushable_tasks(rq)) 1864 1860 return NULL; 1865 1861 1866 - p = plist_first_entry(&rq->rt.pushable_tasks, 1867 - struct task_struct, pushable_tasks); 1862 + plist_for_each_entry(i, head, pushable_tasks) { 1863 + /* make sure task isn't on_cpu (possible with proxy-exec) */ 1864 + if (!task_on_cpu(rq, i)) { 1865 + p = i; 1866 + break; 1867 + } 1868 + } 1869 + 1870 + if (!p) 1871 + return NULL; 1868 1872 1869 1873 BUG_ON(rq->cpu != task_cpu(p)); 1870 1874 BUG_ON(task_current(rq, p)); ··· 2666 2652 { 2667 2653 struct rt_schedulable_data *d = data; 2668 2654 struct task_group *child; 2669 - unsigned long total, sum = 0; 2655 + u64 total, sum = 0; 2670 2656 u64 period, runtime; 2671 2657 2672 2658 period = ktime_to_ns(tg->rt_bandwidth.rt_period); ··· 2688 2674 */ 2689 2675 if (rt_bandwidth_enabled() && !runtime && 2690 2676 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) 2691 - return -EBUSY; 2692 - 2693 - if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) 2694 2677 return -EBUSY; 2695 2678 2696 2679 total = to_ratio(period, runtime); ··· 2829 2818 return rt_period_us; 2830 2819 } 2831 2820 2832 - #ifdef CONFIG_SYSCTL 2833 - static int sched_rt_global_constraints(void) 2834 - { 2835 - int ret = 0; 2836 - 2837 - 
mutex_lock(&rt_constraints_mutex); 2838 - ret = __rt_schedulable(NULL, 0, 0); 2839 - mutex_unlock(&rt_constraints_mutex); 2840 - 2841 - return ret; 2842 - } 2843 - #endif /* CONFIG_SYSCTL */ 2844 - 2845 2821 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 2846 2822 { 2847 2823 /* Don't accept real-time tasks when there is no way for them to run */ ··· 2838 2840 return 1; 2839 2841 } 2840 2842 2841 - #else /* !CONFIG_RT_GROUP_SCHED: */ 2842 - 2843 - #ifdef CONFIG_SYSCTL 2844 - static int sched_rt_global_constraints(void) 2845 - { 2846 - return 0; 2847 - } 2848 - #endif /* CONFIG_SYSCTL */ 2849 2843 #endif /* !CONFIG_RT_GROUP_SCHED */ 2850 2844 2851 2845 #ifdef CONFIG_SYSCTL ··· 2849 2859 NSEC_PER_USEC > max_rt_runtime))) 2850 2860 return -EINVAL; 2851 2861 2852 - return 0; 2853 - } 2862 + #ifdef CONFIG_RT_GROUP_SCHED 2863 + if (!rt_group_sched_enabled()) 2864 + return 0; 2854 2865 2855 - static void sched_rt_do_global(void) 2856 - { 2866 + scoped_guard(mutex, &rt_constraints_mutex) 2867 + return __rt_schedulable(NULL, 0, 0); 2868 + #endif 2869 + return 0; 2857 2870 } 2858 2871 2859 2872 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, ··· 2882 2889 if (ret) 2883 2890 goto undo; 2884 2891 2885 - ret = sched_rt_global_constraints(); 2886 - if (ret) 2887 - goto undo; 2888 - 2889 - sched_rt_do_global(); 2890 2892 sched_dl_do_global(); 2891 2893 } 2892 2894 if (0) {

+42 -8

kernel/sched/sched.h

··· 356 356 extern void sched_dl_do_global(void); 357 357 extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); 358 358 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 359 - extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 359 + extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); 360 360 extern bool __checkparam_dl(const struct sched_attr *attr); 361 361 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 362 362 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); ··· 684 684 685 685 s64 sum_w_vruntime; 686 686 u64 sum_weight; 687 - 688 687 u64 zero_vruntime; 688 + unsigned int sum_shift; 689 + 689 690 #ifdef CONFIG_SCHED_CORE 690 691 unsigned int forceidle_seq; 691 692 u64 zero_vruntime_fi; ··· 1612 1611 extern bool raw_spin_rq_trylock(struct rq *rq) 1613 1612 __cond_acquires(true, __rq_lockp(rq)); 1614 1613 1615 - extern void raw_spin_rq_unlock(struct rq *rq) 1616 - __releases(__rq_lockp(rq)); 1617 - 1618 1614 static inline void raw_spin_rq_lock(struct rq *rq) 1619 1615 __acquires(__rq_lockp(rq)) 1620 1616 { 1621 1617 raw_spin_rq_lock_nested(rq, 0); 1618 + } 1619 + 1620 + static inline void raw_spin_rq_unlock(struct rq *rq) 1621 + __releases(__rq_lockp(rq)) 1622 + { 1623 + raw_spin_unlock(rq_lockp(rq)); 1622 1624 } 1623 1625 1624 1626 static inline void raw_spin_rq_lock_irq(struct rq *rq) ··· 1862 1858 static inline void scx_rq_clock_invalidate(struct rq *rq) {} 1863 1859 #endif /* !CONFIG_SCHED_CLASS_EXT */ 1864 1860 1861 + static inline void assert_balance_callbacks_empty(struct rq *rq) 1862 + { 1863 + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) && 1864 + rq->balance_callback && 1865 + rq->balance_callback != &balance_push_callback); 1866 + } 1867 + 1865 1868 /* 1866 1869 * Lockdep annotation that avoids accidental unlocks; it's like a 1867 
1870 * sticky/continuous lockdep_assert_held(). ··· 1885 1874 1886 1875 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 1887 1876 rf->clock_update_flags = 0; 1888 - WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); 1877 + assert_balance_callbacks_empty(rq); 1889 1878 } 1890 1879 1891 1880 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) ··· 2865 2854 2866 2855 static inline struct cpuidle_state *idle_get_state(struct rq *rq) 2867 2856 { 2868 - WARN_ON_ONCE(!rcu_read_lock_held()); 2857 + lockdep_assert(rcu_read_lock_any_held()); 2869 2858 2870 2859 return rq->idle_state; 2871 2860 } ··· 2912 2901 #define MAX_BW_BITS (64 - BW_SHIFT) 2913 2902 #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) 2914 2903 2915 - extern unsigned long to_ratio(u64 period, u64 runtime); 2904 + extern u64 to_ratio(u64 period, u64 runtime); 2916 2905 2917 2906 extern void init_entity_runnable_average(struct sched_entity *se); 2918 2907 extern void post_init_entity_util_avg(struct task_struct *p); ··· 3016 3005 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); 3017 3006 3018 3007 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); 3008 + 3009 + /* 3010 + * attach_task() -- attach the task detached by detach_task() to its new rq. 3011 + */ 3012 + static inline void attach_task(struct rq *rq, struct task_struct *p) 3013 + { 3014 + lockdep_assert_rq_held(rq); 3015 + 3016 + WARN_ON_ONCE(task_rq(p) != rq); 3017 + activate_task(rq, p, ENQUEUE_NOCLOCK); 3018 + wakeup_preempt(rq, p, 0); 3019 + } 3020 + 3021 + /* 3022 + * attach_one_task() -- attaches the task returned from detach_one_task() to 3023 + * its new rq. 3024 + */ 3025 + static inline void attach_one_task(struct rq *rq, struct task_struct *p) 3026 + { 3027 + guard(rq_lock)(rq); 3028 + update_rq_clock(rq); 3029 + attach_task(rq, p); 3030 + } 3019 3031 3020 3032 #ifdef CONFIG_PREEMPT_RT 3021 3033 # define SCHED_NR_MIGRATE_BREAK 8

+11 -5

kernel/sched/syscalls.c

··· 911 911 return -E2BIG; 912 912 } 913 913 914 - static void get_params(struct task_struct *p, struct sched_attr *attr) 914 + static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) 915 915 { 916 916 if (task_has_dl_policy(p)) { 917 - __getparam_dl(p, attr); 917 + __getparam_dl(p, attr, flags); 918 918 } else if (task_has_rt_policy(p)) { 919 919 attr->sched_priority = p->rt_priority; 920 920 } else { ··· 980 980 return -ESRCH; 981 981 982 982 if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) 983 - get_params(p, &attr); 983 + get_params(p, &attr, 0); 984 984 985 985 return sched_setattr(p, &attr); 986 986 } ··· 1065 1065 int retval; 1066 1066 1067 1067 if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || 1068 - usize < SCHED_ATTR_SIZE_VER0 || flags)) 1068 + usize < SCHED_ATTR_SIZE_VER0)) 1069 1069 return -EINVAL; 1070 1070 1071 1071 scoped_guard (rcu) { 1072 1072 p = find_process_by_pid(pid); 1073 1073 if (!p) 1074 1074 return -ESRCH; 1075 + 1076 + if (flags) { 1077 + if (!task_has_dl_policy(p) || 1078 + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) 1079 + return -EINVAL; 1080 + } 1075 1081 1076 1082 retval = security_task_getscheduler(p); 1077 1083 if (retval) ··· 1086 1080 kattr.sched_policy = p->policy; 1087 1081 if (p->sched_reset_on_fork) 1088 1082 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 1089 - get_params(p, &kattr); 1083 + get_params(p, &kattr, flags); 1090 1084 kattr.sched_flags &= SCHED_FLAG_ALL; 1091 1085 1092 1086 #ifdef CONFIG_UCLAMP_TASK

+172 -103

kernel/sched/topology.c

··· 4 4 */ 5 5 6 6 #include <linux/sched/isolation.h> 7 + #include <linux/sched/clock.h> 7 8 #include <linux/bsearch.h> 8 9 #include "sched.h" 9 10 ··· 273 272 static int sched_energy_aware_handler(const struct ctl_table *table, int write, 274 273 void *buffer, size_t *lenp, loff_t *ppos) 275 274 { 276 - int ret, state; 275 + int ret; 277 276 278 277 if (write && !capable(CAP_SYS_ADMIN)) 279 278 return -EPERM; ··· 289 288 290 289 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 291 290 if (!ret && write) { 292 - state = static_branch_unlikely(&sched_energy_present); 293 - if (state != sysctl_sched_energy_aware) 291 + if (sysctl_sched_energy_aware != sched_energy_enabled()) 294 292 rebuild_sched_domains_energy(); 295 293 } 296 294 ··· 387 387 388 388 static void sched_energy_set(bool has_eas) 389 389 { 390 - if (!has_eas && static_branch_unlikely(&sched_energy_present)) { 390 + if (!has_eas && sched_energy_enabled()) { 391 391 if (sched_debug()) 392 392 pr_info("%s: stopping EAS\n", __func__); 393 393 static_branch_disable_cpuslocked(&sched_energy_present); 394 - } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { 394 + } else if (has_eas && !sched_energy_enabled()) { 395 395 if (sched_debug()) 396 396 pr_info("%s: starting EAS\n", __func__); 397 397 static_branch_enable_cpuslocked(&sched_energy_present); ··· 684 684 if (sd) { 685 685 id = cpumask_first(sched_domain_span(sd)); 686 686 size = cpumask_weight(sched_domain_span(sd)); 687 + 688 + /* If sd_llc exists, sd_llc_shared should exist too. */ 689 + WARN_ON_ONCE(!sd->shared); 687 690 sds = sd->shared; 688 691 } 689 692 ··· 734 731 735 732 if (sd_parent_degenerate(tmp, parent)) { 736 733 tmp->parent = parent->parent; 734 + 735 + /* Pick reference to parent->shared. 
*/ 736 + if (parent->shared) { 737 + WARN_ON_ONCE(tmp->shared); 738 + tmp->shared = parent->shared; 739 + parent->shared = NULL; 740 + } 737 741 738 742 if (parent->parent) { 739 743 parent->parent->child = tmp; ··· 791 781 } 792 782 793 783 struct s_data { 784 + struct sched_domain_shared * __percpu *sds; 794 785 struct sched_domain * __percpu *sd; 795 786 struct root_domain *rd; 796 787 }; ··· 799 788 enum s_alloc { 800 789 sa_rootdomain, 801 790 sa_sd, 791 + sa_sd_shared, 802 792 sa_sd_storage, 803 793 sa_none, 804 794 }; ··· 1546 1534 static void __sdt_free(const struct cpumask *cpu_map); 1547 1535 static int __sdt_alloc(const struct cpumask *cpu_map); 1548 1536 1537 + static void __sds_free(struct s_data *d, const struct cpumask *cpu_map); 1538 + static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map); 1539 + 1549 1540 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 1550 1541 const struct cpumask *cpu_map) 1551 1542 { ··· 1559 1544 fallthrough; 1560 1545 case sa_sd: 1561 1546 free_percpu(d->sd); 1547 + fallthrough; 1548 + case sa_sd_shared: 1549 + __sds_free(d, cpu_map); 1562 1550 fallthrough; 1563 1551 case sa_sd_storage: 1564 1552 __sdt_free(cpu_map); ··· 1578 1560 1579 1561 if (__sdt_alloc(cpu_map)) 1580 1562 return sa_sd_storage; 1563 + if (__sds_alloc(d, cpu_map)) 1564 + return sa_sd_shared; 1581 1565 d->sd = alloc_percpu(struct sched_domain *); 1582 1566 if (!d->sd) 1583 - return sa_sd_storage; 1567 + return sa_sd_shared; 1584 1568 d->rd = alloc_rootdomain(); 1585 1569 if (!d->rd) 1586 1570 return sa_sd; ··· 1595 1575 * sched_group structure so that the subsequent __free_domain_allocs() 1596 1576 * will not free the data we're using. 
1597 1577 */ 1598 - static void claim_allocations(int cpu, struct sched_domain *sd) 1578 + static void claim_allocations(int cpu, struct s_data *d) 1599 1579 { 1600 - struct sd_data *sdd = sd->private; 1580 + struct sched_domain *sd; 1601 1581 1602 - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 1603 - *per_cpu_ptr(sdd->sd, cpu) = NULL; 1582 + if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref)) 1583 + *per_cpu_ptr(d->sds, cpu) = NULL; 1604 1584 1605 - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) 1606 - *per_cpu_ptr(sdd->sds, cpu) = NULL; 1585 + for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) { 1586 + struct sd_data *sdd = sd->private; 1607 1587 1608 - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 1609 - *per_cpu_ptr(sdd->sg, cpu) = NULL; 1588 + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 1589 + *per_cpu_ptr(sdd->sd, cpu) = NULL; 1610 1590 1611 - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 1612 - *per_cpu_ptr(sdd->sgc, cpu) = NULL; 1591 + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 1592 + *per_cpu_ptr(sdd->sg, cpu) = NULL; 1593 + 1594 + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 1595 + *per_cpu_ptr(sdd->sgc, cpu) = NULL; 1596 + } 1613 1597 } 1614 1598 1615 1599 #ifdef CONFIG_NUMA ··· 1666 1642 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1667 1643 int sd_id, sd_weight, sd_flags = 0; 1668 1644 struct cpumask *sd_span; 1645 + u64 now = sched_clock(); 1669 1646 1670 - sd_weight = cpumask_weight(tl->mask(tl, cpu)); 1647 + sd_span = sched_domain_span(sd); 1648 + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); 1649 + sd_weight = cpumask_weight(sd_span); 1650 + sd_id = cpumask_first(sd_span); 1671 1651 1672 1652 if (tl->sd_flags) 1673 1653 sd_flags = (*tl->sd_flags)(); 1674 1654 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 1675 - "wrong sd_flags in topology description\n")) 1655 + "wrong sd_flags in topology description\n")) 1676 1656 sd_flags &= TOPOLOGY_SD_FLAGS; 1657 + sd_flags |= 
asym_cpu_capacity_classify(sd_span, cpu_map); 1677 1658 1678 1659 *sd = (struct sched_domain){ 1679 1660 .min_interval = sd_weight, ··· 1708 1679 .newidle_call = 512, 1709 1680 .newidle_success = 256, 1710 1681 .newidle_ratio = 512, 1682 + .newidle_stamp = now, 1711 1683 1712 1684 .max_newidle_lb_cost = 0, 1713 1685 .last_decay_max_lb_cost = jiffies, 1714 1686 .child = child, 1715 1687 .name = tl->name, 1716 1688 }; 1717 - 1718 - sd_span = sched_domain_span(sd); 1719 - cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); 1720 - sd_id = cpumask_first(sd_span); 1721 - 1722 - sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); 1723 1689 1724 1690 WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == 1725 1691 (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), ··· 1749 1725 #endif /* CONFIG_NUMA */ 1750 1726 } else { 1751 1727 sd->cache_nice_tries = 1; 1752 - } 1753 - 1754 - /* 1755 - * For all levels sharing cache; connect a sched_domain_shared 1756 - * instance. 1757 - */ 1758 - if (sd->flags & SD_SHARE_LLC) { 1759 - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); 1760 - atomic_inc(&sd->shared->ref); 1761 - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); 1762 1728 } 1763 1729 1764 1730 sd->private = sdd; ··· 2386 2372 if (!sdd->sd) 2387 2373 return -ENOMEM; 2388 2374 2389 - sdd->sds = alloc_percpu(struct sched_domain_shared *); 2390 - if (!sdd->sds) 2391 - return -ENOMEM; 2392 - 2393 2375 sdd->sg = alloc_percpu(struct sched_group *); 2394 2376 if (!sdd->sg) 2395 2377 return -ENOMEM; ··· 2396 2386 2397 2387 for_each_cpu(j, cpu_map) { 2398 2388 struct sched_domain *sd; 2399 - struct sched_domain_shared *sds; 2400 2389 struct sched_group *sg; 2401 2390 struct sched_group_capacity *sgc; 2402 2391 ··· 2405 2396 return -ENOMEM; 2406 2397 2407 2398 *per_cpu_ptr(sdd->sd, j) = sd; 2408 - 2409 - sds = kzalloc_node(sizeof(struct sched_domain_shared), 2410 - GFP_KERNEL, cpu_to_node(j)); 2411 - if (!sds) 2412 - return -ENOMEM; 2413 - 2414 - *per_cpu_ptr(sdd->sds, 
j) = sds; 2415 2399 2416 2400 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 2417 2401 GFP_KERNEL, cpu_to_node(j)); ··· 2447 2445 kfree(*per_cpu_ptr(sdd->sd, j)); 2448 2446 } 2449 2447 2450 - if (sdd->sds) 2451 - kfree(*per_cpu_ptr(sdd->sds, j)); 2452 2448 if (sdd->sg) 2453 2449 kfree(*per_cpu_ptr(sdd->sg, j)); 2454 2450 if (sdd->sgc) ··· 2454 2454 } 2455 2455 free_percpu(sdd->sd); 2456 2456 sdd->sd = NULL; 2457 - free_percpu(sdd->sds); 2458 - sdd->sds = NULL; 2459 2457 free_percpu(sdd->sg); 2460 2458 sdd->sg = NULL; 2461 2459 free_percpu(sdd->sgc); 2462 2460 sdd->sgc = NULL; 2463 2461 } 2462 + } 2463 + 2464 + static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map) 2465 + { 2466 + int j; 2467 + 2468 + d->sds = alloc_percpu(struct sched_domain_shared *); 2469 + if (!d->sds) 2470 + return -ENOMEM; 2471 + 2472 + for_each_cpu(j, cpu_map) { 2473 + struct sched_domain_shared *sds; 2474 + 2475 + sds = kzalloc_node(sizeof(struct sched_domain_shared), 2476 + GFP_KERNEL, cpu_to_node(j)); 2477 + if (!sds) 2478 + return -ENOMEM; 2479 + 2480 + *per_cpu_ptr(d->sds, j) = sds; 2481 + } 2482 + 2483 + return 0; 2484 + } 2485 + 2486 + static void __sds_free(struct s_data *d, const struct cpumask *cpu_map) 2487 + { 2488 + int j; 2489 + 2490 + if (!d->sds) 2491 + return; 2492 + 2493 + for_each_cpu(j, cpu_map) 2494 + kfree(*per_cpu_ptr(d->sds, j)); 2495 + 2496 + free_percpu(d->sds); 2497 + d->sds = NULL; 2464 2498 } 2465 2499 2466 2500 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, ··· 2583 2549 } 2584 2550 2585 2551 /* 2552 + * Calculate an allowed NUMA imbalance such that LLCs do not get 2553 + * imbalanced. 
2554 + */ 2555 + static void adjust_numa_imbalance(struct sched_domain *sd_llc) 2556 + { 2557 + struct sched_domain *parent; 2558 + unsigned int imb_span = 1; 2559 + unsigned int imb = 0; 2560 + unsigned int nr_llcs; 2561 + 2562 + WARN_ON(!(sd_llc->flags & SD_SHARE_LLC)); 2563 + WARN_ON(!sd_llc->parent); 2564 + 2565 + /* 2566 + * For a single LLC per node, allow an 2567 + * imbalance up to 12.5% of the node. This is 2568 + * arbitrary cutoff based two factors -- SMT and 2569 + * memory channels. For SMT-2, the intent is to 2570 + * avoid premature sharing of HT resources but 2571 + * SMT-4 or SMT-8 *may* benefit from a different 2572 + * cutoff. For memory channels, this is a very 2573 + * rough estimate of how many channels may be 2574 + * active and is based on recent CPUs with 2575 + * many cores. 2576 + * 2577 + * For multiple LLCs, allow an imbalance 2578 + * until multiple tasks would share an LLC 2579 + * on one node while LLCs on another node 2580 + * remain idle. This assumes that there are 2581 + * enough logical CPUs per LLC to avoid SMT 2582 + * factors and that there is a correlation 2583 + * between LLCs and memory channels. 2584 + */ 2585 + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight; 2586 + if (nr_llcs == 1) 2587 + imb = sd_llc->parent->span_weight >> 3; 2588 + else 2589 + imb = nr_llcs; 2590 + 2591 + imb = max(1U, imb); 2592 + sd_llc->parent->imb_numa_nr = imb; 2593 + 2594 + /* 2595 + * Set span based on the first NUMA domain. 2596 + * 2597 + * NUMA systems always add a NODE domain before 2598 + * iterating the NUMA domains. Since this is before 2599 + * degeneration, start from sd_llc's parent's 2600 + * parent which is the lowest an SD_NUMA domain can 2601 + * be relative to sd_llc. 2602 + */ 2603 + parent = sd_llc->parent->parent; 2604 + while (parent && !(parent->flags & SD_NUMA)) 2605 + parent = parent->parent; 2606 + 2607 + imb_span = parent ? 
parent->span_weight : sd_llc->parent->span_weight; 2608 + 2609 + /* Update the upper remainder of the topology */ 2610 + parent = sd_llc->parent; 2611 + while (parent) { 2612 + int factor = max(1U, (parent->span_weight / imb_span)); 2613 + 2614 + parent->imb_numa_nr = imb * factor; 2615 + parent = parent->parent; 2616 + } 2617 + } 2618 + 2619 + /* 2586 2620 * Build sched domains for a given set of CPUs and attach the sched domains 2587 2621 * to the individual CPUs 2588 2622 */ ··· 2707 2605 } 2708 2606 } 2709 2607 2710 - /* 2711 - * Calculate an allowed NUMA imbalance such that LLCs do not get 2712 - * imbalanced. 2713 - */ 2714 2608 for_each_cpu(i, cpu_map) { 2715 - unsigned int imb = 0; 2716 - unsigned int imb_span = 1; 2609 + sd = *per_cpu_ptr(d.sd, i); 2610 + if (!sd) 2611 + continue; 2717 2612 2718 - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 2719 - struct sched_domain *child = sd->child; 2613 + /* First, find the topmost SD_SHARE_LLC domain */ 2614 + while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) 2615 + sd = sd->parent; 2720 2616 2721 - if (!(sd->flags & SD_SHARE_LLC) && child && 2722 - (child->flags & SD_SHARE_LLC)) { 2723 - struct sched_domain __rcu *top_p; 2724 - unsigned int nr_llcs; 2617 + if (sd->flags & SD_SHARE_LLC) { 2618 + int sd_id = cpumask_first(sched_domain_span(sd)); 2725 2619 2726 - /* 2727 - * For a single LLC per node, allow an 2728 - * imbalance up to 12.5% of the node. This is 2729 - * arbitrary cutoff based two factors -- SMT and 2730 - * memory channels. For SMT-2, the intent is to 2731 - * avoid premature sharing of HT resources but 2732 - * SMT-4 or SMT-8 *may* benefit from a different 2733 - * cutoff. For memory channels, this is a very 2734 - * rough estimate of how many channels may be 2735 - * active and is based on recent CPUs with 2736 - * many cores. 
2737 - * 2738 - * For multiple LLCs, allow an imbalance 2739 - * until multiple tasks would share an LLC 2740 - * on one node while LLCs on another node 2741 - * remain idle. This assumes that there are 2742 - * enough logical CPUs per LLC to avoid SMT 2743 - * factors and that there is a correlation 2744 - * between LLCs and memory channels. 2745 - */ 2746 - nr_llcs = sd->span_weight / child->span_weight; 2747 - if (nr_llcs == 1) 2748 - imb = sd->span_weight >> 3; 2749 - else 2750 - imb = nr_llcs; 2751 - imb = max(1U, imb); 2752 - sd->imb_numa_nr = imb; 2620 + sd->shared = *per_cpu_ptr(d.sds, sd_id); 2621 + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); 2622 + atomic_inc(&sd->shared->ref); 2753 2623 2754 - /* Set span based on the first NUMA domain. */ 2755 - top_p = sd->parent; 2756 - while (top_p && !(top_p->flags & SD_NUMA)) { 2757 - top_p = top_p->parent; 2758 - } 2759 - imb_span = top_p ? top_p->span_weight : sd->span_weight; 2760 - } else { 2761 - int factor = max(1U, (sd->span_weight / imb_span)); 2762 - 2763 - sd->imb_numa_nr = imb * factor; 2764 - } 2624 + /* 2625 + * In presence of higher domains, adjust the 2626 + * NUMA imbalance stats for the hierarchy. 2627 + */ 2628 + if (IS_ENABLED(CONFIG_NUMA) && sd->parent) 2629 + adjust_numa_imbalance(sd); 2765 2630 } 2766 2631 } 2767 2632 ··· 2737 2668 if (!cpumask_test_cpu(i, cpu_map)) 2738 2669 continue; 2739 2670 2740 - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 2741 - claim_allocations(i, sd); 2671 + claim_allocations(i, &d); 2672 + 2673 + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) 2742 2674 init_sched_groups_capacity(i, sd); 2743 - } 2744 2675 } 2745 2676 2746 2677 /* Attach the domains */

Configure Feed

Configure Feed