Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduling fixes from Borislav Petkov:

- Add PREEMPT_RT maintainers

- Fix another aspect of delayed-dequeue tasks with respect to determining
their state, i.e., whether they're runnable or blocked

- Handle delayed dequeued tasks and their migration wrt PSI properly

- Fix the situation where a delayed dequeue task gets enqueued into a
new class, which should not happen

- Fix a case where memory allocation would happen while the runqueue
lock is held, which is a no-no

- Do not over-schedule when tasks with shorter slices preempt the
currently running task

- Make sure delayed-dequeue entities are properly handled before
unthrottling

- Other smaller cleanups and improvements

* tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
MAINTAINERS: Add an entry for PREEMPT_RT.
sched/fair: Fix external p->on_rq users
sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
sched/core: Dequeue PSI signals for blocked tasks that are delayed
sched: Fix delayed_dequeue vs switched_from_fair()
sched/core: Disable page allocation in task_tick_mm_cid()
sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl()
sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running
sched: Fix sched_delayed vs cfs_bandwidth

+148 -74
+8
MAINTAINERS
··· 19527 19527 F: Documentation/tools/rtla/ 19528 19528 F: tools/tracing/rtla/ 19529 19529 19530 + Real-time Linux (PREEMPT_RT) 19531 + M: Sebastian Andrzej Siewior <bigeasy@linutronix.de> 19532 + M: Clark Williams <clrkwllms@kernel.org> 19533 + M: Steven Rostedt <rostedt@goodmis.org> 19534 + L: linux-rt-devel@lists.linux.dev 19535 + S: Supported 19536 + K: PREEMPT_RT 19537 + 19530 19538 REALTEK AUDIO CODECS 19531 19539 M: Oder Chiou <oder_chiou@realtek.com> 19532 19540 S: Maintained
+5
include/linux/sched.h
··· 2133 2133 2134 2134 #endif /* CONFIG_SMP */ 2135 2135 2136 + static inline bool task_is_runnable(struct task_struct *p) 2137 + { 2138 + return p->on_rq && !p->se.sched_delayed; 2139 + } 2140 + 2136 2141 extern bool sched_task_on_rq(struct task_struct *p); 2137 2142 extern unsigned long get_wchan(struct task_struct *p); 2138 2143 extern struct task_struct *cpu_curr_snapshot(int cpu);
+4 -1
include/linux/task_work.h
··· 14 14 } 15 15 16 16 enum task_work_notify_mode { 17 - TWA_NONE, 17 + TWA_NONE = 0, 18 18 TWA_RESUME, 19 19 TWA_SIGNAL, 20 20 TWA_SIGNAL_NO_IPI, 21 21 TWA_NMI_CURRENT, 22 + 23 + TWA_FLAGS = 0xff00, 24 + TWAF_NO_ALLOC = 0x0100, 22 25 }; 23 26 24 27 static inline bool task_work_pending(struct task_struct *task)
+1 -1
kernel/events/core.c
··· 9251 9251 }, 9252 9252 }; 9253 9253 9254 - if (!sched_in && task->on_rq) { 9254 + if (!sched_in && task_is_runnable(task)) { 9255 9255 switch_event.event_id.header.misc |= 9256 9256 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; 9257 9257 }
+6 -1
kernel/freezer.c
··· 109 109 { 110 110 unsigned int state = READ_ONCE(p->__state); 111 111 112 - if (p->on_rq) 112 + /* 113 + * Allow freezing the sched_delayed tasks; they will not execute until 114 + * ttwu() fixes them up, so it is safe to swap their state now, instead 115 + * of waiting for them to get fully dequeued. 116 + */ 117 + if (task_is_runnable(p)) 113 118 return 0; 114 119 115 120 if (p != current && task_curr(p))
+9
kernel/rcu/tasks.h
··· 986 986 return false; 987 987 988 988 /* 989 + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but 990 + * since it is a spurious state (it will transition into the 991 + * traditional blocked state or get woken up without outside 992 + * dependencies), not considering it such should only affect timing. 993 + * 994 + * Be conservative for now and not include it. 995 + */ 996 + 997 + /* 989 998 * Idle tasks (or idle injection) within the idle loop are RCU-tasks 990 999 * quiescent states. But CPU boot code performed by the idle task 991 1000 * isn't a quiescent state.
+41 -24
kernel/sched/core.c
··· 548 548 * ON_RQ_MIGRATING state is used for migration without holding both 549 549 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). 550 550 * 551 + * Additionally it is possible to be ->on_rq but still be considered not 552 + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue 553 + * but will be dequeued as soon as they get picked again. See the 554 + * task_is_runnable() helper. 555 + * 551 556 * p->on_cpu <- { 0, 1 }: 552 557 * 553 558 * is set by prepare_task() and cleared by finish_task() such that it will be ··· 2017 2012 if (!(flags & ENQUEUE_NOCLOCK)) 2018 2013 update_rq_clock(rq); 2019 2014 2020 - if (!(flags & ENQUEUE_RESTORE)) { 2021 - sched_info_enqueue(rq, p); 2022 - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); 2023 - } 2024 - 2025 2015 p->sched_class->enqueue_task(rq, p, flags); 2026 2016 /* 2027 2017 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear 2028 2018 * ->sched_delayed. 2029 2019 */ 2030 2020 uclamp_rq_inc(rq, p); 2021 + 2022 + if (!(flags & ENQUEUE_RESTORE)) { 2023 + sched_info_enqueue(rq, p); 2024 + psi_enqueue(p, flags & ENQUEUE_MIGRATED); 2025 + } 2031 2026 2032 2027 if (sched_core_enabled(rq)) 2033 2028 sched_core_enqueue(rq, p); ··· 2046 2041 2047 2042 if (!(flags & DEQUEUE_SAVE)) { 2048 2043 sched_info_dequeue(rq, p); 2049 - psi_dequeue(p, flags & DEQUEUE_SLEEP); 2044 + psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); 2050 2045 } 2051 2046 2052 2047 /* ··· 4328 4323 * @arg: Argument to function. 4329 4324 * 4330 4325 * Fix the task in it's current state by avoiding wakeups and or rq operations 4331 - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() 4332 - * to work out what the state is, if required. Given that @func can be invoked 4333 - * with a runqueue lock held, it had better be quite lightweight. 4326 + * and call @func(@arg) on it. 
This function can use task_is_runnable() and 4327 + * task_curr() to work out what the state is, if required. Given that @func 4328 + * can be invoked with a runqueue lock held, it had better be quite 4329 + * lightweight. 4334 4330 * 4335 4331 * Returns: 4336 4332 * Whatever @func returns ··· 6550 6544 * as a preemption by schedule_debug() and RCU. 6551 6545 */ 6552 6546 bool preempt = sched_mode > SM_NONE; 6547 + bool block = false; 6553 6548 unsigned long *switch_count; 6554 6549 unsigned long prev_state; 6555 6550 struct rq_flags rf; ··· 6636 6629 * After this, schedule() must not care about p->state any more. 6637 6630 */ 6638 6631 block_task(rq, prev, flags); 6632 + block = true; 6639 6633 } 6640 6634 switch_count = &prev->nvcsw; 6641 6635 } ··· 6682 6674 6683 6675 migrate_disable_switch(rq, prev); 6684 6676 psi_account_irqtime(rq, prev, next); 6685 - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 6677 + psi_sched_switch(prev, next, block); 6686 6678 6687 6679 trace_sched_switch(preempt, prev, next, prev_state); 6688 6680 ··· 7025 7017 } 7026 7018 EXPORT_SYMBOL(default_wake_function); 7027 7019 7028 - void __setscheduler_prio(struct task_struct *p, int prio) 7020 + const struct sched_class *__setscheduler_class(struct task_struct *p, int prio) 7029 7021 { 7030 7022 if (dl_prio(prio)) 7031 - p->sched_class = &dl_sched_class; 7032 - else if (rt_prio(prio)) 7033 - p->sched_class = &rt_sched_class; 7034 - #ifdef CONFIG_SCHED_CLASS_EXT 7035 - else if (task_should_scx(p)) 7036 - p->sched_class = &ext_sched_class; 7037 - #endif 7038 - else 7039 - p->sched_class = &fair_sched_class; 7023 + return &dl_sched_class; 7040 7024 7041 - p->prio = prio; 7025 + if (rt_prio(prio)) 7026 + return &rt_sched_class; 7027 + 7028 + #ifdef CONFIG_SCHED_CLASS_EXT 7029 + if (task_should_scx(p)) 7030 + return &ext_sched_class; 7031 + #endif 7032 + 7033 + return &fair_sched_class; 7042 7034 } 7043 7035 7044 7036 #ifdef CONFIG_RT_MUTEXES ··· 7084 7076 { 7085 7077 int prio, 
oldprio, queued, running, queue_flag = 7086 7078 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7087 - const struct sched_class *prev_class; 7079 + const struct sched_class *prev_class, *next_class; 7088 7080 struct rq_flags rf; 7089 7081 struct rq *rq; 7090 7082 ··· 7142 7134 queue_flag &= ~DEQUEUE_MOVE; 7143 7135 7144 7136 prev_class = p->sched_class; 7137 + next_class = __setscheduler_class(p, prio); 7138 + 7139 + if (prev_class != next_class && p->se.sched_delayed) 7140 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 7141 + 7145 7142 queued = task_on_rq_queued(p); 7146 7143 running = task_current(rq, p); 7147 7144 if (queued) ··· 7184 7171 p->rt.timeout = 0; 7185 7172 } 7186 7173 7187 - __setscheduler_prio(p, prio); 7174 + p->sched_class = next_class; 7175 + p->prio = prio; 7176 + 7188 7177 check_class_changing(rq, p, prev_class); 7189 7178 7190 7179 if (queued) ··· 10480 10465 return; 10481 10466 if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 10482 10467 return; 10483 - task_work_add(curr, work, TWA_RESUME); 10468 + 10469 + /* No page allocation under rq lock */ 10470 + task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); 10484 10471 } 10485 10472 10486 10473 void sched_mm_cid_exit_signals(struct task_struct *t)
+1 -1
kernel/sched/deadline.c
··· 2385 2385 2386 2386 deadline_queue_push_tasks(rq); 2387 2387 2388 - if (hrtick_enabled(rq)) 2388 + if (hrtick_enabled_dl(rq)) 2389 2389 start_hrtick_dl(rq, &p->dl); 2390 2390 } 2391 2391
+2 -2
kernel/sched/ext.c
··· 4493 4493 4494 4494 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4495 4495 4496 - __setscheduler_prio(p, p->prio); 4496 + p->sched_class = __setscheduler_class(p, p->prio); 4497 4497 check_class_changing(task_rq(p), p, old_class); 4498 4498 4499 4499 sched_enq_and_set_task(&ctx); ··· 5204 5204 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5205 5205 5206 5206 p->scx.slice = SCX_SLICE_DFL; 5207 - __setscheduler_prio(p, p->prio); 5207 + p->sched_class = __setscheduler_class(p, p->prio); 5208 5208 check_class_changing(task_rq(p), p, old_class); 5209 5209 5210 5210 sched_enq_and_set_task(&ctx);
+7 -20
kernel/sched/fair.c
··· 1247 1247 1248 1248 account_cfs_rq_runtime(cfs_rq, delta_exec); 1249 1249 1250 - if (rq->nr_running == 1) 1250 + if (cfs_rq->nr_running == 1) 1251 1251 return; 1252 1252 1253 1253 if (resched || did_preempt_short(cfs_rq, curr)) { ··· 6058 6058 for_each_sched_entity(se) { 6059 6059 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6060 6060 6061 - if (se->on_rq) { 6062 - SCHED_WARN_ON(se->sched_delayed); 6061 + /* Handle any unfinished DELAY_DEQUEUE business first. */ 6062 + if (se->sched_delayed) { 6063 + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; 6064 + 6065 + dequeue_entity(qcfs_rq, se, flags); 6066 + } else if (se->on_rq) 6063 6067 break; 6064 - } 6065 6068 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 6066 6069 6067 6070 if (cfs_rq_is_idle(group_cfs_rq(se))) ··· 13177 13174 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13178 13175 { 13179 13176 detach_task_cfs_rq(p); 13180 - /* 13181 - * Since this is called after changing class, this is a little weird 13182 - * and we cannot use DEQUEUE_DELAYED. 13183 - */ 13184 - if (p->se.sched_delayed) { 13185 - /* First, dequeue it from its new class' structures */ 13186 - dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); 13187 - /* 13188 - * Now, clean up the fair_sched_class side of things 13189 - * related to sched_delayed being true and that wasn't done 13190 - * due to the generic dequeue not using DEQUEUE_DELAYED. 13191 - */ 13192 - finish_delayed_dequeue_entity(&p->se); 13193 - p->se.rel_deadline = 0; 13194 - __block_task(rq, p); 13195 - } 13196 13177 } 13197 13178 13198 13179 static void switched_to_fair(struct rq *rq, struct task_struct *p)
+1 -1
kernel/sched/sched.h
··· 3800 3800 3801 3801 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); 3802 3802 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); 3803 - extern void __setscheduler_prio(struct task_struct *p, int prio); 3803 + extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio); 3804 3804 extern void set_load_weight(struct task_struct *p, bool update_load); 3805 3805 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 3806 3806 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+33 -15
kernel/sched/stats.h
··· 119 119 /* 120 120 * PSI tracks state that persists across sleeps, such as iowaits and 121 121 * memory stalls. As a result, it has to distinguish between sleeps, 122 - * where a task's runnable state changes, and requeues, where a task 123 - * and its state are being moved between CPUs and runqueues. 122 + * where a task's runnable state changes, and migrations, where a task 123 + * and its runnable state are being moved between CPUs and runqueues. 124 + * 125 + * A notable case is a task whose dequeue is delayed. PSI considers 126 + * those sleeping, but because they are still on the runqueue they can 127 + * go through migration requeues. In this case, *sleeping* states need 128 + * to be transferred. 124 129 */ 125 - static inline void psi_enqueue(struct task_struct *p, bool wakeup) 130 + static inline void psi_enqueue(struct task_struct *p, bool migrate) 126 131 { 127 - int clear = 0, set = TSK_RUNNING; 132 + int clear = 0, set = 0; 128 133 129 134 if (static_branch_likely(&psi_disabled)) 130 135 return; 131 136 132 - if (p->in_memstall) 133 - set |= TSK_MEMSTALL_RUNNING; 134 - 135 - if (!wakeup) { 137 + if (p->se.sched_delayed) { 138 + /* CPU migration of "sleeping" task */ 139 + SCHED_WARN_ON(!migrate); 136 140 if (p->in_memstall) 137 141 set |= TSK_MEMSTALL; 142 + if (p->in_iowait) 143 + set |= TSK_IOWAIT; 144 + } else if (migrate) { 145 + /* CPU migration of runnable task */ 146 + set = TSK_RUNNING; 147 + if (p->in_memstall) 148 + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; 138 149 } else { 150 + /* Wakeup of new or sleeping task */ 139 151 if (p->in_iowait) 140 152 clear |= TSK_IOWAIT; 153 + set = TSK_RUNNING; 154 + if (p->in_memstall) 155 + set |= TSK_MEMSTALL_RUNNING; 141 156 } 142 157 143 158 psi_task_change(p, clear, set); 144 159 } 145 160 146 - static inline void psi_dequeue(struct task_struct *p, bool sleep) 161 + static inline void psi_dequeue(struct task_struct *p, bool migrate) 147 162 { 148 163 if (static_branch_likely(&psi_disabled)) 149 
164 return; 165 + 166 + /* 167 + * When migrating a task to another CPU, clear all psi 168 + * state. The enqueue callback above will work it out. 169 + */ 170 + if (migrate) 171 + psi_task_change(p, p->psi_flags, 0); 150 172 151 173 /* 152 174 * A voluntary sleep is a dequeue followed by a task switch. To ··· 176 154 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. 177 155 * Do nothing here. 178 156 */ 179 - if (sleep) 180 - return; 181 - 182 - psi_task_change(p, p->psi_flags, 0); 183 157 } 184 158 185 159 static inline void psi_ttwu_dequeue(struct task_struct *p) ··· 208 190 } 209 191 210 192 #else /* CONFIG_PSI */ 211 - static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} 212 - static inline void psi_dequeue(struct task_struct *p, bool sleep) {} 193 + static inline void psi_enqueue(struct task_struct *p, bool migrate) {} 194 + static inline void psi_dequeue(struct task_struct *p, bool migrate) {} 213 195 static inline void psi_ttwu_dequeue(struct task_struct *p) {} 214 196 static inline void psi_sched_switch(struct task_struct *prev, 215 197 struct task_struct *next,
+9 -4
kernel/sched/syscalls.c
··· 529 529 { 530 530 int oldpolicy = -1, policy = attr->sched_policy; 531 531 int retval, oldprio, newprio, queued, running; 532 - const struct sched_class *prev_class; 532 + const struct sched_class *prev_class, *next_class; 533 533 struct balance_callback *head; 534 534 struct rq_flags rf; 535 535 int reset_on_fork; ··· 706 706 queue_flags &= ~DEQUEUE_MOVE; 707 707 } 708 708 709 + prev_class = p->sched_class; 710 + next_class = __setscheduler_class(p, newprio); 711 + 712 + if (prev_class != next_class && p->se.sched_delayed) 713 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 714 + 709 715 queued = task_on_rq_queued(p); 710 716 running = task_current(rq, p); 711 717 if (queued) ··· 719 713 if (running) 720 714 put_prev_task(rq, p); 721 715 722 - prev_class = p->sched_class; 723 - 724 716 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 725 717 __setscheduler_params(p, attr); 726 - __setscheduler_prio(p, newprio); 718 + p->sched_class = next_class; 719 + p->prio = newprio; 727 720 } 728 721 __setscheduler_uclamp(p, attr); 729 722 check_class_changing(rq, p, prev_class);
+13 -2
kernel/task_work.c
··· 55 55 enum task_work_notify_mode notify) 56 56 { 57 57 struct callback_head *head; 58 + int flags = notify & TWA_FLAGS; 58 59 60 + notify &= ~TWA_FLAGS; 59 61 if (notify == TWA_NMI_CURRENT) { 60 62 if (WARN_ON_ONCE(task != current)) 61 63 return -EINVAL; 62 64 if (!IS_ENABLED(CONFIG_IRQ_WORK)) 63 65 return -EINVAL; 64 66 } else { 65 - /* record the work call stack in order to print it in KASAN reports */ 66 - kasan_record_aux_stack(work); 67 + /* 68 + * Record the work call stack in order to print it in KASAN 69 + * reports. 70 + * 71 + * Note that stack allocation can fail if TWAF_NO_ALLOC flag 72 + * is set and new page is needed to expand the stack buffer. 73 + */ 74 + if (flags & TWAF_NO_ALLOC) 75 + kasan_record_aux_stack_noalloc(work); 76 + else 77 + kasan_record_aux_stack(work); 67 78 } 68 79 69 80 head = READ_ONCE(task->task_works);
+6
kernel/time/tick-sched.c
··· 434 434 * smp_mb__after_spin_lock() 435 435 * tick_nohz_task_switch() 436 436 * LOAD p->tick_dep_mask 437 + * 438 + * XXX given a task picks up the dependency on schedule(), should we 439 + * only care about tasks that are currently on the CPU instead of all 440 + * that are on the runqueue? 441 + * 442 + * That is, does this want to be: task_on_cpu() / task_curr()? 437 443 */ 438 444 if (!sched_task_on_rq(tsk)) 439 445 return;
+1 -1
kernel/trace/trace_selftest.c
··· 1485 1485 /* reset the max latency */ 1486 1486 tr->max_latency = 0; 1487 1487 1488 - while (p->on_rq) { 1488 + while (task_is_runnable(p)) { 1489 1489 /* 1490 1490 * Sleep to make sure the -deadline thread is asleep too. 1491 1491 * On virtual machines we can't rely on timings,
+1 -1
virt/kvm/kvm_main.c
··· 6387 6387 6388 6388 WRITE_ONCE(vcpu->scheduled_out, true); 6389 6389 6390 - if (current->on_rq && vcpu->wants_to_run) { 6390 + if (task_is_runnable(current) && vcpu->wants_to_run) { 6391 6391 WRITE_ONCE(vcpu->preempted, true); 6392 6392 WRITE_ONCE(vcpu->ready, true); 6393 6393 }