Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+6 -8

Documentation/scheduler/sched-deadline.rst

··· 749 749 of the command line options. Please refer to rt-app documentation for more 750 750 details (`<rt-app-sources>/doc/*.json`). 751 751 752 - The second testing application is a modification of schedtool, called 753 - schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a 754 - certain pid/application. schedtool-dl is available at: 755 - https://github.com/scheduler-tools/schedtool-dl.git. 752 + The second testing application is done using chrt which has support 753 + for SCHED_DEADLINE. 756 754 757 755 The usage is straightforward:: 758 756 759 - # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app 757 + # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app 760 758 761 759 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation 762 - of 10ms every 100ms (note that parameters are expressed in microseconds). 763 - You can also use schedtool to create a reservation for an already running 760 + of 10ms every 100ms (note that parameters are expressed in nanoseconds). 761 + You can also use chrt to create a reservation for an already running 764 762 application, given that you know its pid:: 765 763 766 - # schedtool -E -t 10000000:100000000 my_app_pid 764 + # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid 767 765 768 766 Appendix B. Minimal main() 769 767 ==========================

+3 -3

drivers/cpufreq/cppc_cpufreq.c

··· 224 224 * Fake (unused) bandwidth; workaround to "fix" 225 225 * priority inheritance. 226 226 */ 227 - .sched_runtime = 1000000, 228 - .sched_deadline = 10000000, 229 - .sched_period = 10000000, 227 + .sched_runtime = NSEC_PER_MSEC, 228 + .sched_deadline = 10 * NSEC_PER_MSEC, 229 + .sched_period = 10 * NSEC_PER_MSEC, 230 230 }; 231 231 int ret; 232 232

+1 -1

fs/bcachefs/six.c

··· 335 335 */ 336 336 rcu_read_lock(); 337 337 struct task_struct *owner = READ_ONCE(lock->owner); 338 - bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); 338 + bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); 339 339 rcu_read_unlock(); 340 340 341 341 return ret;

+1 -1

fs/proc/base.c

··· 2626 2626 } 2627 2627 2628 2628 task_lock(p); 2629 - if (task_is_realtime(p)) 2629 + if (rt_or_dl_task_policy(p)) 2630 2630 slack_ns = 0; 2631 2631 else if (slack_ns == 0) 2632 2632 slack_ns = p->default_timer_slack_ns;

+1 -1

include/linux/ioprio.h

··· 40 40 { 41 41 if (task->policy == SCHED_IDLE) 42 42 return IOPRIO_CLASS_IDLE; 43 - else if (task_is_realtime(task)) 43 + else if (rt_or_dl_task_policy(task)) 44 44 return IOPRIO_CLASS_RT; 45 45 else 46 46 return IOPRIO_CLASS_BE;

+24 -4

include/linux/sched.h

··· 149 149 * Special states are those that do not use the normal wait-loop pattern. See 150 150 * the comment with set_special_state(). 151 151 */ 152 - #define is_special_task_state(state) \ 153 - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) 152 + #define is_special_task_state(state) \ 153 + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ 154 + TASK_DEAD | TASK_FROZEN)) 154 155 155 156 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 156 157 # define debug_normal_state_change(state_value) \ ··· 542 541 struct rb_node run_node; 543 542 u64 deadline; 544 543 u64 min_vruntime; 544 + u64 min_slice; 545 545 546 546 struct list_head group_node; 547 - unsigned int on_rq; 547 + unsigned char on_rq; 548 + unsigned char sched_delayed; 549 + unsigned char rel_deadline; 550 + unsigned char custom_slice; 551 + /* hole */ 548 552 549 553 u64 exec_start; 550 554 u64 sum_exec_runtime; ··· 645 639 * 646 640 * @dl_overrun tells if the task asked to be informed about runtime 647 641 * overruns. 642 + * 643 + * @dl_server tells if this is a server entity. 644 + * 645 + * @dl_defer tells if this is a deferred or regular server. For 646 + * now only defer server exists. 647 + * 648 + * @dl_defer_armed tells if the deferrable server is waiting 649 + * for the replenishment timer to activate it. 650 + * 651 + * @dl_defer_running tells if the deferrable server is actually 652 + * running, skipping the defer phase. 648 653 */ 649 654 unsigned int dl_throttled : 1; 650 655 unsigned int dl_yielded : 1; 651 656 unsigned int dl_non_contending : 1; 652 657 unsigned int dl_overrun : 1; 653 658 unsigned int dl_server : 1; 659 + unsigned int dl_defer : 1; 660 + unsigned int dl_defer_armed : 1; 661 + unsigned int dl_defer_running : 1; 654 662 655 663 /* 656 664 * Bandwidth enforcement timer. 
Each -deadline task has its ··· 692 672 */ 693 673 struct rq *rq; 694 674 dl_server_has_tasks_f server_has_tasks; 695 - dl_server_pick_f server_pick; 675 + dl_server_pick_f server_pick_task; 696 676 697 677 #ifdef CONFIG_RT_MUTEXES 698 678 /*

+7 -7

include/linux/sched/deadline.h

··· 10 10 11 11 #include <linux/sched.h> 12 12 13 - #define MAX_DL_PRIO 0 14 - 15 - static inline int dl_prio(int prio) 13 + static inline bool dl_prio(int prio) 16 14 { 17 - if (unlikely(prio < MAX_DL_PRIO)) 18 - return 1; 19 - return 0; 15 + return unlikely(prio < MAX_DL_PRIO); 20 16 } 21 17 22 - static inline int dl_task(struct task_struct *p) 18 + /* 19 + * Returns true if a task has a priority that belongs to DL class. PI-boosted 20 + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. 21 + */ 22 + static inline bool dl_task(struct task_struct *p) 23 23 { 24 24 return dl_prio(p->prio); 25 25 }

+1

include/linux/sched/prio.h

··· 14 14 */ 15 15 16 16 #define MAX_RT_PRIO 100 17 + #define MAX_DL_PRIO 0 17 18 18 19 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) 19 20 #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)

+27 -6

include/linux/sched/rt.h

··· 6 6 7 7 struct task_struct; 8 8 9 - static inline int rt_prio(int prio) 9 + static inline bool rt_prio(int prio) 10 10 { 11 - if (unlikely(prio < MAX_RT_PRIO)) 12 - return 1; 13 - return 0; 11 + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); 14 12 } 15 13 16 - static inline int rt_task(struct task_struct *p) 14 + static inline bool rt_or_dl_prio(int prio) 15 + { 16 + return unlikely(prio < MAX_RT_PRIO); 17 + } 18 + 19 + /* 20 + * Returns true if a task has a priority that belongs to RT class. PI-boosted 21 + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. 22 + */ 23 + static inline bool rt_task(struct task_struct *p) 17 24 { 18 25 return rt_prio(p->prio); 19 26 } 20 27 21 - static inline bool task_is_realtime(struct task_struct *tsk) 28 + /* 29 + * Returns true if a task has a priority that belongs to RT or DL classes. 30 + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore 31 + * PI-boosted tasks. 32 + */ 33 + static inline bool rt_or_dl_task(struct task_struct *p) 34 + { 35 + return rt_or_dl_prio(p->prio); 36 + } 37 + 38 + /* 39 + * Returns true if a task has a policy that belongs to RT or DL classes. 40 + * PI-boosted tasks will return false. 41 + */ 42 + static inline bool rt_or_dl_task_policy(struct task_struct *tsk) 22 43 { 23 44 int policy = tsk->policy; 24 45

+3 -3

include/uapi/linux/sched/types.h

··· 58 58 * 59 59 * This is reflected by the following fields of the sched_attr structure: 60 60 * 61 - * @sched_deadline representative of the task's deadline 62 - * @sched_runtime representative of the task's runtime 63 - * @sched_period representative of the task's period 61 + * @sched_deadline representative of the task's deadline in nanoseconds 62 + * @sched_runtime representative of the task's runtime in nanoseconds 63 + * @sched_period representative of the task's period in nanoseconds 64 64 * 65 65 * Given this task model, there are a multiplicity of scheduling algorithms 66 66 * and policies, that can be used to ensure all the tasks will make their

+1 -1

kernel/freezer.c

··· 72 72 bool freeze; 73 73 74 74 raw_spin_lock_irq(&current->pi_lock); 75 - set_current_state(TASK_FROZEN); 75 + WRITE_ONCE(current->__state, TASK_FROZEN); 76 76 /* unstale saved_state so that __thaw_task() will wake us up */ 77 77 current->saved_state = TASK_RUNNING; 78 78 raw_spin_unlock_irq(&current->pi_lock);

+9 -1

kernel/kthread.c

··· 845 845 * event only cares about the address. 846 846 */ 847 847 trace_sched_kthread_work_execute_end(work, func); 848 - } else if (!freezing(current)) 848 + } else if (!freezing(current)) { 849 849 schedule(); 850 + } else { 851 + /* 852 + * Handle the case where the current remains 853 + * TASK_INTERRUPTIBLE. try_to_freeze() expects 854 + * the current to be TASK_RUNNING. 855 + */ 856 + __set_current_state(TASK_RUNNING); 857 + } 850 858 851 859 try_to_freeze(); 852 860 cond_resched();

+2 -2

kernel/locking/rtmutex.c

··· 347 347 { 348 348 int prio = task->prio; 349 349 350 - if (!rt_prio(prio)) 350 + if (!rt_or_dl_prio(prio)) 351 351 return DEFAULT_PRIO; 352 352 353 353 return prio; ··· 435 435 * Note that RT tasks are excluded from same priority (lateral) 436 436 * steals to prevent the introduction of an unbounded latency. 437 437 */ 438 - if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) 438 + if (rt_or_dl_prio(waiter->tree.prio)) 439 439 return false; 440 440 441 441 return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);

+2 -2

kernel/locking/rwsem.c

··· 631 631 * if it is an RT task or wait in the wait queue 632 632 * for too long. 633 633 */ 634 - if (has_handoff || (!rt_task(waiter->task) && 634 + if (has_handoff || (!rt_or_dl_task(waiter->task) && 635 635 !time_after(jiffies, waiter->timeout))) 636 636 return false; 637 637 ··· 914 914 if (owner_state != OWNER_WRITER) { 915 915 if (need_resched()) 916 916 break; 917 - if (rt_task(current) && 917 + if (rt_or_dl_task(current) && 918 918 (prev_owner_state != OWNER_WRITER)) 919 919 break; 920 920 }

+1 -1

kernel/locking/ww_mutex.h

··· 237 237 int a_prio = a->task->prio; 238 238 int b_prio = b->task->prio; 239 239 240 - if (rt_prio(a_prio) || rt_prio(b_prio)) { 240 + if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { 241 241 242 242 if (a_prio > b_prio) 243 243 return true;

+166 -82

kernel/sched/core.c

··· 163 163 if (p->sched_class == &stop_sched_class) /* trumps deadline */ 164 164 return -2; 165 165 166 - if (rt_prio(p->prio)) /* includes deadline */ 166 + if (p->dl_server) 167 + return -1; /* deadline */ 168 + 169 + if (rt_or_dl_prio(p->prio)) 167 170 return p->prio; /* [-1, 99] */ 168 171 169 172 if (p->sched_class == &idle_sched_class) ··· 195 192 if (-pb < -pa) 196 193 return false; 197 194 198 - if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ 199 - return !dl_time_before(a->dl.deadline, b->dl.deadline); 195 + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ 196 + const struct sched_dl_entity *a_dl, *b_dl; 197 + 198 + a_dl = &a->dl; 199 + /* 200 + * Since,'a' and 'b' can be CFS tasks served by DL server, 201 + * __task_prio() can return -1 (for DL) even for those. In that 202 + * case, get to the dl_server's DL entity. 203 + */ 204 + if (a->dl_server) 205 + a_dl = a->dl_server; 206 + 207 + b_dl = &b->dl; 208 + if (b->dl_server) 209 + b_dl = b->dl_server; 210 + 211 + return !dl_time_before(a_dl->deadline, b_dl->deadline); 212 + } 200 213 201 214 if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ 202 215 return cfs_prio_less(a, b, in_fi); ··· 259 240 260 241 void sched_core_enqueue(struct rq *rq, struct task_struct *p) 261 242 { 243 + if (p->se.sched_delayed) 244 + return; 245 + 262 246 rq->core->core_task_seq++; 263 247 264 248 if (!p->core_cookie) ··· 272 250 273 251 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) 274 252 { 253 + if (p->se.sched_delayed) 254 + return; 255 + 275 256 rq->core->core_task_seq++; 276 257 277 258 if (sched_core_enqueued(p)) { ··· 1294 1269 * dequeued by migrating while the constrained task continues to run. 1295 1270 * E.g. going from 2->1 without going through pick_next_task(). 
1296 1271 */ 1297 - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { 1272 + if (__need_bw_check(rq, rq->curr)) { 1298 1273 if (cfs_task_bw_constrained(rq->curr)) 1299 1274 return false; 1300 1275 } ··· 1697 1672 if (unlikely(!p->sched_class->uclamp_enabled)) 1698 1673 return; 1699 1674 1675 + if (p->se.sched_delayed) 1676 + return; 1677 + 1700 1678 for_each_clamp_id(clamp_id) 1701 1679 uclamp_rq_inc_id(rq, p, clamp_id); 1702 1680 ··· 1722 1694 return; 1723 1695 1724 1696 if (unlikely(!p->sched_class->uclamp_enabled)) 1697 + return; 1698 + 1699 + if (p->se.sched_delayed) 1725 1700 return; 1726 1701 1727 1702 for_each_clamp_id(clamp_id) ··· 2006 1975 psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); 2007 1976 } 2008 1977 2009 - uclamp_rq_inc(rq, p); 2010 1978 p->sched_class->enqueue_task(rq, p, flags); 1979 + /* 1980 + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear 1981 + * ->sched_delayed. 1982 + */ 1983 + uclamp_rq_inc(rq, p); 2011 1984 2012 1985 if (sched_core_enabled(rq)) 2013 1986 sched_core_enqueue(rq, p); 2014 1987 } 2015 1988 2016 - void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1989 + /* 1990 + * Must only return false when DEQUEUE_SLEEP. 1991 + */ 1992 + inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) 2017 1993 { 2018 1994 if (sched_core_enabled(rq)) 2019 1995 sched_core_dequeue(rq, p, flags); ··· 2033 1995 psi_dequeue(p, flags & DEQUEUE_SLEEP); 2034 1996 } 2035 1997 1998 + /* 1999 + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' 2000 + * and mark the task ->sched_delayed. 
2001 + */ 2036 2002 uclamp_rq_dec(rq, p); 2037 - p->sched_class->dequeue_task(rq, p, flags); 2003 + return p->sched_class->dequeue_task(rq, p, flags); 2038 2004 } 2039 2005 2040 2006 void activate_task(struct rq *rq, struct task_struct *p, int flags) ··· 2056 2014 2057 2015 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 2058 2016 { 2059 - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); 2017 + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); 2018 + 2019 + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); 2060 2020 ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2061 2021 2022 + /* 2023 + * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before* 2024 + * dequeue_task() and cleared *after* enqueue_task(). 2025 + */ 2026 + 2062 2027 dequeue_task(rq, p, flags); 2028 + } 2029 + 2030 + static void block_task(struct rq *rq, struct task_struct *p, int flags) 2031 + { 2032 + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) 2033 + __block_task(rq, p); 2063 2034 } 2064 2035 2065 2036 /** ··· 2288 2233 struct task_struct *p = current; 2289 2234 2290 2235 if (p->migration_disabled) { 2236 + #ifdef CONFIG_DEBUG_PREEMPT 2237 + /* 2238 + * Warn about overflow half-way through the range. 2239 + */ 2240 + WARN_ON_ONCE((s16)p->migration_disabled < 0); 2241 + #endif 2291 2242 p->migration_disabled++; 2292 2243 return; 2293 2244 } ··· 2312 2251 .flags = SCA_MIGRATE_ENABLE, 2313 2252 }; 2314 2253 2254 + #ifdef CONFIG_DEBUG_PREEMPT 2255 + /* 2256 + * Check both overflow from migrate_disable() and superfluous 2257 + * migrate_enable().
2258 + */ 2259 + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) 2260 + return; 2261 + #endif 2262 + 2315 2263 if (p->migration_disabled > 1) { 2316 2264 p->migration_disabled--; 2317 2265 return; 2318 2266 } 2319 - 2320 - if (WARN_ON_ONCE(!p->migration_disabled)) 2321 - return; 2322 2267 2323 2268 /* 2324 2269 * Ensure stop_task runs either before or after this, and that ··· 3674 3607 rq->idle_stamp = 0; 3675 3608 } 3676 3609 #endif 3677 - 3678 - p->dl_server = NULL; 3679 3610 } 3680 3611 3681 3612 /* ··· 3709 3644 3710 3645 rq = __task_rq_lock(p, &rf); 3711 3646 if (task_on_rq_queued(p)) { 3647 + update_rq_clock(rq); 3648 + if (p->se.sched_delayed) 3649 + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); 3712 3650 if (!task_on_cpu(rq, p)) { 3713 3651 /* 3714 3652 * When on_rq && !on_cpu the task is preempted, see if 3715 3653 * it should preempt the task that is current now. 3716 3654 */ 3717 - update_rq_clock(rq); 3718 3655 wakeup_preempt(rq, p, wake_flags); 3719 3656 } 3720 3657 ttwu_do_wakeup(p); ··· 4096 4029 * case the whole 'p->on_rq && ttwu_runnable()' case below 4097 4030 * without taking any locks. 4098 4031 * 4032 + * Specifically, given current runs ttwu() we must be before 4033 + * schedule()'s block_task(), as such this must not observe 4034 + * sched_delayed. 4035 + * 4099 4036 * In particular: 4100 4037 * - we rely on Program-Order guarantees for all the ordering, 4101 4038 * - we're serialized against set_special_state() by virtue of 4102 4039 * it disabling IRQs (this allows not taking ->pi_lock). 4103 4040 */ 4041 + SCHED_WARN_ON(p->se.sched_delayed); 4104 4042 if (!ttwu_state_match(p, state, &success)) 4105 4043 goto out; 4106 4044 ··· 4394 4322 p->se.nr_migrations = 0; 4395 4323 p->se.vruntime = 0; 4396 4324 p->se.vlag = 0; 4397 - p->se.slice = sysctl_sched_base_slice; 4398 4325 INIT_LIST_HEAD(&p->se.group_node); 4326 + 4327 + /* A delayed task cannot be in clone(). 
*/ 4328 + SCHED_WARN_ON(p->se.sched_delayed); 4399 4329 4400 4330 #ifdef CONFIG_FAIR_GROUP_SCHED 4401 4331 p->se.cfs_rq = NULL; ··· 4646 4572 4647 4573 p->prio = p->normal_prio = p->static_prio; 4648 4574 set_load_weight(p, false); 4575 + p->se.custom_slice = 0; 4576 + p->se.slice = sysctl_sched_base_slice; 4649 4577 4650 4578 /* 4651 4579 * We don't need the reset flag anymore after the fork. It has ··· 4762 4686 update_rq_clock(rq); 4763 4687 post_init_entity_util_avg(p); 4764 4688 4765 - activate_task(rq, p, ENQUEUE_NOCLOCK); 4689 + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); 4766 4690 trace_sched_wakeup_new(p); 4767 4691 wakeup_preempt(rq, p, WF_FORK); 4768 4692 #ifdef CONFIG_SMP ··· 5845 5769 schedstat_inc(this_rq()->sched_count); 5846 5770 } 5847 5771 5848 - static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, 5849 - struct rq_flags *rf) 5772 + static void prev_balance(struct rq *rq, struct task_struct *prev, 5773 + struct rq_flags *rf) 5850 5774 { 5851 5775 #ifdef CONFIG_SMP 5852 5776 const struct sched_class *class; ··· 5863 5787 break; 5864 5788 } 5865 5789 #endif 5866 - 5867 - put_prev_task(rq, prev); 5868 5790 } 5869 5791 5870 5792 /* ··· 5873 5799 { 5874 5800 const struct sched_class *class; 5875 5801 struct task_struct *p; 5802 + 5803 + rq->dl_server = NULL; 5876 5804 5877 5805 /* 5878 5806 * Optimization: we know that if all tasks are in the fair class we can ··· 5891 5815 5892 5816 /* Assume the next prioritized class is idle_sched_class */ 5893 5817 if (!p) { 5894 - put_prev_task(rq, prev); 5895 - p = pick_next_task_idle(rq); 5818 + p = pick_task_idle(rq); 5819 + put_prev_set_next_task(rq, prev, p); 5896 5820 } 5897 - 5898 - /* 5899 - * This is the fast path; it cannot be a DL server pick; 5900 - * therefore even if @p == @prev, ->dl_server must be NULL. 
5901 - */ 5902 - if (p->dl_server) 5903 - p->dl_server = NULL; 5904 5821 5905 5822 return p; 5906 5823 } 5907 5824 5908 5825 restart: 5909 - put_prev_task_balance(rq, prev, rf); 5910 - 5911 - /* 5912 - * We've updated @prev and no longer need the server link, clear it. 5913 - * Must be done before ->pick_next_task() because that can (re)set 5914 - * ->dl_server. 5915 - */ 5916 - if (prev->dl_server) 5917 - prev->dl_server = NULL; 5826 + prev_balance(rq, prev, rf); 5918 5827 5919 5828 for_each_class(class) { 5920 - p = class->pick_next_task(rq); 5921 - if (p) 5922 - return p; 5829 + if (class->pick_next_task) { 5830 + p = class->pick_next_task(rq, prev); 5831 + if (p) 5832 + return p; 5833 + } else { 5834 + p = class->pick_task(rq); 5835 + if (p) { 5836 + put_prev_set_next_task(rq, prev, p); 5837 + return p; 5838 + } 5839 + } 5923 5840 } 5924 5841 5925 5842 BUG(); /* The idle class should always have a runnable task. */ ··· 5941 5872 { 5942 5873 const struct sched_class *class; 5943 5874 struct task_struct *p; 5875 + 5876 + rq->dl_server = NULL; 5944 5877 5945 5878 for_each_class(class) { 5946 5879 p = class->pick_task(rq); ··· 5982 5911 * another cpu during offline. 
5983 5912 */ 5984 5913 rq->core_pick = NULL; 5914 + rq->core_dl_server = NULL; 5985 5915 return __pick_next_task(rq, prev, rf); 5986 5916 } 5987 5917 ··· 6001 5929 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); 6002 5930 6003 5931 next = rq->core_pick; 6004 - if (next != prev) { 6005 - put_prev_task(rq, prev); 6006 - set_next_task(rq, next); 6007 - } 6008 - 5932 + rq->dl_server = rq->core_dl_server; 6009 5933 rq->core_pick = NULL; 6010 - goto out; 5934 + rq->core_dl_server = NULL; 5935 + goto out_set_next; 6011 5936 } 6012 5937 6013 - put_prev_task_balance(rq, prev, rf); 5938 + prev_balance(rq, prev, rf); 6014 5939 6015 5940 smt_mask = cpu_smt_mask(cpu); 6016 5941 need_sync = !!rq->core->core_cookie; ··· 6048 5979 next = pick_task(rq); 6049 5980 if (!next->core_cookie) { 6050 5981 rq->core_pick = NULL; 5982 + rq->core_dl_server = NULL; 6051 5983 /* 6052 5984 * For robustness, update the min_vruntime_fi for 6053 5985 * unconstrained picks as well. ··· 6076 6006 if (i != cpu && (rq_i != rq->core || !core_clock_updated)) 6077 6007 update_rq_clock(rq_i); 6078 6008 6079 - p = rq_i->core_pick = pick_task(rq_i); 6009 + rq_i->core_pick = p = pick_task(rq_i); 6010 + rq_i->core_dl_server = rq_i->dl_server; 6011 + 6080 6012 if (!max || prio_less(max, p, fi_before)) 6081 6013 max = p; 6082 6014 } ··· 6102 6030 } 6103 6031 6104 6032 rq_i->core_pick = p; 6033 + rq_i->core_dl_server = NULL; 6105 6034 6106 6035 if (p == rq_i->idle) { 6107 6036 if (rq_i->nr_running) { ··· 6163 6090 6164 6091 if (i == cpu) { 6165 6092 rq_i->core_pick = NULL; 6093 + rq_i->core_dl_server = NULL; 6166 6094 continue; 6167 6095 } 6168 6096 ··· 6172 6098 6173 6099 if (rq_i->curr == rq_i->core_pick) { 6174 6100 rq_i->core_pick = NULL; 6101 + rq_i->core_dl_server = NULL; 6175 6102 continue; 6176 6103 } 6177 6104 ··· 6180 6105 } 6181 6106 6182 6107 out_set_next: 6183 - set_next_task(rq, next); 6184 - out: 6108 + put_prev_set_next_task(rq, prev, next); 6185 6109 if (rq->core->core_forceidle_count && 
next == rq->idle) 6186 6110 queue_core_balance(rq); 6187 6111 ··· 6416 6342 * Constants for the sched_mode argument of __schedule(). 6417 6343 * 6418 6344 * The mode argument allows RT enabled kernels to differentiate a 6419 - * preemption from blocking on an 'sleeping' spin/rwlock. Note that 6420 - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to 6421 - * optimize the AND operation out and just check for zero. 6345 + * preemption from blocking on an 'sleeping' spin/rwlock. 6422 6346 */ 6423 - #define SM_NONE 0x0 6424 - #define SM_PREEMPT 0x1 6425 - #define SM_RTLOCK_WAIT 0x2 6426 - 6427 - #ifndef CONFIG_PREEMPT_RT 6428 - # define SM_MASK_PREEMPT (~0U) 6429 - #else 6430 - # define SM_MASK_PREEMPT SM_PREEMPT 6431 - #endif 6347 + #define SM_IDLE (-1) 6348 + #define SM_NONE 0 6349 + #define SM_PREEMPT 1 6350 + #define SM_RTLOCK_WAIT 2 6432 6351 6433 6352 /* 6434 6353 * __schedule() is the main scheduler function. ··· 6462 6395 * 6463 6396 * WARNING: must be called with preemption disabled! 6464 6397 */ 6465 - static void __sched notrace __schedule(unsigned int sched_mode) 6398 + static void __sched notrace __schedule(int sched_mode) 6466 6399 { 6467 6400 struct task_struct *prev, *next; 6401 + /* 6402 + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted 6403 + * as a preemption by schedule_debug() and RCU. 
6404 + */ 6405 + bool preempt = sched_mode > SM_NONE; 6468 6406 unsigned long *switch_count; 6469 6407 unsigned long prev_state; 6470 6408 struct rq_flags rf; ··· 6480 6408 rq = cpu_rq(cpu); 6481 6409 prev = rq->curr; 6482 6410 6483 - schedule_debug(prev, !!sched_mode); 6411 + schedule_debug(prev, preempt); 6484 6412 6485 6413 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) 6486 6414 hrtick_clear(rq); 6487 6415 6488 6416 local_irq_disable(); 6489 - rcu_note_context_switch(!!sched_mode); 6417 + rcu_note_context_switch(preempt); 6490 6418 6491 6419 /* 6492 6420 * Make sure that signal_pending_state()->signal_pending() below ··· 6515 6443 6516 6444 switch_count = &prev->nivcsw; 6517 6445 6446 + /* Task state changes only considers SM_PREEMPT as preemption */ 6447 + preempt = sched_mode == SM_PREEMPT; 6448 + 6518 6449 /* 6519 6450 * We must load prev->state once (task_struct::state is volatile), such 6520 6451 * that we form a control dependency vs deactivate_task() below. 6521 6452 */ 6522 6453 prev_state = READ_ONCE(prev->__state); 6523 - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { 6454 + if (sched_mode == SM_IDLE) { 6455 + if (!rq->nr_running) { 6456 + next = prev; 6457 + goto picked; 6458 + } 6459 + } else if (!preempt && prev_state) { 6524 6460 if (signal_pending_state(prev_state, prev)) { 6525 6461 WRITE_ONCE(prev->__state, TASK_RUNNING); 6526 6462 } else { 6463 + int flags = DEQUEUE_NOCLOCK; 6464 + 6527 6465 prev->sched_contributes_to_load = 6528 6466 (prev_state & TASK_UNINTERRUPTIBLE) && 6529 6467 !(prev_state & TASK_NOLOAD) && 6530 6468 !(prev_state & TASK_FROZEN); 6531 6469 6532 - if (prev->sched_contributes_to_load) 6533 - rq->nr_uninterruptible++; 6470 + if (unlikely(is_special_task_state(prev_state))) 6471 + flags |= DEQUEUE_SPECIAL; 6534 6472 6535 6473 /* 6536 6474 * __schedule() ttwu() ··· 6553 6471 * 6554 6472 * After this, schedule() must not care about p->state any more. 
6555 6473 */ 6556 - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 6557 - 6558 - if (prev->in_iowait) { 6559 - atomic_inc(&rq->nr_iowait); 6560 - delayacct_blkio_start(); 6561 - } 6474 + block_task(rq, prev, flags); 6562 6475 } 6563 6476 switch_count = &prev->nvcsw; 6564 6477 } 6565 6478 6566 6479 next = pick_next_task(rq, prev, &rf); 6480 + picked: 6567 6481 clear_tsk_need_resched(prev); 6568 6482 clear_preempt_need_resched(); 6569 6483 #ifdef CONFIG_SCHED_DEBUG ··· 6601 6523 psi_account_irqtime(rq, prev, next); 6602 6524 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 6603 6525 6604 - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); 6526 + trace_sched_switch(preempt, prev, next, prev_state); 6605 6527 6606 6528 /* Also unlocks the rq: */ 6607 6529 rq = context_switch(rq, prev, next, &rf); ··· 6677 6599 } 6678 6600 } 6679 6601 6680 - static __always_inline void __schedule_loop(unsigned int sched_mode) 6602 + static __always_inline void __schedule_loop(int sched_mode) 6681 6603 { 6682 6604 do { 6683 6605 preempt_disable(); ··· 6722 6644 */ 6723 6645 WARN_ON_ONCE(current->__state); 6724 6646 do { 6725 - __schedule(SM_NONE); 6647 + __schedule(SM_IDLE); 6726 6648 } while (need_resched()); 6727 6649 } 6728 6650 ··· 8306 8228 #endif /* CONFIG_RT_GROUP_SCHED */ 8307 8229 } 8308 8230 8309 - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); 8310 - 8311 8231 #ifdef CONFIG_SMP 8312 8232 init_defrootdomain(); 8313 8233 #endif ··· 8360 8284 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8361 8285 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8362 8286 8363 - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8364 8287 #ifdef CONFIG_RT_GROUP_SCHED 8288 + /* 8289 + * This is required for init cpu because rt.c:__enable_runtime() 8290 + * starts working after scheduler_running, which is not the case 8291 + * yet. 
8292 + */ 8293 + rq->rt.rt_runtime = global_rt_runtime(); 8365 8294 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 8366 8295 #endif 8367 8296 #ifdef CONFIG_SMP ··· 8398 8317 #endif /* CONFIG_SMP */ 8399 8318 hrtick_rq_init(rq); 8400 8319 atomic_set(&rq->nr_iowait, 0); 8320 + fair_server_init(rq); 8401 8321 8402 8322 #ifdef CONFIG_SCHED_CORE 8403 8323 rq->core = rq; 8404 8324 rq->core_pick = NULL; 8325 + rq->core_dl_server = NULL; 8405 8326 rq->core_enabled = 0; 8406 8327 rq->core_tree = RB_ROOT; 8407 8328 rq->core_forceidle_count = 0; ··· 8416 8333 } 8417 8334 8418 8335 set_load_weight(&init_task, false); 8336 + init_task.se.slice = sysctl_sched_base_slice, 8419 8337 8420 8338 /* 8421 8339 * The boot idle thread does lazy MMU switching as well: ··· 8632 8548 schedstat_set(p->stats.sleep_start, 0); 8633 8549 schedstat_set(p->stats.block_start, 0); 8634 8550 8635 - if (!dl_task(p) && !rt_task(p)) { 8551 + if (!rt_or_dl_task(p)) { 8636 8552 /* 8637 8553 * Renice negative nice level userspace 8638 8554 * tasks back to 0:

+3 -3

kernel/sched/cpufreq_schedutil.c

··· 654 654 * Fake (unused) bandwidth; workaround to "fix" 655 655 * priority inheritance. 656 656 */ 657 - .sched_runtime = 1000000, 658 - .sched_deadline = 10000000, 659 - .sched_period = 10000000, 657 + .sched_runtime = NSEC_PER_MSEC, 658 + .sched_deadline = 10 * NSEC_PER_MSEC, 659 + .sched_period = 10 * NSEC_PER_MSEC, 660 660 }; 661 661 struct cpufreq_policy *policy = sg_policy->policy; 662 662 int ret;

+403 -100

kernel/sched/deadline.c

··· 320 320 __sub_running_bw(dl_se->dl_bw, dl_rq); 321 321 } 322 322 323 - static void dl_change_utilization(struct task_struct *p, u64 new_bw) 323 + static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) 324 324 { 325 - struct rq *rq; 325 + if (dl_se->dl_non_contending) { 326 + sub_running_bw(dl_se, &rq->dl); 327 + dl_se->dl_non_contending = 0; 326 328 327 - WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); 328 - 329 - if (task_on_rq_queued(p)) 330 - return; 331 - 332 - rq = task_rq(p); 333 - if (p->dl.dl_non_contending) { 334 - sub_running_bw(&p->dl, &rq->dl); 335 - p->dl.dl_non_contending = 0; 336 329 /* 337 330 * If the timer handler is currently running and the 338 331 * timer cannot be canceled, inactive_task_timer() ··· 333 340 * will not touch the rq's active utilization, 334 341 * so we are still safe. 335 342 */ 336 - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) 337 - put_task_struct(p); 343 + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { 344 + if (!dl_server(dl_se)) 345 + put_task_struct(dl_task_of(dl_se)); 346 + } 338 347 } 339 - __sub_rq_bw(p->dl.dl_bw, &rq->dl); 348 + __sub_rq_bw(dl_se->dl_bw, &rq->dl); 340 349 __add_rq_bw(new_bw, &rq->dl); 350 + } 351 + 352 + static void dl_change_utilization(struct task_struct *p, u64 new_bw) 353 + { 354 + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); 355 + 356 + if (task_on_rq_queued(p)) 357 + return; 358 + 359 + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); 341 360 } 342 361 343 362 static void __dl_clear_params(struct sched_dl_entity *dl_se); ··· 776 771 /* for non-boosted task, pi_of(dl_se) == dl_se */ 777 772 dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 778 773 dl_se->runtime = pi_of(dl_se)->dl_runtime; 774 + 775 + /* 776 + * If it is a deferred reservation, and the server 777 + * is not handling an starvation case, defer it. 
778 + */ 779 + if (dl_se->dl_defer & !dl_se->dl_defer_running) { 780 + dl_se->dl_throttled = 1; 781 + dl_se->dl_defer_armed = 1; 782 + } 779 783 } 780 784 781 785 /* ··· 823 809 replenish_dl_new_period(dl_se, rq); 824 810 } 825 811 812 + static int start_dl_timer(struct sched_dl_entity *dl_se); 813 + static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); 814 + 826 815 /* 827 816 * Pure Earliest Deadline First (EDF) scheduling does not deal with the 828 817 * possibility of an entity lasting more than what it declared, and thus ··· 854 837 /* 855 838 * This could be the case for a !-dl task that is boosted. 856 839 * Just go with full inherited parameters. 840 + * 841 + * Or, it could be the case of a deferred reservation that 842 + * was not able to consume its runtime in background and 843 + * reached this point with current u > U. 844 + * 845 + * In both cases, set a new period. 857 846 */ 858 - if (dl_se->dl_deadline == 0) 859 - replenish_dl_new_period(dl_se, rq); 847 + if (dl_se->dl_deadline == 0 || 848 + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { 849 + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 850 + dl_se->runtime = pi_of(dl_se)->dl_runtime; 851 + } 860 852 861 853 if (dl_se->dl_yielded && dl_se->runtime > 0) 862 854 dl_se->runtime = 0; ··· 899 873 dl_se->dl_yielded = 0; 900 874 if (dl_se->dl_throttled) 901 875 dl_se->dl_throttled = 0; 876 + 877 + /* 878 + * If this is the replenishment of a deferred reservation, 879 + * clear the flag and return. 880 + */ 881 + if (dl_se->dl_defer_armed) { 882 + dl_se->dl_defer_armed = 0; 883 + return; 884 + } 885 + 886 + /* 887 + * At this point, if the deferred server is not armed, and the deadline 888 + * is in the future, if it is not running already, throttle the server 889 + * and arm the defer timer.
890 + */ 891 + if (dl_se->dl_defer && !dl_se->dl_defer_running && 892 + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { 893 + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { 894 + 895 + /* 896 + * Set dl_se->dl_defer_armed and dl_throttled variables to 897 + * inform the start_dl_timer() that this is a deferred 898 + * activation. 899 + */ 900 + dl_se->dl_defer_armed = 1; 901 + dl_se->dl_throttled = 1; 902 + if (!start_dl_timer(dl_se)) { 903 + /* 904 + * If for whatever reason (delays), a previous timer was 905 + * queued but not serviced, cancel it and clean the 906 + * deferrable server variables intended for start_dl_timer(). 907 + */ 908 + hrtimer_try_to_cancel(&dl_se->dl_timer); 909 + dl_se->dl_defer_armed = 0; 910 + dl_se->dl_throttled = 0; 911 + } 912 + } 913 + } 902 914 } 903 915 904 916 /* ··· 1087 1023 } 1088 1024 1089 1025 replenish_dl_new_period(dl_se, rq); 1026 + } else if (dl_server(dl_se) && dl_se->dl_defer) { 1027 + /* 1028 + * The server can still use its previous deadline, so check if 1029 + * it left the dl_defer_running state. 1030 + */ 1031 + if (!dl_se->dl_defer_running) { 1032 + dl_se->dl_defer_armed = 1; 1033 + dl_se->dl_throttled = 1; 1034 + } 1090 1035 } 1091 1036 } 1092 1037 ··· 1128 1055 * We want the timer to fire at the deadline, but considering 1129 1056 * that it is actually coming from rq->clock and not from 1130 1057 * hrtimer's time base reading. 1058 + * 1059 + * The deferred reservation will have its timer set to 1060 + * (deadline - runtime). At that point, the CBS rule will decide 1061 + * if the current deadline can be used, or if a replenishment is 1062 + * required to avoid add too much pressure on the system 1063 + * (current u > U). 
1131 1064 */ 1132 - act = ns_to_ktime(dl_next_period(dl_se)); 1065 + if (dl_se->dl_defer_armed) { 1066 + WARN_ON_ONCE(!dl_se->dl_throttled); 1067 + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); 1068 + } else { 1069 + /* act = deadline - rel-deadline + period */ 1070 + act = ns_to_ktime(dl_next_period(dl_se)); 1071 + } 1072 + 1133 1073 now = hrtimer_cb_get_time(timer); 1134 1074 delta = ktime_to_ns(now) - rq_clock(rq); 1135 1075 act = ktime_add_ns(act, delta); ··· 1192 1106 #endif 1193 1107 } 1194 1108 1109 + /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ 1110 + static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; 1111 + 1112 + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) 1113 + { 1114 + struct rq *rq = rq_of_dl_se(dl_se); 1115 + u64 fw; 1116 + 1117 + scoped_guard (rq_lock, rq) { 1118 + struct rq_flags *rf = &scope.rf; 1119 + 1120 + if (!dl_se->dl_throttled || !dl_se->dl_runtime) 1121 + return HRTIMER_NORESTART; 1122 + 1123 + sched_clock_tick(); 1124 + update_rq_clock(rq); 1125 + 1126 + if (!dl_se->dl_runtime) 1127 + return HRTIMER_NORESTART; 1128 + 1129 + if (!dl_se->server_has_tasks(dl_se)) { 1130 + replenish_dl_entity(dl_se); 1131 + return HRTIMER_NORESTART; 1132 + } 1133 + 1134 + if (dl_se->dl_defer_armed) { 1135 + /* 1136 + * First check if the server could consume runtime in background. 1137 + * If so, it is possible to push the defer timer for this amount 1138 + * of time. The dl_server_min_res serves as a limit to avoid 1139 + * forwarding the timer for a too small amount of time. 
1140 + */ 1141 + if (dl_time_before(rq_clock(dl_se->rq), 1142 + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { 1143 + 1144 + /* reset the defer timer */ 1145 + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; 1146 + 1147 + hrtimer_forward_now(timer, ns_to_ktime(fw)); 1148 + return HRTIMER_RESTART; 1149 + } 1150 + 1151 + dl_se->dl_defer_running = 1; 1152 + } 1153 + 1154 + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1155 + 1156 + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) 1157 + resched_curr(rq); 1158 + 1159 + __push_dl_task(rq, rf); 1160 + } 1161 + 1162 + return HRTIMER_NORESTART; 1163 + } 1164 + 1195 1165 /* 1196 1166 * This is the bandwidth enforcement timer callback. If here, we know 1197 1167 * a task is not on its dl_rq, since the fact that the timer was running ··· 1270 1128 struct rq_flags rf; 1271 1129 struct rq *rq; 1272 1130 1273 - if (dl_server(dl_se)) { 1274 - struct rq *rq = rq_of_dl_se(dl_se); 1275 - struct rq_flags rf; 1276 - 1277 - rq_lock(rq, &rf); 1278 - if (dl_se->dl_throttled) { 1279 - sched_clock_tick(); 1280 - update_rq_clock(rq); 1281 - 1282 - if (dl_se->server_has_tasks(dl_se)) { 1283 - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1284 - resched_curr(rq); 1285 - __push_dl_task(rq, &rf); 1286 - } else { 1287 - replenish_dl_entity(dl_se); 1288 - } 1289 - 1290 - } 1291 - rq_unlock(rq, &rf); 1292 - 1293 - return HRTIMER_NORESTART; 1294 - } 1131 + if (dl_server(dl_se)) 1132 + return dl_server_timer(timer, dl_se); 1295 1133 1296 1134 p = dl_task_of(dl_se); 1297 1135 rq = task_rq_lock(p, &rf); ··· 1441 1319 return (delta * u_act) >> BW_SHIFT; 1442 1320 } 1443 1321 1444 - static inline void 1445 - update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1446 - int flags); 1447 - static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1322 + s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1448 
1323 { 1449 1324 s64 scaled_delta_exec; 1450 - 1451 - if (unlikely(delta_exec <= 0)) { 1452 - if (unlikely(dl_se->dl_yielded)) 1453 - goto throttle; 1454 - return; 1455 - } 1456 - 1457 - if (dl_entity_is_special(dl_se)) 1458 - return; 1459 1325 1460 1326 /* 1461 1327 * For tasks that participate in GRUB, we implement GRUB-PA: the ··· 1463 1353 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1464 1354 } 1465 1355 1356 + return scaled_delta_exec; 1357 + } 1358 + 1359 + static inline void 1360 + update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1361 + int flags); 1362 + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1363 + { 1364 + s64 scaled_delta_exec; 1365 + 1366 + if (unlikely(delta_exec <= 0)) { 1367 + if (unlikely(dl_se->dl_yielded)) 1368 + goto throttle; 1369 + return; 1370 + } 1371 + 1372 + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) 1373 + return; 1374 + 1375 + if (dl_entity_is_special(dl_se)) 1376 + return; 1377 + 1378 + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); 1379 + 1466 1380 dl_se->runtime -= scaled_delta_exec; 1381 + 1382 + /* 1383 + * The fair server can consume its runtime while throttled (not queued/ 1384 + * running as regular CFS). 1385 + * 1386 + * If the server consumes its entire runtime in this state. The server 1387 + * is not required for the current period. Thus, reset the server by 1388 + * starting a new period, pushing the activation. 1389 + */ 1390 + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { 1391 + /* 1392 + * If the server was previously activated - the starving condition 1393 + * took place, it this point it went away because the fair scheduler 1394 + * was able to get runtime in background. So return to the initial 1395 + * state. 
1396 + */ 1397 + dl_se->dl_defer_running = 0; 1398 + 1399 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1400 + 1401 + replenish_dl_new_period(dl_se, dl_se->rq); 1402 + 1403 + /* 1404 + * Not being able to start the timer seems problematic. If it could not 1405 + * be started for whatever reason, we need to "unthrottle" the DL server 1406 + * and queue right away. Otherwise nothing might queue it. That's similar 1407 + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 1408 + */ 1409 + WARN_ON_ONCE(!start_dl_timer(dl_se)); 1410 + 1411 + return; 1412 + } 1467 1413 1468 1414 throttle: 1469 1415 if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { ··· 1548 1382 } 1549 1383 1550 1384 /* 1385 + * The fair server (sole dl_server) does not account for real-time 1386 + * workload because it is running fair work. 1387 + */ 1388 + if (dl_se == &rq->fair_server) 1389 + return; 1390 + 1391 + #ifdef CONFIG_RT_GROUP_SCHED 1392 + /* 1551 1393 * Because -- for now -- we share the rt bandwidth, we need to 1552 1394 * account our runtime there too, otherwise actual rt tasks 1553 1395 * would be able to exceed the shared quota. ··· 1579 1405 rt_rq->rt_time += delta_exec; 1580 1406 raw_spin_unlock(&rt_rq->rt_runtime_lock); 1581 1407 } 1408 + #endif 1409 + } 1410 + 1411 + /* 1412 + * In the non-defer mode, the idle time is not accounted, as the 1413 + * server provides a guarantee. 1414 + * 1415 + * If the dl_server is in defer mode, the idle time is also considered 1416 + * as time available for the fair server, avoiding a penalty for the 1417 + * rt scheduler that did not consumed that time. 
1418 + */ 1419 + void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) 1420 + { 1421 + s64 delta_exec, scaled_delta_exec; 1422 + 1423 + if (!rq->fair_server.dl_defer) 1424 + return; 1425 + 1426 + /* no need to discount more */ 1427 + if (rq->fair_server.runtime < 0) 1428 + return; 1429 + 1430 + delta_exec = rq_clock_task(rq) - p->se.exec_start; 1431 + if (delta_exec < 0) 1432 + return; 1433 + 1434 + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); 1435 + 1436 + rq->fair_server.runtime -= scaled_delta_exec; 1437 + 1438 + if (rq->fair_server.runtime < 0) { 1439 + rq->fair_server.dl_defer_running = 0; 1440 + rq->fair_server.runtime = 0; 1441 + } 1442 + 1443 + p->se.exec_start = rq_clock_task(rq); 1582 1444 } 1583 1445 1584 1446 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) 1585 1447 { 1586 - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1448 + /* 0 runtime = fair server disabled */ 1449 + if (dl_se->dl_runtime) 1450 + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1587 1451 } 1588 1452 1589 1453 void dl_server_start(struct sched_dl_entity *dl_se) 1590 1454 { 1455 + struct rq *rq = dl_se->rq; 1456 + 1457 + /* 1458 + * XXX: the apply do not work fine at the init phase for the 1459 + * fair server because things are not yet set. We need to improve 1460 + * this before getting generic. 
1461 + */ 1591 1462 if (!dl_server(dl_se)) { 1463 + u64 runtime = 50 * NSEC_PER_MSEC; 1464 + u64 period = 1000 * NSEC_PER_MSEC; 1465 + 1466 + dl_server_apply_params(dl_se, runtime, period, 1); 1467 + 1592 1468 dl_se->dl_server = 1; 1469 + dl_se->dl_defer = 1; 1593 1470 setup_new_dl_entity(dl_se); 1594 1471 } 1472 + 1473 + if (!dl_se->dl_runtime) 1474 + return; 1475 + 1595 1476 enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); 1477 + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) 1478 + resched_curr(dl_se->rq); 1596 1479 } 1597 1480 1598 1481 void dl_server_stop(struct sched_dl_entity *dl_se) 1599 1482 { 1483 + if (!dl_se->dl_runtime) 1484 + return; 1485 + 1600 1486 dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); 1487 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1488 + dl_se->dl_defer_armed = 0; 1489 + dl_se->dl_throttled = 0; 1601 1490 } 1602 1491 1603 1492 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, 1604 1493 dl_server_has_tasks_f has_tasks, 1605 - dl_server_pick_f pick) 1494 + dl_server_pick_f pick_task) 1606 1495 { 1607 1496 dl_se->rq = rq; 1608 1497 dl_se->server_has_tasks = has_tasks; 1609 - dl_se->server_pick = pick; 1498 + dl_se->server_pick_task = pick_task; 1499 + } 1500 + 1501 + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) 1502 + { 1503 + u64 new_bw = dl_se->dl_bw; 1504 + int cpu = cpu_of(rq); 1505 + struct dl_bw *dl_b; 1506 + 1507 + dl_b = dl_bw_of(cpu_of(rq)); 1508 + guard(raw_spinlock)(&dl_b->lock); 1509 + 1510 + if (!dl_bw_cpus(cpu)) 1511 + return; 1512 + 1513 + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); 1514 + } 1515 + 1516 + int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) 1517 + { 1518 + u64 old_bw = init ? 
0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); 1519 + u64 new_bw = to_ratio(period, runtime); 1520 + struct rq *rq = dl_se->rq; 1521 + int cpu = cpu_of(rq); 1522 + struct dl_bw *dl_b; 1523 + unsigned long cap; 1524 + int retval = 0; 1525 + int cpus; 1526 + 1527 + dl_b = dl_bw_of(cpu); 1528 + guard(raw_spinlock)(&dl_b->lock); 1529 + 1530 + cpus = dl_bw_cpus(cpu); 1531 + cap = dl_bw_capacity(cpu); 1532 + 1533 + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) 1534 + return -EBUSY; 1535 + 1536 + if (init) { 1537 + __add_rq_bw(new_bw, &rq->dl); 1538 + __dl_add(dl_b, new_bw, cpus); 1539 + } else { 1540 + __dl_sub(dl_b, dl_se->dl_bw, cpus); 1541 + __dl_add(dl_b, new_bw, cpus); 1542 + 1543 + dl_rq_change_utilization(rq, dl_se, new_bw); 1544 + } 1545 + 1546 + dl_se->dl_runtime = runtime; 1547 + dl_se->dl_deadline = period; 1548 + dl_se->dl_period = period; 1549 + 1550 + dl_se->runtime = 0; 1551 + dl_se->deadline = 0; 1552 + 1553 + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 1554 + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); 1555 + 1556 + return retval; 1610 1557 } 1611 1558 1612 1559 /* ··· 1894 1599 return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); 1895 1600 } 1896 1601 1897 - static inline struct sched_statistics * 1602 + static __always_inline struct sched_statistics * 1898 1603 __schedstats_from_dl_se(struct sched_dl_entity *dl_se) 1899 1604 { 1605 + if (!schedstat_enabled()) 1606 + return NULL; 1607 + 1608 + if (dl_server(dl_se)) 1609 + return NULL; 1610 + 1900 1611 return &dl_task_of(dl_se)->stats; 1901 1612 } 1902 1613 1903 1614 static inline void 1904 1615 update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) 1905 1616 { 1906 - struct sched_statistics *stats; 1907 - 1908 - if (!schedstat_enabled()) 1909 - return; 1910 - 1911 - stats = __schedstats_from_dl_se(dl_se); 1912 - __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1617 + struct 
sched_statistics *stats = __schedstats_from_dl_se(dl_se); 1618 + if (stats) 1619 + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1913 1620 } 1914 1621 1915 1622 static inline void 1916 1623 update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) 1917 1624 { 1918 - struct sched_statistics *stats; 1919 - 1920 - if (!schedstat_enabled()) 1921 - return; 1922 - 1923 - stats = __schedstats_from_dl_se(dl_se); 1924 - __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1625 + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); 1626 + if (stats) 1627 + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1925 1628 } 1926 1629 1927 1630 static inline void 1928 1631 update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) 1929 1632 { 1930 - struct sched_statistics *stats; 1931 - 1932 - if (!schedstat_enabled()) 1933 - return; 1934 - 1935 - stats = __schedstats_from_dl_se(dl_se); 1936 - __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1633 + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); 1634 + if (stats) 1635 + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); 1937 1636 } 1938 1637 1939 1638 static inline void ··· 2024 1735 * be counted in the active utilization; hence, we need to call 2025 1736 * add_running_bw(). 
2026 1737 */ 2027 - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 1738 + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 2028 1739 if (flags & ENQUEUE_WAKEUP) 2029 1740 task_contending(dl_se, flags); 2030 1741 ··· 2044 1755 } else if ((flags & ENQUEUE_RESTORE) && 2045 1756 dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { 2046 1757 setup_new_dl_entity(dl_se); 1758 + } 1759 + 1760 + /* 1761 + * If the reservation is still throttled, e.g., it got replenished but is a 1762 + * deferred task and still got to wait, don't enqueue. 1763 + */ 1764 + if (dl_se->dl_throttled && start_dl_timer(dl_se)) 1765 + return; 1766 + 1767 + /* 1768 + * We're about to enqueue, make sure we're not ->dl_throttled! 1769 + * In case the timer was not started, say because the defer time 1770 + * has passed, mark as not throttled and mark unarmed. 1771 + * Also cancel earlier timers, since letting those run is pointless. 1772 + */ 1773 + if (dl_se->dl_throttled) { 1774 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1775 + dl_se->dl_defer_armed = 0; 1776 + dl_se->dl_throttled = 0; 2047 1777 } 2048 1778 2049 1779 __enqueue_dl_entity(dl_se); ··· 2154 1846 enqueue_pushable_dl_task(rq, p); 2155 1847 } 2156 1848 2157 - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 1849 + static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) 2158 1850 { 2159 1851 update_curr_dl(rq); 2160 1852 ··· 2164 1856 dequeue_dl_entity(&p->dl, flags); 2165 1857 if (!p->dl.dl_throttled && !dl_server(&p->dl)) 2166 1858 dequeue_pushable_dl_task(rq, p); 1859 + 1860 + return true; 2167 1861 } 2168 1862 2169 1863 /* ··· 2384 2074 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); 2385 2075 2386 2076 deadline_queue_push_tasks(rq); 2077 + 2078 + if (hrtick_enabled(rq)) 2079 + start_hrtick_dl(rq, &p->dl); 2387 2080 } 2388 2081 2389 2082 static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) ··· 2399 2086 return 
__node_2_dle(left); 2400 2087 } 2401 2088 2402 - static struct task_struct *pick_task_dl(struct rq *rq) 2089 + /* 2090 + * __pick_next_task_dl - Helper to pick the next -deadline task to run. 2091 + * @rq: The runqueue to pick the next task from. 2092 + */ 2093 + static struct task_struct *__pick_task_dl(struct rq *rq) 2403 2094 { 2404 2095 struct sched_dl_entity *dl_se; 2405 2096 struct dl_rq *dl_rq = &rq->dl; ··· 2417 2100 WARN_ON_ONCE(!dl_se); 2418 2101 2419 2102 if (dl_server(dl_se)) { 2420 - p = dl_se->server_pick(dl_se); 2103 + p = dl_se->server_pick_task(dl_se); 2421 2104 if (!p) { 2422 - WARN_ON_ONCE(1); 2423 2105 dl_se->dl_yielded = 1; 2424 2106 update_curr_dl_se(rq, dl_se, 0); 2425 2107 goto again; 2426 2108 } 2427 - p->dl_server = dl_se; 2109 + rq->dl_server = dl_se; 2428 2110 } else { 2429 2111 p = dl_task_of(dl_se); 2430 2112 } ··· 2431 2115 return p; 2432 2116 } 2433 2117 2434 - static struct task_struct *pick_next_task_dl(struct rq *rq) 2118 + static struct task_struct *pick_task_dl(struct rq *rq) 2435 2119 { 2436 - struct task_struct *p; 2437 - 2438 - p = pick_task_dl(rq); 2439 - if (!p) 2440 - return p; 2441 - 2442 - if (!p->dl_server) 2443 - set_next_task_dl(rq, p, true); 2444 - 2445 - if (hrtick_enabled(rq)) 2446 - start_hrtick_dl(rq, &p->dl); 2447 - 2448 - return p; 2120 + return __pick_task_dl(rq); 2449 2121 } 2450 2122 2451 - static void put_prev_task_dl(struct rq *rq, struct task_struct *p) 2123 + static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) 2452 2124 { 2453 2125 struct sched_dl_entity *dl_se = &p->dl; 2454 2126 struct dl_rq *dl_rq = &rq->dl; ··· 3128 2824 3129 2825 .wakeup_preempt = wakeup_preempt_dl, 3130 2826 3131 - .pick_next_task = pick_next_task_dl, 2827 + .pick_task = pick_task_dl, 3132 2828 .put_prev_task = put_prev_task_dl, 3133 2829 .set_next_task = set_next_task_dl, 3134 2830 3135 2831 #ifdef CONFIG_SMP 3136 2832 .balance = balance_dl, 3137 - .pick_task = pick_task_dl, 3138 2833 
.select_task_rq = select_task_rq_dl, 3139 2834 .migrate_task_rq = migrate_task_rq_dl, 3140 2835 .set_cpus_allowed = set_cpus_allowed_dl,

+186 -12

kernel/sched/debug.c

··· 333 333 .release = seq_release, 334 334 }; 335 335 336 + enum dl_param { 337 + DL_RUNTIME = 0, 338 + DL_PERIOD, 339 + }; 340 + 341 + static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ 342 + static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ 343 + 344 + static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, 345 + size_t cnt, loff_t *ppos, enum dl_param param) 346 + { 347 + long cpu = (long) ((struct seq_file *) filp->private_data)->private; 348 + struct rq *rq = cpu_rq(cpu); 349 + u64 runtime, period; 350 + size_t err; 351 + int retval; 352 + u64 value; 353 + 354 + err = kstrtoull_from_user(ubuf, cnt, 10, &value); 355 + if (err) 356 + return err; 357 + 358 + scoped_guard (rq_lock_irqsave, rq) { 359 + runtime = rq->fair_server.dl_runtime; 360 + period = rq->fair_server.dl_period; 361 + 362 + switch (param) { 363 + case DL_RUNTIME: 364 + if (runtime == value) 365 + break; 366 + runtime = value; 367 + break; 368 + case DL_PERIOD: 369 + if (value == period) 370 + break; 371 + period = value; 372 + break; 373 + } 374 + 375 + if (runtime > period || 376 + period > fair_server_period_max || 377 + period < fair_server_period_min) { 378 + return -EINVAL; 379 + } 380 + 381 + if (rq->cfs.h_nr_running) { 382 + update_rq_clock(rq); 383 + dl_server_stop(&rq->fair_server); 384 + } 385 + 386 + retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); 387 + if (retval) 388 + cnt = retval; 389 + 390 + if (!runtime) 391 + printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", 392 + cpu_of(rq)); 393 + 394 + if (rq->cfs.h_nr_running) 395 + dl_server_start(&rq->fair_server); 396 + } 397 + 398 + *ppos += cnt; 399 + return cnt; 400 + } 401 + 402 + static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) 403 + { 404 + unsigned long cpu = (unsigned long) m->private; 405 + struct rq *rq = cpu_rq(cpu); 406 + u64 
value; 407 + 408 + switch (param) { 409 + case DL_RUNTIME: 410 + value = rq->fair_server.dl_runtime; 411 + break; 412 + case DL_PERIOD: 413 + value = rq->fair_server.dl_period; 414 + break; 415 + } 416 + 417 + seq_printf(m, "%llu\n", value); 418 + return 0; 419 + 420 + } 421 + 422 + static ssize_t 423 + sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, 424 + size_t cnt, loff_t *ppos) 425 + { 426 + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); 427 + } 428 + 429 + static int sched_fair_server_runtime_show(struct seq_file *m, void *v) 430 + { 431 + return sched_fair_server_show(m, v, DL_RUNTIME); 432 + } 433 + 434 + static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) 435 + { 436 + return single_open(filp, sched_fair_server_runtime_show, inode->i_private); 437 + } 438 + 439 + static const struct file_operations fair_server_runtime_fops = { 440 + .open = sched_fair_server_runtime_open, 441 + .write = sched_fair_server_runtime_write, 442 + .read = seq_read, 443 + .llseek = seq_lseek, 444 + .release = single_release, 445 + }; 446 + 447 + static ssize_t 448 + sched_fair_server_period_write(struct file *filp, const char __user *ubuf, 449 + size_t cnt, loff_t *ppos) 450 + { 451 + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); 452 + } 453 + 454 + static int sched_fair_server_period_show(struct seq_file *m, void *v) 455 + { 456 + return sched_fair_server_show(m, v, DL_PERIOD); 457 + } 458 + 459 + static int sched_fair_server_period_open(struct inode *inode, struct file *filp) 460 + { 461 + return single_open(filp, sched_fair_server_period_show, inode->i_private); 462 + } 463 + 464 + static const struct file_operations fair_server_period_fops = { 465 + .open = sched_fair_server_period_open, 466 + .write = sched_fair_server_period_write, 467 + .read = seq_read, 468 + .llseek = seq_lseek, 469 + .release = single_release, 470 + }; 471 + 336 472 static struct dentry *debugfs_sched; 473 + 
474 + static void debugfs_fair_server_init(void) 475 + { 476 + struct dentry *d_fair; 477 + unsigned long cpu; 478 + 479 + d_fair = debugfs_create_dir("fair_server", debugfs_sched); 480 + if (!d_fair) 481 + return; 482 + 483 + for_each_possible_cpu(cpu) { 484 + struct dentry *d_cpu; 485 + char buf[32]; 486 + 487 + snprintf(buf, sizeof(buf), "cpu%lu", cpu); 488 + d_cpu = debugfs_create_dir(buf, d_fair); 489 + 490 + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); 491 + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); 492 + } 493 + } 337 494 338 495 static __init int sched_init_debug(void) 339 496 { ··· 530 373 #endif 531 374 532 375 debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); 376 + 377 + debugfs_fair_server_init(); 533 378 534 379 return 0; 535 380 } ··· 739 580 else 740 581 SEQ_printf(m, " %c", task_state_to_char(p)); 741 582 742 - SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 583 + SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", 743 584 p->comm, task_pid_nr(p), 744 585 SPLIT_NS(p->se.vruntime), 745 586 entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', 746 587 SPLIT_NS(p->se.deadline), 588 + p->se.custom_slice ? 
'S' : ' ', 747 589 SPLIT_NS(p->se.slice), 748 590 SPLIT_NS(p->se.sum_exec_runtime), 749 591 (long long)(p->nvcsw + p->nivcsw), 750 592 p->prio); 751 593 752 - SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", 594 + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", 753 595 SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), 754 - SPLIT_NS(p->se.sum_exec_runtime), 755 596 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), 756 597 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); 757 598 758 599 #ifdef CONFIG_NUMA_BALANCING 759 - SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 600 + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 760 601 #endif 761 602 #ifdef CONFIG_CGROUP_SCHED 762 - SEQ_printf_task_group_path(m, task_group(p), " %s") 603 + SEQ_printf_task_group_path(m, task_group(p), " %s") 763 604 #endif 764 605 765 606 SEQ_printf(m, "\n"); ··· 771 612 772 613 SEQ_printf(m, "\n"); 773 614 SEQ_printf(m, "runnable tasks:\n"); 774 - SEQ_printf(m, " S task PID tree-key switches prio" 775 - " wait-time sum-exec sum-sleep\n"); 615 + SEQ_printf(m, " S task PID vruntime eligible " 616 + "deadline slice sum-exec switches " 617 + "prio wait-time sum-sleep sum-block" 618 + #ifdef CONFIG_NUMA_BALANCING 619 + " node group-id" 620 + #endif 621 + #ifdef CONFIG_CGROUP_SCHED 622 + " group-path" 623 + #endif 624 + "\n"); 776 625 SEQ_printf(m, "-------------------------------------------------------" 777 - "------------------------------------------------------\n"); 626 + "------------------------------------------------------" 627 + "------------------------------------------------------" 628 + #ifdef CONFIG_NUMA_BALANCING 629 + "--------------" 630 + #endif 631 + #ifdef CONFIG_CGROUP_SCHED 632 + "--------------" 633 + #endif 634 + "\n"); 778 635 779 636 rcu_read_lock(); 780 637 for_each_process_thread(g, p) { ··· 816 641 SEQ_printf(m, "\n"); 817 642 SEQ_printf(m, "cfs_rq[%d]:\n", cpu); 818 643 #endif 819 - 
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 820 - SPLIT_NS(cfs_rq->exec_clock)); 821 644 822 645 raw_spin_rq_lock_irqsave(rq, flags); 823 646 root = __pick_root_entity(cfs_rq); ··· 842 669 SPLIT_NS(right_vruntime)); 843 670 spread = right_vruntime - left_vruntime; 844 671 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); 845 - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 846 - cfs_rq->nr_spread_over); 847 672 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 848 673 SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); 849 674 SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", ··· 901 730 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) 902 731 903 732 PU(rt_nr_running); 733 + 734 + #ifdef CONFIG_RT_GROUP_SCHED 904 735 P(rt_throttled); 905 736 PN(rt_time); 906 737 PN(rt_runtime); 738 + #endif 907 739 908 740 #undef PN 909 741 #undef PU

+580 -190

kernel/sched/fair.c

··· 511 511 512 512 static int se_is_idle(struct sched_entity *se) 513 513 { 514 - return 0; 514 + return task_has_idle_policy(task_of(se)); 515 515 } 516 516 517 517 #endif /* CONFIG_FAIR_GROUP_SCHED */ ··· 779 779 } 780 780 781 781 /* ensure we never gain time by being placed backwards. */ 782 - u64_u32_store(cfs_rq->min_vruntime, 783 - __update_min_vruntime(cfs_rq, vruntime)); 782 + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); 783 + } 784 + 785 + static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) 786 + { 787 + struct sched_entity *root = __pick_root_entity(cfs_rq); 788 + struct sched_entity *curr = cfs_rq->curr; 789 + u64 min_slice = ~0ULL; 790 + 791 + if (curr && curr->on_rq) 792 + min_slice = curr->slice; 793 + 794 + if (root) 795 + min_slice = min(min_slice, root->min_slice); 796 + 797 + return min_slice; 784 798 } 785 799 786 800 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) ··· 813 799 } 814 800 } 815 801 802 + static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) 803 + { 804 + if (node) { 805 + struct sched_entity *rse = __node_2_se(node); 806 + if (rse->min_slice < se->min_slice) 807 + se->min_slice = rse->min_slice; 808 + } 809 + } 810 + 816 811 /* 817 812 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) 818 813 */ 819 814 static inline bool min_vruntime_update(struct sched_entity *se, bool exit) 820 815 { 821 816 u64 old_min_vruntime = se->min_vruntime; 817 + u64 old_min_slice = se->min_slice; 822 818 struct rb_node *node = &se->run_node; 823 819 824 820 se->min_vruntime = se->vruntime; 825 821 __min_vruntime_update(se, node->rb_right); 826 822 __min_vruntime_update(se, node->rb_left); 827 823 828 - return se->min_vruntime == old_min_vruntime; 824 + se->min_slice = se->slice; 825 + __min_slice_update(se, node->rb_right); 826 + __min_slice_update(se, node->rb_left); 827 + 828 + return se->min_vruntime == old_min_vruntime && 829 + se->min_slice == 
old_min_slice; 829 830 } 830 831 831 832 RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, ··· 853 824 { 854 825 avg_vruntime_add(cfs_rq, se); 855 826 se->min_vruntime = se->vruntime; 827 + se->min_slice = se->slice; 856 828 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, 857 829 __entity_less, &min_vruntime_cb); 858 830 } ··· 1004 974 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i 1005 975 * this is probably good enough. 1006 976 */ 1007 - static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 977 + static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) 1008 978 { 1009 979 if ((s64)(se->vruntime - se->deadline) < 0) 1010 - return; 980 + return false; 1011 981 1012 982 /* 1013 983 * For EEVDF the virtual time slope is determined by w_i (iow. 1014 984 * nice) while the request time r_i is determined by 1015 985 * sysctl_sched_base_slice. 1016 986 */ 1017 - se->slice = sysctl_sched_base_slice; 987 + if (!se->custom_slice) 988 + se->slice = sysctl_sched_base_slice; 1018 989 1019 990 /* 1020 991 * EEVDF: vd_i = ve_i + r_i / w_i ··· 1025 994 /* 1026 995 * The task has consumed its request, reschedule. 
1027 996 */ 1028 - if (cfs_rq->nr_running > 1) { 1029 - resched_curr(rq_of(cfs_rq)); 1030 - clear_buddies(cfs_rq, se); 1031 - } 997 + return true; 1032 998 } 1033 999 1034 1000 #include "pelt.h" ··· 1163 1135 dl_server_update(p->dl_server, delta_exec); 1164 1136 } 1165 1137 1138 + static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1139 + { 1140 + if (!sched_feat(PREEMPT_SHORT)) 1141 + return false; 1142 + 1143 + if (curr->vlag == curr->deadline) 1144 + return false; 1145 + 1146 + return !entity_eligible(cfs_rq, curr); 1147 + } 1148 + 1149 + static inline bool do_preempt_short(struct cfs_rq *cfs_rq, 1150 + struct sched_entity *pse, struct sched_entity *se) 1151 + { 1152 + if (!sched_feat(PREEMPT_SHORT)) 1153 + return false; 1154 + 1155 + if (pse->slice >= se->slice) 1156 + return false; 1157 + 1158 + if (!entity_eligible(cfs_rq, pse)) 1159 + return false; 1160 + 1161 + if (entity_before(pse, se)) 1162 + return true; 1163 + 1164 + if (!entity_eligible(cfs_rq, se)) 1165 + return true; 1166 + 1167 + return false; 1168 + } 1169 + 1166 1170 /* 1167 1171 * Used by other classes to account runtime. 
1168 1172 */ ··· 1216 1156 static void update_curr(struct cfs_rq *cfs_rq) 1217 1157 { 1218 1158 struct sched_entity *curr = cfs_rq->curr; 1159 + struct rq *rq = rq_of(cfs_rq); 1219 1160 s64 delta_exec; 1161 + bool resched; 1220 1162 1221 1163 if (unlikely(!curr)) 1222 1164 return; 1223 1165 1224 - delta_exec = update_curr_se(rq_of(cfs_rq), curr); 1166 + delta_exec = update_curr_se(rq, curr); 1225 1167 if (unlikely(delta_exec <= 0)) 1226 1168 return; 1227 1169 1228 1170 curr->vruntime += calc_delta_fair(delta_exec, curr); 1229 - update_deadline(cfs_rq, curr); 1171 + resched = update_deadline(cfs_rq, curr); 1230 1172 update_min_vruntime(cfs_rq); 1231 1173 1232 - if (entity_is_task(curr)) 1233 - update_curr_task(task_of(curr), delta_exec); 1174 + if (entity_is_task(curr)) { 1175 + struct task_struct *p = task_of(curr); 1176 + 1177 + update_curr_task(p, delta_exec); 1178 + 1179 + /* 1180 + * Any fair task that runs outside of fair_server should 1181 + * account against fair_server such that it can account for 1182 + * this time and possibly avoid running this period. 
1183 + */ 1184 + if (p->dl_server != &rq->fair_server) 1185 + dl_server_update(&rq->fair_server, delta_exec); 1186 + } 1234 1187 1235 1188 account_cfs_rq_runtime(cfs_rq, delta_exec); 1189 + 1190 + if (rq->nr_running == 1) 1191 + return; 1192 + 1193 + if (resched || did_preempt_short(cfs_rq, curr)) { 1194 + resched_curr(rq); 1195 + clear_buddies(cfs_rq, curr); 1196 + } 1236 1197 } 1237 1198 1238 1199 static void update_curr_fair(struct rq *rq) ··· 5259 5178 u64 vslice, vruntime = avg_vruntime(cfs_rq); 5260 5179 s64 lag = 0; 5261 5180 5262 - se->slice = sysctl_sched_base_slice; 5181 + if (!se->custom_slice) 5182 + se->slice = sysctl_sched_base_slice; 5263 5183 vslice = calc_delta_fair(se->slice, se); 5264 5184 5265 5185 /* ··· 5341 5259 5342 5260 se->vruntime = vruntime - lag; 5343 5261 5262 + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { 5263 + se->deadline += se->vruntime; 5264 + se->rel_deadline = 0; 5265 + return; 5266 + } 5267 + 5344 5268 /* 5345 5269 * When joining the competition; the existing tasks will be, 5346 5270 * on average, halfway through their slice, as such start tasks ··· 5365 5277 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); 5366 5278 5367 5279 static inline bool cfs_bandwidth_used(void); 5280 + 5281 + static void 5282 + requeue_delayed_entity(struct sched_entity *se); 5368 5283 5369 5284 static void 5370 5285 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ··· 5456 5365 5457 5366 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 5458 5367 5459 - static void 5368 + static inline void finish_delayed_dequeue_entity(struct sched_entity *se) 5369 + { 5370 + se->sched_delayed = 0; 5371 + if (sched_feat(DELAY_ZERO) && se->vlag > 0) 5372 + se->vlag = 0; 5373 + } 5374 + 5375 + static bool 5460 5376 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 5461 5377 { 5462 - int action = UPDATE_TG; 5378 + bool sleep = flags & DEQUEUE_SLEEP; 5463 5379 5380 + 
update_curr(cfs_rq); 5381 + 5382 + if (flags & DEQUEUE_DELAYED) { 5383 + SCHED_WARN_ON(!se->sched_delayed); 5384 + } else { 5385 + bool delay = sleep; 5386 + /* 5387 + * DELAY_DEQUEUE relies on spurious wakeups, special task 5388 + * states must not suffer spurious wakeups, excempt them. 5389 + */ 5390 + if (flags & DEQUEUE_SPECIAL) 5391 + delay = false; 5392 + 5393 + SCHED_WARN_ON(delay && se->sched_delayed); 5394 + 5395 + if (sched_feat(DELAY_DEQUEUE) && delay && 5396 + !entity_eligible(cfs_rq, se)) { 5397 + if (cfs_rq->next == se) 5398 + cfs_rq->next = NULL; 5399 + update_load_avg(cfs_rq, se, 0); 5400 + se->sched_delayed = 1; 5401 + return false; 5402 + } 5403 + } 5404 + 5405 + int action = UPDATE_TG; 5464 5406 if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) 5465 5407 action |= DO_DETACH; 5466 - 5467 - /* 5468 - * Update run-time statistics of the 'current'. 5469 - */ 5470 - update_curr(cfs_rq); 5471 5408 5472 5409 /* 5473 5410 * When dequeuing a sched_entity, we must: ··· 5514 5395 clear_buddies(cfs_rq, se); 5515 5396 5516 5397 update_entity_lag(cfs_rq, se); 5398 + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { 5399 + se->deadline -= se->vruntime; 5400 + se->rel_deadline = 1; 5401 + } 5402 + 5517 5403 if (se != cfs_rq->curr) 5518 5404 __dequeue_entity(cfs_rq, se); 5519 5405 se->on_rq = 0; ··· 5538 5414 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) 5539 5415 update_min_vruntime(cfs_rq); 5540 5416 5417 + if (flags & DEQUEUE_DELAYED) 5418 + finish_delayed_dequeue_entity(se); 5419 + 5541 5420 if (cfs_rq->nr_running == 0) 5542 5421 update_idle_cfs_rq_clock_pelt(cfs_rq); 5422 + 5423 + return true; 5543 5424 } 5544 5425 5545 5426 static void ··· 5570 5441 } 5571 5442 5572 5443 update_stats_curr_start(cfs_rq, se); 5444 + SCHED_WARN_ON(cfs_rq->curr); 5573 5445 cfs_rq->curr = se; 5574 5446 5575 5447 /* ··· 5591 5461 se->prev_sum_exec_runtime = se->sum_exec_runtime; 5592 5462 } 5593 5463 5464 + static int dequeue_entities(struct rq *rq, struct 
sched_entity *se, int flags); 5465 + 5594 5466 /* 5595 5467 * Pick the next process, keeping these things in mind, in this order: 5596 5468 * 1) keep things fair between processes/task groups ··· 5601 5469 * 4) do not run the "skip" process, if something else is available 5602 5470 */ 5603 5471 static struct sched_entity * 5604 - pick_next_entity(struct cfs_rq *cfs_rq) 5472 + pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) 5605 5473 { 5606 5474 /* 5607 5475 * Enabling NEXT_BUDDY will affect latency but not fairness. 5608 5476 */ 5609 5477 if (sched_feat(NEXT_BUDDY) && 5610 - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) 5478 + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 5479 + /* ->next will never be delayed */ 5480 + SCHED_WARN_ON(cfs_rq->next->sched_delayed); 5611 5481 return cfs_rq->next; 5482 + } 5612 5483 5613 - return pick_eevdf(cfs_rq); 5484 + struct sched_entity *se = pick_eevdf(cfs_rq); 5485 + if (se->sched_delayed) { 5486 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 5487 + SCHED_WARN_ON(se->sched_delayed); 5488 + SCHED_WARN_ON(se->on_rq); 5489 + return NULL; 5490 + } 5491 + return se; 5614 5492 } 5615 5493 5616 5494 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); ··· 5644 5502 /* in !on_rq case, update occurred at dequeue */ 5645 5503 update_load_avg(cfs_rq, prev, 0); 5646 5504 } 5505 + SCHED_WARN_ON(cfs_rq->curr != prev); 5647 5506 cfs_rq->curr = NULL; 5648 5507 } 5649 5508 ··· 5908 5765 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5909 5766 struct sched_entity *se; 5910 5767 long task_delta, idle_task_delta, dequeue = 1; 5768 + long rq_h_nr_running = rq->cfs.h_nr_running; 5911 5769 5912 5770 raw_spin_lock(&cfs_b->lock); 5913 5771 /* This will start the period timer if necessary */ ··· 5942 5798 idle_task_delta = cfs_rq->idle_h_nr_running; 5943 5799 for_each_sched_entity(se) { 5944 5800 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 5801 + int flags; 5802 + 5945 5803 /* throttled entity or 
throttle-on-deactivate */ 5946 5804 if (!se->on_rq) 5947 5805 goto done; 5948 5806 5949 - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 5807 + /* 5808 + * Abuse SPECIAL to avoid delayed dequeue in this instance. 5809 + * This avoids teaching dequeue_entities() about throttled 5810 + * entities and keeps things relatively simple. 5811 + */ 5812 + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; 5813 + if (se->sched_delayed) 5814 + flags |= DEQUEUE_DELAYED; 5815 + dequeue_entity(qcfs_rq, se, flags); 5950 5816 5951 5817 if (cfs_rq_is_idle(group_cfs_rq(se))) 5952 5818 idle_task_delta = cfs_rq->h_nr_running; ··· 5990 5836 /* At this point se is NULL and we are at root level*/ 5991 5837 sub_nr_running(rq, task_delta); 5992 5838 5839 + /* Stop the fair server if throttling resulted in no runnable tasks */ 5840 + if (rq_h_nr_running && !rq->cfs.h_nr_running) 5841 + dl_server_stop(&rq->fair_server); 5993 5842 done: 5994 5843 /* 5995 5844 * Note: distribution will already see us throttled via the ··· 6011 5854 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 6012 5855 struct sched_entity *se; 6013 5856 long task_delta, idle_task_delta; 5857 + long rq_h_nr_running = rq->cfs.h_nr_running; 6014 5858 6015 5859 se = cfs_rq->tg->se[cpu_of(rq)]; 6016 5860 ··· 6049 5891 for_each_sched_entity(se) { 6050 5892 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6051 5893 6052 - if (se->on_rq) 5894 + if (se->on_rq) { 5895 + SCHED_WARN_ON(se->sched_delayed); 6053 5896 break; 5897 + } 6054 5898 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 6055 5899 6056 5900 if (cfs_rq_is_idle(group_cfs_rq(se))) ··· 6082 5922 if (cfs_rq_throttled(qcfs_rq)) 6083 5923 goto unthrottle_throttle; 6084 5924 } 5925 + 5926 + /* Start the fair server if un-throttling resulted in new runnable tasks */ 5927 + if (!rq_h_nr_running && rq->cfs.h_nr_running) 5928 + dl_server_start(&rq->fair_server); 6085 5929 6086 5930 /* At this point se is NULL and we are at root level*/ 6087 5931 add_nr_running(rq, task_delta); ··· 6719 6555 { 
6720 6556 int cpu = cpu_of(rq); 6721 6557 6722 - if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) 6558 + if (!cfs_bandwidth_used()) 6723 6559 return; 6724 6560 6725 6561 if (!tick_nohz_full_cpu(cpu)) ··· 6902 6738 } 6903 6739 #endif 6904 6740 6741 + static void 6742 + requeue_delayed_entity(struct sched_entity *se) 6743 + { 6744 + struct cfs_rq *cfs_rq = cfs_rq_of(se); 6745 + 6746 + /* 6747 + * se->sched_delayed should imply: se->on_rq == 1. 6748 + * Because a delayed entity is one that is still on 6749 + * the runqueue competing until elegibility. 6750 + */ 6751 + SCHED_WARN_ON(!se->sched_delayed); 6752 + SCHED_WARN_ON(!se->on_rq); 6753 + 6754 + if (sched_feat(DELAY_ZERO)) { 6755 + update_entity_lag(cfs_rq, se); 6756 + if (se->vlag > 0) { 6757 + cfs_rq->nr_running--; 6758 + if (se != cfs_rq->curr) 6759 + __dequeue_entity(cfs_rq, se); 6760 + se->vlag = 0; 6761 + place_entity(cfs_rq, se, 0); 6762 + if (se != cfs_rq->curr) 6763 + __enqueue_entity(cfs_rq, se); 6764 + cfs_rq->nr_running++; 6765 + } 6766 + } 6767 + 6768 + update_load_avg(cfs_rq, se, 0); 6769 + se->sched_delayed = 0; 6770 + } 6771 + 6905 6772 /* 6906 6773 * The enqueue_task method is called before nr_running is 6907 6774 * increased. Here we update the fair scheduling stats and ··· 6945 6750 struct sched_entity *se = &p->se; 6946 6751 int idle_h_nr_running = task_has_idle_policy(p); 6947 6752 int task_new = !(flags & ENQUEUE_WAKEUP); 6753 + int rq_h_nr_running = rq->cfs.h_nr_running; 6754 + u64 slice = 0; 6948 6755 6949 6756 /* 6950 6757 * The code below (indirectly) updates schedutil which looks at ··· 6954 6757 * Let's add the task's estimated utilization to the cfs_rq's 6955 6758 * estimated utilization, before we update schedutil. 
6956 6759 */ 6957 - util_est_enqueue(&rq->cfs, p); 6760 + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) 6761 + util_est_enqueue(&rq->cfs, p); 6762 + 6763 + if (flags & ENQUEUE_DELAYED) { 6764 + requeue_delayed_entity(se); 6765 + return; 6766 + } 6958 6767 6959 6768 /* 6960 6769 * If in_iowait is set, the code below may not trigger any cpufreq ··· 6971 6768 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); 6972 6769 6973 6770 for_each_sched_entity(se) { 6974 - if (se->on_rq) 6771 + if (se->on_rq) { 6772 + if (se->sched_delayed) 6773 + requeue_delayed_entity(se); 6975 6774 break; 6775 + } 6976 6776 cfs_rq = cfs_rq_of(se); 6777 + 6778 + /* 6779 + * Basically set the slice of group entries to the min_slice of 6780 + * their respective cfs_rq. This ensures the group can service 6781 + * its entities in the desired time-frame. 6782 + */ 6783 + if (slice) { 6784 + se->slice = slice; 6785 + se->custom_slice = 1; 6786 + } 6977 6787 enqueue_entity(cfs_rq, se, flags); 6788 + slice = cfs_rq_min_slice(cfs_rq); 6978 6789 6979 6790 cfs_rq->h_nr_running++; 6980 6791 cfs_rq->idle_h_nr_running += idle_h_nr_running; ··· 7010 6793 se_update_runnable(se); 7011 6794 update_cfs_group(se); 7012 6795 6796 + se->slice = slice; 6797 + slice = cfs_rq_min_slice(cfs_rq); 6798 + 7013 6799 cfs_rq->h_nr_running++; 7014 6800 cfs_rq->idle_h_nr_running += idle_h_nr_running; 7015 6801 ··· 7022 6802 /* end evaluation on encountering a throttled cfs_rq */ 7023 6803 if (cfs_rq_throttled(cfs_rq)) 7024 6804 goto enqueue_throttle; 6805 + } 6806 + 6807 + if (!rq_h_nr_running && rq->cfs.h_nr_running) { 6808 + /* Account for idle runtime */ 6809 + if (!rq->nr_running) 6810 + dl_server_update_idle_time(rq, rq->curr); 6811 + dl_server_start(&rq->fair_server); 7025 6812 } 7026 6813 7027 6814 /* At this point se is NULL and we are at root level*/ ··· 7060 6833 static void set_next_buddy(struct sched_entity *se); 7061 6834 7062 6835 /* 7063 - * The dequeue_task method is called 
before nr_running is 7064 - * decreased. We remove the task from the rbtree and 7065 - * update the fair scheduling stats: 6836 + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() 6837 + * failing half-way through and resume the dequeue later. 6838 + * 6839 + * Returns: 6840 + * -1 - dequeue delayed 6841 + * 0 - dequeue throttled 6842 + * 1 - dequeue complete 7066 6843 */ 7067 - static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 6844 + static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) 7068 6845 { 7069 - struct cfs_rq *cfs_rq; 7070 - struct sched_entity *se = &p->se; 7071 - int task_sleep = flags & DEQUEUE_SLEEP; 7072 - int idle_h_nr_running = task_has_idle_policy(p); 7073 6846 bool was_sched_idle = sched_idle_rq(rq); 6847 + int rq_h_nr_running = rq->cfs.h_nr_running; 6848 + bool task_sleep = flags & DEQUEUE_SLEEP; 6849 + bool task_delayed = flags & DEQUEUE_DELAYED; 6850 + struct task_struct *p = NULL; 6851 + int idle_h_nr_running = 0; 6852 + int h_nr_running = 0; 6853 + struct cfs_rq *cfs_rq; 6854 + u64 slice = 0; 7074 6855 7075 - util_est_dequeue(&rq->cfs, p); 6856 + if (entity_is_task(se)) { 6857 + p = task_of(se); 6858 + h_nr_running = 1; 6859 + idle_h_nr_running = task_has_idle_policy(p); 6860 + } else { 6861 + cfs_rq = group_cfs_rq(se); 6862 + slice = cfs_rq_min_slice(cfs_rq); 6863 + } 7076 6864 7077 6865 for_each_sched_entity(se) { 7078 6866 cfs_rq = cfs_rq_of(se); 7079 - dequeue_entity(cfs_rq, se, flags); 7080 6867 7081 - cfs_rq->h_nr_running--; 6868 + if (!dequeue_entity(cfs_rq, se, flags)) { 6869 + if (p && &p->se == se) 6870 + return -1; 6871 + 6872 + break; 6873 + } 6874 + 6875 + cfs_rq->h_nr_running -= h_nr_running; 7082 6876 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7083 6877 7084 6878 if (cfs_rq_is_idle(cfs_rq)) 7085 - idle_h_nr_running = 1; 6879 + idle_h_nr_running = h_nr_running; 7086 6880 7087 6881 /* end evaluation on encountering a throttled cfs_rq */ 7088 6882 
if (cfs_rq_throttled(cfs_rq)) 7089 - goto dequeue_throttle; 6883 + return 0; 7090 6884 7091 6885 /* Don't dequeue parent if it has other entities besides us */ 7092 6886 if (cfs_rq->load.weight) { 6887 + slice = cfs_rq_min_slice(cfs_rq); 6888 + 7093 6889 /* Avoid re-evaluating load for this entity: */ 7094 6890 se = parent_entity(se); 7095 6891 /* ··· 7124 6874 break; 7125 6875 } 7126 6876 flags |= DEQUEUE_SLEEP; 6877 + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); 7127 6878 } 7128 6879 7129 6880 for_each_sched_entity(se) { ··· 7134 6883 se_update_runnable(se); 7135 6884 update_cfs_group(se); 7136 6885 7137 - cfs_rq->h_nr_running--; 6886 + se->slice = slice; 6887 + slice = cfs_rq_min_slice(cfs_rq); 6888 + 6889 + cfs_rq->h_nr_running -= h_nr_running; 7138 6890 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 7139 6891 7140 6892 if (cfs_rq_is_idle(cfs_rq)) 7141 - idle_h_nr_running = 1; 6893 + idle_h_nr_running = h_nr_running; 7142 6894 7143 6895 /* end evaluation on encountering a throttled cfs_rq */ 7144 6896 if (cfs_rq_throttled(cfs_rq)) 7145 - goto dequeue_throttle; 7146 - 6897 + return 0; 7147 6898 } 7148 6899 7149 - /* At this point se is NULL and we are at root level*/ 7150 - sub_nr_running(rq, 1); 6900 + sub_nr_running(rq, h_nr_running); 6901 + 6902 + if (rq_h_nr_running && !rq->cfs.h_nr_running) 6903 + dl_server_stop(&rq->fair_server); 7151 6904 7152 6905 /* balance early to pull high priority tasks */ 7153 6906 if (unlikely(!was_sched_idle && sched_idle_rq(rq))) 7154 6907 rq->next_balance = jiffies; 7155 6908 7156 - dequeue_throttle: 7157 - util_est_update(&rq->cfs, p, task_sleep); 6909 + if (p && task_delayed) { 6910 + SCHED_WARN_ON(!task_sleep); 6911 + SCHED_WARN_ON(p->on_rq != 1); 6912 + 6913 + /* Fix-up what dequeue_task_fair() skipped */ 6914 + hrtick_update(rq); 6915 + 6916 + /* Fix-up what block_task() skipped. 
*/ 6917 + __block_task(rq, p); 6918 + } 6919 + 6920 + return 1; 6921 + } 6922 + 6923 + /* 6924 + * The dequeue_task method is called before nr_running is 6925 + * decreased. We remove the task from the rbtree and 6926 + * update the fair scheduling stats: 6927 + */ 6928 + static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 6929 + { 6930 + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) 6931 + util_est_dequeue(&rq->cfs, p); 6932 + 6933 + if (dequeue_entities(rq, &p->se, flags) < 0) { 6934 + util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); 6935 + return false; 6936 + } 6937 + 6938 + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); 7158 6939 hrtick_update(rq); 6940 + return true; 7159 6941 } 7160 6942 7161 6943 #ifdef CONFIG_SMP ··· 8087 7803 } 8088 7804 8089 7805 /* 7806 + * This function computes an effective utilization for the given CPU, to be 7807 + * used for frequency selection given the linear relation: f = u * f_max. 7808 + * 7809 + * The scheduler tracks the following metrics: 7810 + * 7811 + * cpu_util_{cfs,rt,dl,irq}() 7812 + * cpu_bw_dl() 7813 + * 7814 + * Where the cfs,rt and dl util numbers are tracked with the same metric and 7815 + * synchronized windows and are thus directly comparable. 7816 + * 7817 + * The cfs,rt,dl utilization are the running times measured with rq->clock_task 7818 + * which excludes things like IRQ and steal-time. These latter are then accrued 7819 + * in the IRQ utilization. 7820 + * 7821 + * The DL bandwidth number OTOH is not a measured metric but a value computed 7822 + * based on the task model parameters and gives the minimal utilization 7823 + * required to meet deadlines. 
7824 + */ 7825 + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 7826 + unsigned long *min, 7827 + unsigned long *max) 7828 + { 7829 + unsigned long util, irq, scale; 7830 + struct rq *rq = cpu_rq(cpu); 7831 + 7832 + scale = arch_scale_cpu_capacity(cpu); 7833 + 7834 + /* 7835 + * Early check to see if IRQ/steal time saturates the CPU, can be 7836 + * because of inaccuracies in how we track these -- see 7837 + * update_irq_load_avg(). 7838 + */ 7839 + irq = cpu_util_irq(rq); 7840 + if (unlikely(irq >= scale)) { 7841 + if (min) 7842 + *min = scale; 7843 + if (max) 7844 + *max = scale; 7845 + return scale; 7846 + } 7847 + 7848 + if (min) { 7849 + /* 7850 + * The minimum utilization returns the highest level between: 7851 + * - the computed DL bandwidth needed with the IRQ pressure which 7852 + * steals time to the deadline task. 7853 + * - The minimum performance requirement for CFS and/or RT. 7854 + */ 7855 + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); 7856 + 7857 + /* 7858 + * When an RT task is runnable and uclamp is not used, we must 7859 + * ensure that the task will run at maximum compute capacity. 7860 + */ 7861 + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) 7862 + *min = max(*min, scale); 7863 + } 7864 + 7865 + /* 7866 + * Because the time spend on RT/DL tasks is visible as 'lost' time to 7867 + * CFS tasks and we use the same metric to track the effective 7868 + * utilization (PELT windows are synchronized) we can directly add them 7869 + * to obtain the CPU's actual utilization. 7870 + */ 7871 + util = util_cfs + cpu_util_rt(rq); 7872 + util += cpu_util_dl(rq); 7873 + 7874 + /* 7875 + * The maximum hint is a soft bandwidth requirement, which can be lower 7876 + * than the actual utilization because of uclamp_max requirements. 
7877 + */ 7878 + if (max) 7879 + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); 7880 + 7881 + if (util >= scale) 7882 + return scale; 7883 + 7884 + /* 7885 + * There is still idle time; further improve the number by using the 7886 + * IRQ metric. Because IRQ/steal time is hidden from the task clock we 7887 + * need to scale the task numbers: 7888 + * 7889 + * max - irq 7890 + * U' = irq + --------- * U 7891 + * max 7892 + */ 7893 + util = scale_irq_capacity(util, irq, scale); 7894 + util += irq; 7895 + 7896 + return min(scale, util); 7897 + } 7898 + 7899 + unsigned long sched_cpu_util(int cpu) 7900 + { 7901 + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); 7902 + } 7903 + 7904 + /* 8090 7905 * energy_env - Utilization landscape for energy estimation. 8091 7906 * @task_busy_time: Utilization contribution by the task for which we test the 8092 7907 * placement. Given by eenv_task_busy_time(). ··· 8669 8286 8670 8287 static void task_dead_fair(struct task_struct *p) 8671 8288 { 8672 - remove_entity_load_avg(&p->se); 8289 + struct sched_entity *se = &p->se; 8290 + 8291 + if (se->sched_delayed) { 8292 + struct rq_flags rf; 8293 + struct rq *rq; 8294 + 8295 + rq = task_rq_lock(p, &rf); 8296 + if (se->sched_delayed) { 8297 + update_rq_clock(rq); 8298 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 8299 + } 8300 + task_rq_unlock(rq, p, &rf); 8301 + } 8302 + 8303 + remove_entity_load_avg(se); 8673 8304 } 8674 8305 8675 8306 /* ··· 8719 8322 static int 8720 8323 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8721 8324 { 8722 - if (rq->nr_running) 8325 + if (sched_fair_runnable(rq)) 8723 8326 return 1; 8724 8327 8725 8328 return sched_balance_newidle(rq, rf) != 0; ··· 8778 8381 if (test_tsk_need_resched(curr)) 8779 8382 return; 8780 8383 8781 - /* Idle tasks are by definition preempted by non-idle tasks. 
*/ 8782 - if (unlikely(task_has_idle_policy(curr)) && 8783 - likely(!task_has_idle_policy(p))) 8784 - goto preempt; 8785 - 8786 - /* 8787 - * Batch and idle tasks do not preempt non-idle tasks (their preemption 8788 - * is driven by the tick): 8789 - */ 8790 - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) 8384 + if (!sched_feat(WAKEUP_PREEMPTION)) 8791 8385 return; 8792 8386 8793 8387 find_matching_se(&se, &pse); ··· 8788 8400 pse_is_idle = se_is_idle(pse); 8789 8401 8790 8402 /* 8791 - * Preempt an idle group in favor of a non-idle group (and don't preempt 8403 + * Preempt an idle entity in favor of a non-idle entity (and don't preempt 8792 8404 * in the inverse case). 8793 8405 */ 8794 8406 if (cse_is_idle && !pse_is_idle) ··· 8796 8408 if (cse_is_idle != pse_is_idle) 8797 8409 return; 8798 8410 8411 + /* 8412 + * BATCH and IDLE tasks do not preempt others. 8413 + */ 8414 + if (unlikely(p->policy != SCHED_NORMAL)) 8415 + return; 8416 + 8799 8417 cfs_rq = cfs_rq_of(se); 8800 8418 update_curr(cfs_rq); 8419 + /* 8420 + * If @p has a shorter slice than current and @p is eligible, override 8421 + * current's slice protection in order to allow preemption. 8422 + * 8423 + * Note that even if @p does not turn out to be the most eligible 8424 + * task at this moment, current's slice protection will be lost. 8425 + */ 8426 + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) 8427 + se->vlag = se->deadline + 1; 8801 8428 8802 8429 /* 8803 - * XXX pick_eevdf(cfs_rq) != se ? 8430 + * If @p has become the most eligible task, force preemption. 
8804 8431 */ 8805 8432 if (pick_eevdf(cfs_rq) == pse) 8806 8433 goto preempt; ··· 8826 8423 resched_curr(rq); 8827 8424 } 8828 8425 8829 - #ifdef CONFIG_SMP 8830 8426 static struct task_struct *pick_task_fair(struct rq *rq) 8831 8427 { 8832 8428 struct sched_entity *se; ··· 8837 8435 return NULL; 8838 8436 8839 8437 do { 8840 - struct sched_entity *curr = cfs_rq->curr; 8438 + /* Might not have done put_prev_entity() */ 8439 + if (cfs_rq->curr && cfs_rq->curr->on_rq) 8440 + update_curr(cfs_rq); 8841 8441 8842 - /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ 8843 - if (curr) { 8844 - if (curr->on_rq) 8845 - update_curr(cfs_rq); 8846 - else 8847 - curr = NULL; 8442 + if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8443 + goto again; 8848 8444 8849 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8850 - goto again; 8851 - } 8852 - 8853 - se = pick_next_entity(cfs_rq); 8445 + se = pick_next_entity(rq, cfs_rq); 8446 + if (!se) 8447 + goto again; 8854 8448 cfs_rq = group_cfs_rq(se); 8855 8449 } while (cfs_rq); 8856 8450 8857 8451 return task_of(se); 8858 8452 } 8859 - #endif 8453 + 8454 + static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); 8455 + static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); 8860 8456 8861 8457 struct task_struct * 8862 8458 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8863 8459 { 8864 - struct cfs_rq *cfs_rq = &rq->cfs; 8865 8460 struct sched_entity *se; 8866 8461 struct task_struct *p; 8867 8462 int new_tasks; 8868 8463 8869 8464 again: 8870 - if (!sched_fair_runnable(rq)) 8465 + p = pick_task_fair(rq); 8466 + if (!p) 8871 8467 goto idle; 8468 + se = &p->se; 8872 8469 8873 8470 #ifdef CONFIG_FAIR_GROUP_SCHED 8874 - if (!prev || prev->sched_class != &fair_sched_class) 8471 + if (prev->sched_class != &fair_sched_class) 8875 8472 goto simple; 8473 + 8474 + __put_prev_set_next_dl_server(rq, prev, p); 8876 8475 8877 8476 /* 8878 8477 * 
Because of the set_next_buddy() in dequeue_task_fair() it is rather ··· 8881 8478 * 8882 8479 * Therefore attempt to avoid putting and setting the entire cgroup 8883 8480 * hierarchy, only change the part that actually changes. 8884 - */ 8885 - 8886 - do { 8887 - struct sched_entity *curr = cfs_rq->curr; 8888 - 8889 - /* 8890 - * Since we got here without doing put_prev_entity() we also 8891 - * have to consider cfs_rq->curr. If it is still a runnable 8892 - * entity, update_curr() will update its vruntime, otherwise 8893 - * forget we've ever seen it. 8894 - */ 8895 - if (curr) { 8896 - if (curr->on_rq) 8897 - update_curr(cfs_rq); 8898 - else 8899 - curr = NULL; 8900 - 8901 - /* 8902 - * This call to check_cfs_rq_runtime() will do the 8903 - * throttle and dequeue its entity in the parent(s). 8904 - * Therefore the nr_running test will indeed 8905 - * be correct. 8906 - */ 8907 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { 8908 - cfs_rq = &rq->cfs; 8909 - 8910 - if (!cfs_rq->nr_running) 8911 - goto idle; 8912 - 8913 - goto simple; 8914 - } 8915 - } 8916 - 8917 - se = pick_next_entity(cfs_rq); 8918 - cfs_rq = group_cfs_rq(se); 8919 - } while (cfs_rq); 8920 - 8921 - p = task_of(se); 8922 - 8923 - /* 8481 + * 8924 8482 * Since we haven't yet done put_prev_entity and if the selected task 8925 8483 * is a different task than we started out with, try and touch the 8926 8484 * least amount of cfs_rqs. 
8927 8485 */ 8928 8486 if (prev != p) { 8929 8487 struct sched_entity *pse = &prev->se; 8488 + struct cfs_rq *cfs_rq; 8930 8489 8931 8490 while (!(cfs_rq = is_same_group(se, pse))) { 8932 8491 int se_depth = se->depth; ··· 8906 8541 8907 8542 put_prev_entity(cfs_rq, pse); 8908 8543 set_next_entity(cfs_rq, se); 8544 + 8545 + __set_next_task_fair(rq, p, true); 8909 8546 } 8910 8547 8911 - goto done; 8548 + return p; 8549 + 8912 8550 simple: 8913 8551 #endif 8914 - if (prev) 8915 - put_prev_task(rq, prev); 8916 - 8917 - do { 8918 - se = pick_next_entity(cfs_rq); 8919 - set_next_entity(cfs_rq, se); 8920 - cfs_rq = group_cfs_rq(se); 8921 - } while (cfs_rq); 8922 - 8923 - p = task_of(se); 8924 - 8925 - done: __maybe_unused; 8926 - #ifdef CONFIG_SMP 8927 - /* 8928 - * Move the next running task to the front of 8929 - * the list, so our cfs_tasks list becomes MRU 8930 - * one. 8931 - */ 8932 - list_move(&p->se.group_node, &rq->cfs_tasks); 8933 - #endif 8934 - 8935 - if (hrtick_enabled_fair(rq)) 8936 - hrtick_start_fair(rq, p); 8937 - 8938 - update_misfit_status(p, rq); 8939 - sched_fair_update_stop_tick(rq, p); 8940 - 8552 + put_prev_set_next_task(rq, prev, p); 8941 8553 return p; 8942 8554 8943 8555 idle: ··· 8943 8601 return NULL; 8944 8602 } 8945 8603 8946 - static struct task_struct *__pick_next_task_fair(struct rq *rq) 8604 + static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) 8947 8605 { 8948 - return pick_next_task_fair(rq, NULL, NULL); 8606 + return pick_next_task_fair(rq, prev, NULL); 8607 + } 8608 + 8609 + static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) 8610 + { 8611 + return !!dl_se->rq->cfs.nr_running; 8612 + } 8613 + 8614 + static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) 8615 + { 8616 + return pick_task_fair(dl_se->rq); 8617 + } 8618 + 8619 + void fair_server_init(struct rq *rq) 8620 + { 8621 + struct sched_dl_entity *dl_se = &rq->fair_server; 8622 + 8623 + 
init_dl_entity(dl_se); 8624 + 8625 + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); 8949 8626 } 8950 8627 8951 8628 /* 8952 8629 * Account for a descheduled task: 8953 8630 */ 8954 - static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) 8631 + static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) 8955 8632 { 8956 8633 struct sched_entity *se = &prev->se; 8957 8634 struct cfs_rq *cfs_rq; ··· 9721 9360 9722 9361 hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 9723 9362 9363 + /* hw_pressure doesn't care about invariance */ 9724 9364 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 9725 9365 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 9726 - update_hw_load_avg(now, rq, hw_pressure) | 9366 + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | 9727 9367 update_irq_load_avg(rq, 0); 9728 9368 9729 9369 if (others_have_blocked(rq)) ··· 13064 12702 */ 13065 12703 static void task_fork_fair(struct task_struct *p) 13066 12704 { 13067 - struct sched_entity *se = &p->se, *curr; 13068 - struct cfs_rq *cfs_rq; 13069 - struct rq *rq = this_rq(); 13070 - struct rq_flags rf; 13071 - 13072 - rq_lock(rq, &rf); 13073 - update_rq_clock(rq); 13074 - 13075 12705 set_task_max_allowed_capacity(p); 13076 - 13077 - cfs_rq = task_cfs_rq(current); 13078 - curr = cfs_rq->curr; 13079 - if (curr) 13080 - update_curr(cfs_rq); 13081 - place_entity(cfs_rq, se, ENQUEUE_INITIAL); 13082 - rq_unlock(rq, &rf); 13083 12706 } 13084 12707 13085 12708 /* ··· 13176 12829 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13177 12830 { 13178 12831 detach_task_cfs_rq(p); 12832 + /* 12833 + * Since this is called after changing class, this is a little weird 12834 + * and we cannot use DEQUEUE_DELAYED. 
12835 + */ 12836 + if (p->se.sched_delayed) { 12837 + /* First, dequeue it from its new class' structures */ 12838 + dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); 12839 + /* 12840 + * Now, clean up the fair_sched_class side of things 12841 + * related to sched_delayed being true and that wasn't done 12842 + * due to the generic dequeue not using DEQUEUE_DELAYED. 12843 + */ 12844 + finish_delayed_dequeue_entity(&p->se); 12845 + p->se.rel_deadline = 0; 12846 + __block_task(rq, p); 12847 + } 13179 12848 } 13180 12849 13181 12850 static void switched_to_fair(struct rq *rq, struct task_struct *p) 13182 12851 { 12852 + SCHED_WARN_ON(p->se.sched_delayed); 12853 + 13183 12854 attach_task_cfs_rq(p); 13184 12855 13185 12856 set_task_max_allowed_capacity(p); ··· 13215 12850 } 13216 12851 } 13217 12852 13218 - /* Account for a task changing its policy or group. 13219 - * 13220 - * This routine is mostly called to set cfs_rq->curr field when a task 13221 - * migrates between groups/classes. 13222 - */ 13223 - static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 12853 + static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 13224 12854 { 13225 12855 struct sched_entity *se = &p->se; 13226 12856 ··· 13228 12868 list_move(&se->group_node, &rq->cfs_tasks); 13229 12869 } 13230 12870 #endif 12871 + if (!first) 12872 + return; 12873 + 12874 + SCHED_WARN_ON(se->sched_delayed); 12875 + 12876 + if (hrtick_enabled_fair(rq)) 12877 + hrtick_start_fair(rq, p); 12878 + 12879 + update_misfit_status(p, rq); 12880 + sched_fair_update_stop_tick(rq, p); 12881 + } 12882 + 12883 + /* 12884 + * Account for a task changing its policy or group. 12885 + * 12886 + * This routine is mostly called to set cfs_rq->curr field when a task 12887 + * migrates between groups/classes. 
12888 + */ 12889 + static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 12890 + { 12891 + struct sched_entity *se = &p->se; 13231 12892 13232 12893 for_each_sched_entity(se) { 13233 12894 struct cfs_rq *cfs_rq = cfs_rq_of(se); ··· 13257 12876 /* ensure bandwidth has been allocated on our new cfs_rq */ 13258 12877 account_cfs_rq_runtime(cfs_rq, 0); 13259 12878 } 12879 + 12880 + __set_next_task_fair(rq, p, first); 13260 12881 } 13261 12882 13262 12883 void init_cfs_rq(struct cfs_rq *cfs_rq) 13263 12884 { 13264 12885 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 13265 - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); 12886 + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 13266 12887 #ifdef CONFIG_SMP 13267 12888 raw_spin_lock_init(&cfs_rq->removed.lock); 13268 12889 #endif ··· 13366 12983 13367 12984 void unregister_fair_sched_group(struct task_group *tg) 13368 12985 { 13369 - unsigned long flags; 13370 - struct rq *rq; 13371 12986 int cpu; 13372 12987 13373 12988 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); 13374 12989 13375 12990 for_each_possible_cpu(cpu) { 13376 - if (tg->se[cpu]) 13377 - remove_entity_load_avg(tg->se[cpu]); 12991 + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; 12992 + struct sched_entity *se = tg->se[cpu]; 12993 + struct rq *rq = cpu_rq(cpu); 12994 + 12995 + if (se) { 12996 + if (se->sched_delayed) { 12997 + guard(rq_lock_irqsave)(rq); 12998 + if (se->sched_delayed) { 12999 + update_rq_clock(rq); 13000 + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 13001 + } 13002 + list_del_leaf_cfs_rq(cfs_rq); 13003 + } 13004 + remove_entity_load_avg(se); 13005 + } 13378 13006 13379 13007 /* 13380 13008 * Only empty task groups can be destroyed; so we can speculatively 13381 13009 * check on_list without danger of it being re-added. 
13382 13010 */ 13383 - if (!tg->cfs_rq[cpu]->on_list) 13384 - continue; 13385 - 13386 - rq = cpu_rq(cpu); 13387 - 13388 - raw_spin_rq_lock_irqsave(rq, flags); 13389 - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 13390 - raw_spin_rq_unlock_irqrestore(rq, flags); 13011 + if (cfs_rq->on_list) { 13012 + guard(rq_lock_irqsave)(rq); 13013 + list_del_leaf_cfs_rq(cfs_rq); 13014 + } 13391 13015 } 13392 13016 } 13393 13017 ··· 13584 13194 13585 13195 .wakeup_preempt = check_preempt_wakeup_fair, 13586 13196 13197 + .pick_task = pick_task_fair, 13587 13198 .pick_next_task = __pick_next_task_fair, 13588 13199 .put_prev_task = put_prev_task_fair, 13589 13200 .set_next_task = set_next_task_fair, 13590 13201 13591 13202 #ifdef CONFIG_SMP 13592 13203 .balance = balance_fair, 13593 - .pick_task = pick_task_fair, 13594 13204 .select_task_rq = select_task_rq_fair, 13595 13205 .migrate_task_rq = migrate_task_rq_fair, 13596 13206

+28 -2

kernel/sched/features.h

··· 5 5 * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. 6 6 */ 7 7 SCHED_FEAT(PLACE_LAG, true) 8 + /* 9 + * Give new tasks half a slice to ease into the competition. 10 + */ 8 11 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) 12 + /* 13 + * Preserve relative virtual deadline on 'migration'. 14 + */ 15 + SCHED_FEAT(PLACE_REL_DEADLINE, true) 16 + /* 17 + * Inhibit (wakeup) preemption until the current task has either matched the 18 + * 0-lag point or until it has exhausted its slice. 19 + */ 9 20 SCHED_FEAT(RUN_TO_PARITY, true) 21 + /* 22 + * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for 23 + * current. 24 + */ 25 + SCHED_FEAT(PREEMPT_SHORT, true) 10 26 11 27 /* 12 28 * Prefer to schedule the task we woke last (assuming it failed ··· 36 20 * cache buddy being migrated away, increases cache locality. 37 21 */ 38 22 SCHED_FEAT(CACHE_HOT_BUDDY, true) 23 + 24 + /* 25 + * Delay dequeueing tasks until they get selected or woken. 26 + * 27 + * By delaying the dequeue for non-eligible tasks, they remain in the 28 + * competition and can burn off their negative lag. When they get selected 29 + * they'll have positive lag by definition. 30 + * 31 + * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 32 + */ 33 + SCHED_FEAT(DELAY_DEQUEUE, true) 34 + SCHED_FEAT(DELAY_ZERO, true) 39 35 40 36 /* 41 37 * Allow wakeup-time preemption of the current task: ··· 113 85 SCHED_FEAT(UTIL_EST, true) 114 86 115 87 SCHED_FEAT(LATENCY_WARN, false) 116 - 117 - SCHED_FEAT(HZ_BW, true)

+7 -16

kernel/sched/idle.c

··· 450 450 resched_curr(rq); 451 451 } 452 452 453 - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 453 + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) 454 454 { 455 + dl_server_update_idle_time(rq, prev); 455 456 } 456 457 457 458 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) 458 459 { 459 460 update_idle_core(rq); 460 461 schedstat_inc(rq->sched_goidle); 462 + next->se.exec_start = rq_clock_task(rq); 461 463 } 462 464 463 - #ifdef CONFIG_SMP 464 - static struct task_struct *pick_task_idle(struct rq *rq) 465 + struct task_struct *pick_task_idle(struct rq *rq) 465 466 { 466 467 return rq->idle; 467 - } 468 - #endif 469 - 470 - struct task_struct *pick_next_task_idle(struct rq *rq) 471 - { 472 - struct task_struct *next = rq->idle; 473 - 474 - set_next_task_idle(rq, next, true); 475 - 476 - return next; 477 468 } 478 469 479 470 /* 480 471 * It is not legal to sleep in the idle task - print a warning 481 472 * message if some code attempts to do it: 482 473 */ 483 - static void 474 + static bool 484 475 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) 485 476 { 486 477 raw_spin_rq_unlock_irq(rq); 487 478 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 488 479 dump_stack(); 489 480 raw_spin_rq_lock_irq(rq); 481 + return true; 490 482 } 491 483 492 484 /* ··· 520 528 521 529 .wakeup_preempt = wakeup_preempt_idle, 522 530 523 - .pick_next_task = pick_next_task_idle, 531 + .pick_task = pick_task_idle, 524 532 .put_prev_task = put_prev_task_idle, 525 533 .set_next_task = set_next_task_idle, 526 534 527 535 #ifdef CONFIG_SMP 528 536 .balance = balance_idle, 529 - .pick_task = pick_task_idle, 530 537 .select_task_rq = select_task_rq_idle, 531 538 .set_cpus_allowed = set_cpus_allowed_common, 532 539 #endif

+112 -149

kernel/sched/rt.c

··· 8 8 /* More than 4 hours if BW_SHIFT equals 20. */ 9 9 static const u64 max_rt_runtime = MAX_BW; 10 10 11 - static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12 - 13 - struct rt_bandwidth def_rt_bandwidth; 14 - 15 11 /* 16 12 * period over which we measure -rt task CPU usage in us. 17 13 * default: 1s ··· 61 65 } 62 66 late_initcall(sched_rt_sysctl_init); 63 67 #endif 68 + 69 + void init_rt_rq(struct rt_rq *rt_rq) 70 + { 71 + struct rt_prio_array *array; 72 + int i; 73 + 74 + array = &rt_rq->active; 75 + for (i = 0; i < MAX_RT_PRIO; i++) { 76 + INIT_LIST_HEAD(array->queue + i); 77 + __clear_bit(i, array->bitmap); 78 + } 79 + /* delimiter for bitsearch: */ 80 + __set_bit(MAX_RT_PRIO, array->bitmap); 81 + 82 + #if defined CONFIG_SMP 83 + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 84 + rt_rq->highest_prio.next = MAX_RT_PRIO-1; 85 + rt_rq->overloaded = 0; 86 + plist_head_init(&rt_rq->pushable_tasks); 87 + #endif /* CONFIG_SMP */ 88 + /* We start is dequeued state, because no RT tasks are queued */ 89 + rt_rq->rt_queued = 0; 90 + 91 + #ifdef CONFIG_RT_GROUP_SCHED 92 + rt_rq->rt_time = 0; 93 + rt_rq->rt_throttled = 0; 94 + rt_rq->rt_runtime = 0; 95 + raw_spin_lock_init(&rt_rq->rt_runtime_lock); 96 + #endif 97 + } 98 + 99 + #ifdef CONFIG_RT_GROUP_SCHED 100 + 101 + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 64 102 65 103 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 66 104 { ··· 160 130 do_start_rt_bandwidth(rt_b); 161 131 } 162 132 163 - void init_rt_rq(struct rt_rq *rt_rq) 164 - { 165 - struct rt_prio_array *array; 166 - int i; 167 - 168 - array = &rt_rq->active; 169 - for (i = 0; i < MAX_RT_PRIO; i++) { 170 - INIT_LIST_HEAD(array->queue + i); 171 - __clear_bit(i, array->bitmap); 172 - } 173 - /* delimiter for bit-search: */ 174 - __set_bit(MAX_RT_PRIO, array->bitmap); 175 - 176 - #if defined CONFIG_SMP 177 - rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 178 - rt_rq->highest_prio.next 
= MAX_RT_PRIO-1; 179 - rt_rq->overloaded = 0; 180 - plist_head_init(&rt_rq->pushable_tasks); 181 - #endif /* CONFIG_SMP */ 182 - /* We start is dequeued state, because no RT tasks are queued */ 183 - rt_rq->rt_queued = 0; 184 - 185 - rt_rq->rt_time = 0; 186 - rt_rq->rt_throttled = 0; 187 - rt_rq->rt_runtime = 0; 188 - raw_spin_lock_init(&rt_rq->rt_runtime_lock); 189 - } 190 - 191 - #ifdef CONFIG_RT_GROUP_SCHED 192 133 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 193 134 { 194 135 hrtimer_cancel(&rt_b->rt_period_timer); ··· 196 195 { 197 196 if (tg->rt_se) 198 197 destroy_rt_bandwidth(&tg->rt_bandwidth); 199 - 200 198 } 201 199 202 200 void free_rt_sched_group(struct task_group *tg) ··· 253 253 if (!tg->rt_se) 254 254 goto err; 255 255 256 - init_rt_bandwidth(&tg->rt_bandwidth, 257 - ktime_to_ns(def_rt_bandwidth.rt_period), 0); 256 + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); 258 257 259 258 for_each_possible_cpu(i) { 260 259 rt_rq = kzalloc_node(sizeof(struct rt_rq), ··· 603 604 return &rt_rq->tg->rt_bandwidth; 604 605 } 605 606 606 - #else /* !CONFIG_RT_GROUP_SCHED */ 607 - 608 - static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 609 - { 610 - return rt_rq->rt_runtime; 611 - } 612 - 613 - static inline u64 sched_rt_period(struct rt_rq *rt_rq) 614 - { 615 - return ktime_to_ns(def_rt_bandwidth.rt_period); 616 - } 617 - 618 - typedef struct rt_rq *rt_rq_iter_t; 619 - 620 - #define for_each_rt_rq(rt_rq, iter, rq) \ 621 - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 622 - 623 - #define for_each_sched_rt_entity(rt_se) \ 624 - for (; rt_se; rt_se = NULL) 625 - 626 - static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 627 - { 628 - return NULL; 629 - } 630 - 631 - static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 632 - { 633 - struct rq *rq = rq_of_rt_rq(rt_rq); 634 - 635 - if (!rt_rq->rt_nr_running) 636 - return; 637 - 638 - enqueue_top_rt_rq(rt_rq); 639 - resched_curr(rq); 640 - } 
641 - 642 - static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 643 - { 644 - dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); 645 - } 646 - 647 - static inline int rt_rq_throttled(struct rt_rq *rt_rq) 648 - { 649 - return rt_rq->rt_throttled; 650 - } 651 - 652 - static inline const struct cpumask *sched_rt_period_mask(void) 653 - { 654 - return cpu_online_mask; 655 - } 656 - 657 - static inline 658 - struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 659 - { 660 - return &cpu_rq(cpu)->rt; 661 - } 662 - 663 - static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) 664 - { 665 - return &def_rt_bandwidth; 666 - } 667 - 668 - #endif /* CONFIG_RT_GROUP_SCHED */ 669 - 670 607 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) 671 608 { 672 609 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); ··· 794 859 const struct cpumask *span; 795 860 796 861 span = sched_rt_period_mask(); 797 - #ifdef CONFIG_RT_GROUP_SCHED 862 + 798 863 /* 799 864 * FIXME: isolated CPUs should really leave the root task group, 800 865 * whether they are isolcpus or were isolated via cpusets, lest ··· 806 871 */ 807 872 if (rt_b == &root_task_group.rt_bandwidth) 808 873 span = cpu_online_mask; 809 - #endif 874 + 810 875 for_each_cpu(i, span) { 811 876 int enqueue = 0; 812 877 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); ··· 873 938 return idle; 874 939 } 875 940 876 - static inline int rt_se_prio(struct sched_rt_entity *rt_se) 877 - { 878 - #ifdef CONFIG_RT_GROUP_SCHED 879 - struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 - 881 - if (rt_rq) 882 - return rt_rq->highest_prio.curr; 883 - #endif 884 - 885 - return rt_task_of(rt_se)->prio; 886 - } 887 - 888 941 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) 889 942 { 890 943 u64 runtime = sched_rt_runtime(rt_rq); ··· 916 993 return 0; 917 994 } 918 995 996 + #else /* !CONFIG_RT_GROUP_SCHED */ 997 + 998 + typedef struct rt_rq *rt_rq_iter_t; 999 + 1000 + #define for_each_rt_rq(rt_rq, 
iter, rq) \ 1001 + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 1002 + 1003 + #define for_each_sched_rt_entity(rt_se) \ 1004 + for (; rt_se; rt_se = NULL) 1005 + 1006 + static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) 1007 + { 1008 + return NULL; 1009 + } 1010 + 1011 + static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 1012 + { 1013 + struct rq *rq = rq_of_rt_rq(rt_rq); 1014 + 1015 + if (!rt_rq->rt_nr_running) 1016 + return; 1017 + 1018 + enqueue_top_rt_rq(rt_rq); 1019 + resched_curr(rq); 1020 + } 1021 + 1022 + static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 1023 + { 1024 + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); 1025 + } 1026 + 1027 + static inline int rt_rq_throttled(struct rt_rq *rt_rq) 1028 + { 1029 + return false; 1030 + } 1031 + 1032 + static inline const struct cpumask *sched_rt_period_mask(void) 1033 + { 1034 + return cpu_online_mask; 1035 + } 1036 + 1037 + static inline 1038 + struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) 1039 + { 1040 + return &cpu_rq(cpu)->rt; 1041 + } 1042 + 1043 + #ifdef CONFIG_SMP 1044 + static void __enable_runtime(struct rq *rq) { } 1045 + static void __disable_runtime(struct rq *rq) { } 1046 + #endif 1047 + 1048 + #endif /* CONFIG_RT_GROUP_SCHED */ 1049 + 1050 + static inline int rt_se_prio(struct sched_rt_entity *rt_se) 1051 + { 1052 + #ifdef CONFIG_RT_GROUP_SCHED 1053 + struct rt_rq *rt_rq = group_rt_rq(rt_se); 1054 + 1055 + if (rt_rq) 1056 + return rt_rq->highest_prio.curr; 1057 + #endif 1058 + 1059 + return rt_task_of(rt_se)->prio; 1060 + } 1061 + 919 1062 /* 920 1063 * Update the current task's runtime statistics. Skip current tasks that 921 1064 * are not in our scheduling class. 
··· 989 1000 static void update_curr_rt(struct rq *rq) 990 1001 { 991 1002 struct task_struct *curr = rq->curr; 992 - struct sched_rt_entity *rt_se = &curr->rt; 993 1003 s64 delta_exec; 994 1004 995 1005 if (curr->sched_class != &rt_sched_class) ··· 997 1009 delta_exec = update_curr_common(rq); 998 1010 if (unlikely(delta_exec <= 0)) 999 1011 return; 1012 + 1013 + #ifdef CONFIG_RT_GROUP_SCHED 1014 + struct sched_rt_entity *rt_se = &curr->rt; 1000 1015 1001 1016 if (!rt_bandwidth_enabled()) 1002 1017 return; ··· 1019 1028 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); 1020 1029 } 1021 1030 } 1031 + #endif 1022 1032 } 1023 1033 1024 1034 static void ··· 1176 1184 static void 1177 1185 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1178 1186 { 1179 - start_rt_bandwidth(&def_rt_bandwidth); 1180 1187 } 1181 1188 1182 1189 static inline ··· 1483 1492 enqueue_pushable_task(rq, p); 1484 1493 } 1485 1494 1486 - static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1495 + static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1487 1496 { 1488 1497 struct sched_rt_entity *rt_se = &p->rt; 1489 1498 ··· 1491 1500 dequeue_rt_entity(rt_se, flags); 1492 1501 1493 1502 dequeue_pushable_task(rq, p); 1503 + 1504 + return true; 1494 1505 } 1495 1506 1496 1507 /* ··· 1748 1755 return p; 1749 1756 } 1750 1757 1751 - static struct task_struct *pick_next_task_rt(struct rq *rq) 1752 - { 1753 - struct task_struct *p = pick_task_rt(rq); 1754 - 1755 - if (p) 1756 - set_next_task_rt(rq, p, true); 1757 - 1758 - return p; 1759 - } 1760 - 1761 - static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1758 + static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) 1762 1759 { 1763 1760 struct sched_rt_entity *rt_se = &p->rt; 1764 1761 struct rt_rq *rt_rq = &rq->rt; ··· 2635 2652 2636 2653 .wakeup_preempt = wakeup_preempt_rt, 2637 2654 2638 - .pick_next_task = pick_next_task_rt, 2655 + 
.pick_task = pick_task_rt, 2639 2656 .put_prev_task = put_prev_task_rt, 2640 2657 .set_next_task = set_next_task_rt, 2641 2658 2642 2659 #ifdef CONFIG_SMP 2643 2660 .balance = balance_rt, 2644 - .pick_task = pick_task_rt, 2645 2661 .select_task_rq = select_task_rq_rt, 2646 2662 .set_cpus_allowed = set_cpus_allowed_common, 2647 2663 .rq_online = rq_online_rt, ··· 2894 2912 #ifdef CONFIG_SYSCTL 2895 2913 static int sched_rt_global_constraints(void) 2896 2914 { 2897 - unsigned long flags; 2898 - int i; 2899 - 2900 - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2901 - for_each_possible_cpu(i) { 2902 - struct rt_rq *rt_rq = &cpu_rq(i)->rt; 2903 - 2904 - raw_spin_lock(&rt_rq->rt_runtime_lock); 2905 - rt_rq->rt_runtime = global_rt_runtime(); 2906 - raw_spin_unlock(&rt_rq->rt_runtime_lock); 2907 - } 2908 - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2909 - 2910 2915 return 0; 2911 2916 } 2912 2917 #endif /* CONFIG_SYSCTL */ ··· 2913 2944 2914 2945 static void sched_rt_do_global(void) 2915 2946 { 2916 - unsigned long flags; 2917 - 2918 - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2919 - def_rt_bandwidth.rt_runtime = global_rt_runtime(); 2920 - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 2921 - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2922 2947 } 2923 2948 2924 2949 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,

+78 -23

kernel/sched/sched.h

··· 68 68 #include <linux/wait_api.h> 69 69 #include <linux/wait_bit.h> 70 70 #include <linux/workqueue_api.h> 71 + #include <linux/delayacct.h> 71 72 72 73 #include <trace/events/power.h> 73 74 #include <trace/events/sched.h> ··· 336 335 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 337 336 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); 338 337 extern int dl_bw_check_overflow(int cpu); 339 - 338 + extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); 340 339 /* 341 340 * SCHED_DEADLINE supports servers (nested scheduling) with the following 342 341 * interface: ··· 362 361 extern void dl_server_stop(struct sched_dl_entity *dl_se); 363 362 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, 364 363 dl_server_has_tasks_f has_tasks, 365 - dl_server_pick_f pick); 364 + dl_server_pick_f pick_task); 365 + 366 + extern void dl_server_update_idle_time(struct rq *rq, 367 + struct task_struct *p); 368 + extern void fair_server_init(struct rq *rq); 369 + extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); 370 + extern int dl_server_apply_params(struct sched_dl_entity *dl_se, 371 + u64 runtime, u64 period, bool init); 366 372 367 373 #ifdef CONFIG_CGROUP_SCHED 368 374 ··· 607 599 s64 avg_vruntime; 608 600 u64 avg_load; 609 601 610 - u64 exec_clock; 611 602 u64 min_vruntime; 612 603 #ifdef CONFIG_SCHED_CORE 613 604 unsigned int forceidle_seq; 614 605 u64 min_vruntime_fi; 615 - #endif 616 - 617 - #ifndef CONFIG_64BIT 618 - u64 min_vruntime_copy; 619 606 #endif 620 607 621 608 struct rb_root_cached tasks_timeline; ··· 621 618 */ 622 619 struct sched_entity *curr; 623 620 struct sched_entity *next; 624 - 625 - #ifdef CONFIG_SCHED_DEBUG 626 - unsigned int nr_spread_over; 627 - #endif 628 621 629 622 #ifdef CONFIG_SMP 630 623 /* ··· 725 726 #endif /* CONFIG_SMP */ 726 727 int rt_queued; 727 728 729 + 
#ifdef CONFIG_RT_GROUP_SCHED 728 730 int rt_throttled; 729 731 u64 rt_time; 730 732 u64 rt_runtime; 731 733 /* Nests inside the rq lock: */ 732 734 raw_spinlock_t rt_runtime_lock; 733 735 734 - #ifdef CONFIG_RT_GROUP_SCHED 735 736 unsigned int rt_nr_boosted; 736 737 737 738 struct rq *rq; ··· 819 820 820 821 static inline long se_runnable(struct sched_entity *se) 821 822 { 823 + if (se->sched_delayed) 824 + return false; 825 + 822 826 if (entity_is_task(se)) 823 827 return !!se->on_rq; 824 828 else ··· 836 834 837 835 static inline long se_runnable(struct sched_entity *se) 838 836 { 837 + if (se->sched_delayed) 838 + return false; 839 + 839 840 return !!se->on_rq; 840 841 } 841 842 ··· 1049 1044 struct rt_rq rt; 1050 1045 struct dl_rq dl; 1051 1046 1047 + struct sched_dl_entity fair_server; 1048 + 1052 1049 #ifdef CONFIG_FAIR_GROUP_SCHED 1053 1050 /* list of leaf cfs_rq on this CPU: */ 1054 1051 struct list_head leaf_cfs_rq_list; ··· 1066 1059 unsigned int nr_uninterruptible; 1067 1060 1068 1061 struct task_struct __rcu *curr; 1062 + struct sched_dl_entity *dl_server; 1069 1063 struct task_struct *idle; 1070 1064 struct task_struct *stop; 1071 1065 unsigned long next_balance; ··· 1166 1158 /* latency stats */ 1167 1159 struct sched_info rq_sched_info; 1168 1160 unsigned long long rq_cpu_time; 1169 - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ 1170 1161 1171 1162 /* sys_sched_yield() stats */ 1172 1163 unsigned int yld_count; ··· 1194 1187 /* per rq */ 1195 1188 struct rq *core; 1196 1189 struct task_struct *core_pick; 1190 + struct sched_dl_entity *core_dl_server; 1197 1191 unsigned int core_enabled; 1198 1192 unsigned int core_sched_seq; 1199 1193 struct rb_root core_tree; ··· 2255 2247 * 2256 2248 */ 2257 2249 2258 - #define DEQUEUE_SLEEP 0x01 2250 + #define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ 2259 2251 #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ 2260 2252 #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ 2261 2253 #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ 2254 + #define DEQUEUE_SPECIAL 0x10 2262 2255 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2256 + #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2263 2257 2264 2258 #define ENQUEUE_WAKEUP 0x01 2265 2259 #define ENQUEUE_RESTORE 0x02 ··· 2277 2267 #endif 2278 2268 #define ENQUEUE_INITIAL 0x80 2279 2269 #define ENQUEUE_MIGRATING 0x100 2270 + #define ENQUEUE_DELAYED 0x200 2280 2271 2281 2272 #define RETRY_TASK ((void *)-1UL) 2282 2273 ··· 2296 2285 #endif 2297 2286 2298 2287 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 2299 - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2288 + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2300 2289 void (*yield_task) (struct rq *rq); 2301 2290 bool (*yield_to_task)(struct rq *rq, struct task_struct *p); 2302 2291 2303 2292 void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); 2304 2293 2305 - struct task_struct *(*pick_next_task)(struct rq *rq); 2294 + struct task_struct *(*pick_task)(struct rq *rq); 2295 + /* 2296 + * Optional! 
When implemented pick_next_task() should be equivalent to: 2297 + * 2298 + * next = pick_task(); 2299 + * if (next) { 2300 + * put_prev_task(prev); 2301 + * set_next_task_first(next); 2302 + * } 2303 + */ 2304 + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); 2306 2305 2307 - void (*put_prev_task)(struct rq *rq, struct task_struct *p); 2306 + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); 2308 2307 void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); 2309 2308 2310 2309 #ifdef CONFIG_SMP 2311 2310 int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2312 2311 int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); 2313 - 2314 - struct task_struct * (*pick_task)(struct rq *rq); 2315 2312 2316 2313 void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 2317 2314 ··· 2364 2345 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 2365 2346 { 2366 2347 WARN_ON_ONCE(rq->curr != prev); 2367 - prev->sched_class->put_prev_task(rq, prev); 2348 + prev->sched_class->put_prev_task(rq, prev, NULL); 2368 2349 } 2369 2350 2370 2351 static inline void set_next_task(struct rq *rq, struct task_struct *next) ··· 2372 2353 next->sched_class->set_next_task(rq, next, false); 2373 2354 } 2374 2355 2356 + static inline void 2357 + __put_prev_set_next_dl_server(struct rq *rq, 2358 + struct task_struct *prev, 2359 + struct task_struct *next) 2360 + { 2361 + prev->dl_server = NULL; 2362 + next->dl_server = rq->dl_server; 2363 + rq->dl_server = NULL; 2364 + } 2365 + 2366 + static inline void put_prev_set_next_task(struct rq *rq, 2367 + struct task_struct *prev, 2368 + struct task_struct *next) 2369 + { 2370 + WARN_ON_ONCE(rq->curr != prev); 2371 + 2372 + __put_prev_set_next_dl_server(rq, prev, next); 2373 + 2374 + if (next == prev) 2375 + return; 2376 + 2377 + prev->sched_class->put_prev_task(rq, prev, next); 2378 + 
next->sched_class->set_next_task(rq, next, true); 2379 + } 2375 2380 2376 2381 /* 2377 2382 * Helper to define a sched_class instance; each one is placed in a separate ··· 2451 2408 } 2452 2409 2453 2410 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2454 - extern struct task_struct *pick_next_task_idle(struct rq *rq); 2411 + extern struct task_struct *pick_task_idle(struct rq *rq); 2455 2412 2456 2413 #define SCA_CHECK 0x01 2457 2414 #define SCA_MIGRATE_DISABLE 0x02 ··· 2558 2515 extern void resched_curr(struct rq *rq); 2559 2516 extern void resched_cpu(int cpu); 2560 2517 2561 - extern struct rt_bandwidth def_rt_bandwidth; 2562 2518 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 2563 2519 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 2564 2520 ··· 2626 2584 2627 2585 /* Check if we still need preemption */ 2628 2586 sched_update_tick_dependency(rq); 2587 + } 2588 + 2589 + static inline void __block_task(struct rq *rq, struct task_struct *p) 2590 + { 2591 + WRITE_ONCE(p->on_rq, 0); 2592 + ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2593 + if (p->sched_contributes_to_load) 2594 + rq->nr_uninterruptible++; 2595 + 2596 + if (p->in_iowait) { 2597 + atomic_inc(&rq->nr_iowait); 2598 + delayacct_blkio_start(); 2599 + } 2629 2600 } 2630 2601 2631 2602 extern void activate_task(struct rq *rq, struct task_struct *p, int flags); ··· 3662 3607 extern void __setscheduler_prio(struct task_struct *p, int prio); 3663 3608 extern void set_load_weight(struct task_struct *p, bool update_load); 3664 3609 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 3665 - extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); 3610 + extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); 3666 3611 3667 3612 extern void check_class_changed(struct rq *rq, struct task_struct *p, 3668 3613 const struct sched_class *prev_class,

+4 -14

kernel/sched/stop_task.c

··· 41 41 return rq->stop; 42 42 } 43 43 44 - static struct task_struct *pick_next_task_stop(struct rq *rq) 45 - { 46 - struct task_struct *p = pick_task_stop(rq); 47 - 48 - if (p) 49 - set_next_task_stop(rq, p, true); 50 - 51 - return p; 52 - } 53 - 54 44 static void 55 45 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 56 46 { 57 47 add_nr_running(rq, 1); 58 48 } 59 49 60 - static void 50 + static bool 61 51 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 62 52 { 63 53 sub_nr_running(rq, 1); 54 + return true; 64 55 } 65 56 66 57 static void yield_task_stop(struct rq *rq) ··· 59 68 BUG(); /* the stop task should never yield, its pointless. */ 60 69 } 61 70 62 - static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 71 + static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) 63 72 { 64 73 update_curr_common(rq); 65 74 } ··· 102 111 103 112 .wakeup_preempt = wakeup_preempt_stop, 104 113 105 - .pick_next_task = pick_next_task_stop, 114 + .pick_task = pick_task_stop, 106 115 .put_prev_task = put_prev_task_stop, 107 116 .set_next_task = set_next_task_stop, 108 117 109 118 #ifdef CONFIG_SMP 110 119 .balance = balance_stop, 111 - .pick_task = pick_task_stop, 112 120 .select_task_rq = select_task_rq_stop, 113 121 .set_cpus_allowed = set_cpus_allowed_common, 114 122 #endif

+25 -109

kernel/sched/syscalls.c

··· 57 57 * keep the priority unchanged. Otherwise, update priority 58 58 * to the normal priority: 59 59 */ 60 - if (!rt_prio(p->prio)) 60 + if (!rt_or_dl_prio(p->prio)) 61 61 return p->normal_prio; 62 62 return p->prio; 63 63 } ··· 258 258 259 259 #endif 260 260 261 - #ifdef CONFIG_SMP 262 - /* 263 - * This function computes an effective utilization for the given CPU, to be 264 - * used for frequency selection given the linear relation: f = u * f_max. 265 - * 266 - * The scheduler tracks the following metrics: 267 - * 268 - * cpu_util_{cfs,rt,dl,irq}() 269 - * cpu_bw_dl() 270 - * 271 - * Where the cfs,rt and dl util numbers are tracked with the same metric and 272 - * synchronized windows and are thus directly comparable. 273 - * 274 - * The cfs,rt,dl utilization are the running times measured with rq->clock_task 275 - * which excludes things like IRQ and steal-time. These latter are then accrued 276 - * in the IRQ utilization. 277 - * 278 - * The DL bandwidth number OTOH is not a measured metric but a value computed 279 - * based on the task model parameters and gives the minimal utilization 280 - * required to meet deadlines. 281 - */ 282 - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 283 - unsigned long *min, 284 - unsigned long *max) 285 - { 286 - unsigned long util, irq, scale; 287 - struct rq *rq = cpu_rq(cpu); 288 - 289 - scale = arch_scale_cpu_capacity(cpu); 290 - 291 - /* 292 - * Early check to see if IRQ/steal time saturates the CPU, can be 293 - * because of inaccuracies in how we track these -- see 294 - * update_irq_load_avg(). 295 - */ 296 - irq = cpu_util_irq(rq); 297 - if (unlikely(irq >= scale)) { 298 - if (min) 299 - *min = scale; 300 - if (max) 301 - *max = scale; 302 - return scale; 303 - } 304 - 305 - if (min) { 306 - /* 307 - * The minimum utilization returns the highest level between: 308 - * - the computed DL bandwidth needed with the IRQ pressure which 309 - * steals time to the deadline task. 
310 - * - The minimum performance requirement for CFS and/or RT. 311 - */ 312 - *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); 313 - 314 - /* 315 - * When an RT task is runnable and uclamp is not used, we must 316 - * ensure that the task will run at maximum compute capacity. 317 - */ 318 - if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) 319 - *min = max(*min, scale); 320 - } 321 - 322 - /* 323 - * Because the time spend on RT/DL tasks is visible as 'lost' time to 324 - * CFS tasks and we use the same metric to track the effective 325 - * utilization (PELT windows are synchronized) we can directly add them 326 - * to obtain the CPU's actual utilization. 327 - */ 328 - util = util_cfs + cpu_util_rt(rq); 329 - util += cpu_util_dl(rq); 330 - 331 - /* 332 - * The maximum hint is a soft bandwidth requirement, which can be lower 333 - * than the actual utilization because of uclamp_max requirements. 334 - */ 335 - if (max) 336 - *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); 337 - 338 - if (util >= scale) 339 - return scale; 340 - 341 - /* 342 - * There is still idle time; further improve the number by using the 343 - * IRQ metric. Because IRQ/steal time is hidden from the task clock we 344 - * need to scale the task numbers: 345 - * 346 - * max - irq 347 - * U' = irq + --------- * U 348 - * max 349 - */ 350 - util = scale_irq_capacity(util, irq, scale); 351 - util += irq; 352 - 353 - return min(scale, util); 354 - } 355 - 356 - unsigned long sched_cpu_util(int cpu) 357 - { 358 - return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); 359 - } 360 - #endif /* CONFIG_SMP */ 361 - 362 261 /** 363 262 * find_process_by_pid - find a process with a matching PID value. 364 263 * @pid: the pid in question. 
··· 300 401 301 402 p->policy = policy; 302 403 303 - if (dl_policy(policy)) 404 + if (dl_policy(policy)) { 304 405 __setparam_dl(p, attr); 305 - else if (fair_policy(policy)) 406 + } else if (fair_policy(policy)) { 306 407 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 408 + if (attr->sched_runtime) { 409 + p->se.custom_slice = 1; 410 + p->se.slice = clamp_t(u64, attr->sched_runtime, 411 + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ 412 + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ 413 + } else { 414 + p->se.custom_slice = 0; 415 + p->se.slice = sysctl_sched_base_slice; 416 + } 417 + } 307 418 308 419 /* rt-policy tasks do not have a timerslack */ 309 - if (task_is_realtime(p)) { 420 + if (rt_or_dl_task_policy(p)) { 310 421 p->timer_slack_ns = 0; 311 422 } else if (p->timer_slack_ns == 0) { 312 423 /* when switching back to non-rt policy, restore timerslack */ ··· 617 708 * but store a possible modification of reset_on_fork. 618 709 */ 619 710 if (unlikely(policy == p->policy)) { 620 - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 711 + if (fair_policy(policy) && 712 + (attr->sched_nice != task_nice(p) || 713 + (attr->sched_runtime != p->se.slice))) 621 714 goto change; 622 715 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 623 716 goto change; ··· 764 853 .sched_priority = param->sched_priority, 765 854 .sched_nice = PRIO_TO_NICE(p->static_prio), 766 855 }; 856 + 857 + if (p->se.custom_slice) 858 + attr.sched_runtime = p->se.slice; 767 859 768 860 /* Fixup the legacy SCHED_RESET_ON_FORK hack. 
*/ 769 861 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ··· 934 1020 935 1021 static void get_params(struct task_struct *p, struct sched_attr *attr) 936 1022 { 937 - if (task_has_dl_policy(p)) 1023 + if (task_has_dl_policy(p)) { 938 1024 __getparam_dl(p, attr); 939 - else if (task_has_rt_policy(p)) 1025 + } else if (task_has_rt_policy(p)) { 940 1026 attr->sched_priority = p->rt_priority; 941 - else 1027 + } else { 942 1028 attr->sched_nice = task_nice(p); 1029 + attr->sched_runtime = p->se.slice; 1030 + } 943 1031 } 944 1032 945 1033 /**

+8

kernel/sched/topology.c

··· 516 516 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 517 517 set_rq_online(rq); 518 518 519 + /* 520 + * Because the rq is not a task, dl_add_task_root_domain() did not 521 + * move the fair server bw to the rd if it already started. 522 + * Add it now. 523 + */ 524 + if (rq->fair_server.dl_server) 525 + __dl_server_attach_root(&rq->fair_server, rq); 526 + 519 527 rq_unlock_irqrestore(rq, &rf); 520 528 521 529 if (old_rd)

+1 -1

kernel/sys.c

··· 2557 2557 error = current->timer_slack_ns; 2558 2558 break; 2559 2559 case PR_SET_TIMERSLACK: 2560 - if (task_is_realtime(current)) 2560 + if (rt_or_dl_task_policy(current)) 2561 2561 break; 2562 2562 if (arg2 <= 0) 2563 2563 current->timer_slack_ns =

+1 -1

kernel/time/hrtimer.c

··· 1977 1977 * expiry. 1978 1978 */ 1979 1979 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1980 - if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) 1980 + if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) 1981 1981 mode |= HRTIMER_MODE_HARD; 1982 1982 } 1983 1983

+1 -1

kernel/trace/trace_sched_wakeup.c

··· 547 547 * - wakeup_dl handles tasks belonging to sched_dl class only. 548 548 */ 549 549 if (tracing_dl || (wakeup_dl && !dl_task(p)) || 550 - (wakeup_rt && !dl_task(p) && !rt_task(p)) || 550 + (wakeup_rt && !rt_or_dl_task(p)) || 551 551 (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) 552 552 return; 553 553

+2 -2

mm/page-writeback.c

··· 418 418 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; 419 419 420 420 tsk = current; 421 - if (rt_task(tsk)) { 421 + if (rt_or_dl_task(tsk)) { 422 422 bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; 423 423 thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; 424 424 } ··· 477 477 else 478 478 dirty = vm_dirty_ratio * node_memory / 100; 479 479 480 - if (rt_task(tsk)) 480 + if (rt_or_dl_task(tsk)) 481 481 dirty += dirty / 4; 482 482 483 483 /*

+1 -1

mm/page_alloc.c

··· 4004 4004 */ 4005 4005 if (alloc_flags & ALLOC_MIN_RESERVE) 4006 4006 alloc_flags &= ~ALLOC_CPUSET; 4007 - } else if (unlikely(rt_task(current)) && in_task()) 4007 + } else if (unlikely(rt_or_dl_task(current)) && in_task()) 4008 4008 alloc_flags |= ALLOC_MIN_RESERVE; 4009 4009 4010 4010 alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);

Configure Feed

Configure Feed