Merge tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+5

Documentation/admin-guide/kernel-parameters.txt

··· 6410 6410 sa1100ir [NET] 6411 6411 See drivers/net/irda/sa1100_ir.c. 6412 6412 6413 + sched_proxy_exec= [KNL] 6414 + Enables or disables "proxy execution" style 6415 + solution to mutex-based priority inversion. 6416 + Format: <bool> 6417 + 6413 6418 sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. 6414 6419 6415 6420 schedstats= [KNL,X86] Enable or disable scheduled statistics.

+1

MAINTAINERS

··· 22319 22319 F: include/uapi/linux/sched.h 22320 22320 F: kernel/fork.c 22321 22321 F: kernel/sched/ 22322 + F: tools/sched/ 22322 22323 22323 22324 SCHEDULER - SCHED_EXT 22324 22325 R: Tejun Heo <tj@kernel.org>

+10 -15

arch/powerpc/kernel/smp.c

··· 1700 1700 #ifdef CONFIG_SCHED_SMT 1701 1701 if (has_big_cores) { 1702 1702 pr_info("Big cores detected but using small core scheduling\n"); 1703 - powerpc_topology[i++] = (struct sched_domain_topology_level){ 1704 - smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) 1705 - }; 1703 + powerpc_topology[i++] = 1704 + SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); 1706 1705 } else { 1707 - powerpc_topology[i++] = (struct sched_domain_topology_level){ 1708 - cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) 1709 - }; 1706 + powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); 1710 1707 } 1711 1708 #endif 1712 1709 if (shared_caches) { 1713 - powerpc_topology[i++] = (struct sched_domain_topology_level){ 1714 - shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) 1715 - }; 1710 + powerpc_topology[i++] = 1711 + SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); 1716 1712 } 1713 + 1717 1714 if (has_coregroup_support()) { 1718 - powerpc_topology[i++] = (struct sched_domain_topology_level){ 1719 - cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) 1720 - }; 1715 + powerpc_topology[i++] = 1716 + SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); 1721 1717 } 1722 - powerpc_topology[i++] = (struct sched_domain_topology_level){ 1723 - cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) 1724 - }; 1718 + 1719 + powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG); 1725 1720 1726 1721 /* There must be one trailing NULL entry left. */ 1727 1722 BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);

+5 -5

arch/s390/kernel/topology.c

··· 531 531 } 532 532 533 533 static struct sched_domain_topology_level s390_topology[] = { 534 - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 535 - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 536 - { cpu_book_mask, SD_INIT_NAME(BOOK) }, 537 - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, 538 - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, 534 + SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), 535 + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 536 + SDTL_INIT(cpu_book_mask, NULL, BOOK), 537 + SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), 538 + SDTL_INIT(cpu_cpu_mask, NULL, PKG), 539 539 { NULL, }, 540 540 }; 541 541

+24 -27

arch/x86/kernel/smpboot.c

··· 478 478 */ 479 479 static bool x86_has_numa_in_package; 480 480 481 - static struct sched_domain_topology_level x86_topology[6]; 481 + static struct sched_domain_topology_level x86_topology[] = { 482 + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 483 + #ifdef CONFIG_SCHED_CLUSTER 484 + SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), 485 + #endif 486 + #ifdef CONFIG_SCHED_MC 487 + SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), 488 + #endif 489 + SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), 490 + { NULL }, 491 + }; 482 492 483 493 static void __init build_sched_topology(void) 484 494 { 485 - int i = 0; 495 + struct sched_domain_topology_level *topology = x86_topology; 486 496 487 - #ifdef CONFIG_SCHED_SMT 488 - x86_topology[i++] = (struct sched_domain_topology_level){ 489 - cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) 490 - }; 491 - #endif 492 - #ifdef CONFIG_SCHED_CLUSTER 493 - x86_topology[i++] = (struct sched_domain_topology_level){ 494 - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) 495 - }; 496 - #endif 497 - #ifdef CONFIG_SCHED_MC 498 - x86_topology[i++] = (struct sched_domain_topology_level){ 499 - cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) 500 - }; 501 - #endif 502 497 /* 503 - * When there is NUMA topology inside the package skip the PKG domain 504 - * since the NUMA domains will auto-magically create the right spanning 505 - * domains based on the SLIT. 498 + * When there is NUMA topology inside the package invalidate the 499 + * PKG domain since the NUMA domains will auto-magically create the 500 + * right spanning domains based on the SLIT. 
506 501 */ 507 - if (!x86_has_numa_in_package) { 508 - x86_topology[i++] = (struct sched_domain_topology_level){ 509 - cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) 510 - }; 502 + if (x86_has_numa_in_package) { 503 + unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; 504 + 505 + memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); 511 506 } 512 507 513 508 /* 514 - * There must be one trailing NULL entry left. 509 + * Drop the SMT domains if there is only one thread per-core 510 + * since it'll get degenerated by the scheduler anyways. 515 511 */ 516 - BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); 512 + if (cpu_smt_num_threads <= 1) 513 + ++topology; 517 514 518 - set_sched_topology(x86_topology); 515 + set_sched_topology(topology); 519 516 } 520 517 521 518 void set_cpu_sibling_map(int cpu)

-5

include/linux/cpu.h

··· 187 187 188 188 void play_idle_precise(u64 duration_ns, u64 latency_ns); 189 189 190 - static inline void play_idle(unsigned long duration_us) 191 - { 192 - play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); 193 - } 194 - 195 190 #ifdef CONFIG_HOTPLUG_CPU 196 191 void cpuhp_report_idle_dead(void); 197 192 #else

-9

include/linux/preempt.h

··· 369 369 370 370 #endif 371 371 372 - #ifdef CONFIG_SMP 373 - 374 372 /* 375 373 * Migrate-Disable and why it is undesired. 376 374 * ··· 426 428 */ 427 429 extern void migrate_disable(void); 428 430 extern void migrate_enable(void); 429 - 430 - #else 431 - 432 - static inline void migrate_disable(void) { } 433 - static inline void migrate_enable(void) { } 434 - 435 - #endif /* CONFIG_SMP */ 436 431 437 432 /** 438 433 * preempt_disable_nested - Disable preemption inside a normally preempt disabled section

+2 -4

include/linux/psi_types.h

··· 84 84 struct psi_group_cpu { 85 85 /* 1st cacheline updated by the scheduler */ 86 86 87 - /* Aggregator needs to know of concurrent changes */ 88 - seqcount_t seq ____cacheline_aligned_in_smp; 89 - 90 87 /* States of the tasks belonging to this group */ 91 - unsigned int tasks[NR_PSI_TASK_COUNTS]; 88 + unsigned int tasks[NR_PSI_TASK_COUNTS] 89 + ____cacheline_aligned_in_smp; 92 90 93 91 /* Aggregate pressure state derived from the tasks */ 94 92 u32 state_mask;

+86 -62

include/linux/sched.h

··· 34 34 #include <linux/sched/prio.h> 35 35 #include <linux/sched/types.h> 36 36 #include <linux/signal_types.h> 37 + #include <linux/spinlock.h> 37 38 #include <linux/syscall_user_dispatch_types.h> 38 39 #include <linux/mm_types_task.h> 39 40 #include <linux/netdevice_xmit.h> ··· 396 395 UCLAMP_CNT 397 396 }; 398 397 399 - #ifdef CONFIG_SMP 400 398 extern struct root_domain def_root_domain; 401 399 extern struct mutex sched_domains_mutex; 402 400 extern void sched_domains_mutex_lock(void); 403 401 extern void sched_domains_mutex_unlock(void); 404 - #else 405 - static inline void sched_domains_mutex_lock(void) { } 406 - static inline void sched_domains_mutex_unlock(void) { } 407 - #endif 408 402 409 403 struct sched_param { 410 404 int sched_priority; ··· 580 584 u64 sum_exec_runtime; 581 585 u64 prev_sum_exec_runtime; 582 586 u64 vruntime; 583 - s64 vlag; 587 + union { 588 + /* 589 + * When !@on_rq this field is vlag. 590 + * When cfs_rq->curr == se (which implies @on_rq) 591 + * this field is vprot. See protect_slice(). 592 + */ 593 + s64 vlag; 594 + u64 vprot; 595 + }; 584 596 u64 slice; 585 597 586 598 u64 nr_migrations; ··· 604 600 unsigned long runnable_weight; 605 601 #endif 606 602 607 - #ifdef CONFIG_SMP 608 603 /* 609 604 * Per entity load average tracking. 610 605 * ··· 611 608 * collide with read-mostly values above. 612 609 */ 613 610 struct sched_avg avg; 614 - #endif 615 611 }; 616 612 617 613 struct sched_rt_entity { ··· 703 701 unsigned int dl_defer : 1; 704 702 unsigned int dl_defer_armed : 1; 705 703 unsigned int dl_defer_running : 1; 704 + unsigned int dl_server_idle : 1; 706 705 707 706 /* 708 707 * Bandwidth enforcement timer. 
Each -deadline task has its ··· 841 838 struct alloc_tag *alloc_tag; 842 839 #endif 843 840 844 - #ifdef CONFIG_SMP 845 841 int on_cpu; 846 842 struct __call_single_node wake_entry; 847 843 unsigned int wakee_flips; ··· 856 854 */ 857 855 int recent_used_cpu; 858 856 int wake_cpu; 859 - #endif 860 857 int on_rq; 861 858 862 859 int prio; ··· 914 913 cpumask_t *user_cpus_ptr; 915 914 cpumask_t cpus_mask; 916 915 void *migration_pending; 917 - #ifdef CONFIG_SMP 918 916 unsigned short migration_disabled; 919 - #endif 920 917 unsigned short migration_flags; 921 918 922 919 #ifdef CONFIG_PREEMPT_RCU ··· 946 947 struct sched_info sched_info; 947 948 948 949 struct list_head tasks; 949 - #ifdef CONFIG_SMP 950 950 struct plist_node pushable_tasks; 951 951 struct rb_node pushable_dl_tasks; 952 - #endif 953 952 954 953 struct mm_struct *mm; 955 954 struct mm_struct *active_mm; ··· 1231 1234 struct rt_mutex_waiter *pi_blocked_on; 1232 1235 #endif 1233 1236 1234 - #ifdef CONFIG_DEBUG_MUTEXES 1235 - /* Mutex deadlock detection: */ 1236 - struct mutex_waiter *blocked_on; 1237 - #endif 1237 + struct mutex *blocked_on; /* lock we're blocked on */ 1238 1238 1239 1239 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER 1240 1240 /* ··· 1656 1662 randomized_struct_fields_end 1657 1663 } __attribute__ ((aligned (64))); 1658 1664 1665 + #ifdef CONFIG_SCHED_PROXY_EXEC 1666 + DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); 1667 + static inline bool sched_proxy_exec(void) 1668 + { 1669 + return static_branch_likely(&__sched_proxy_exec); 1670 + } 1671 + #else 1672 + static inline bool sched_proxy_exec(void) 1673 + { 1674 + return false; 1675 + } 1676 + #endif 1677 + 1659 1678 #define TASK_REPORT_IDLE (TASK_REPORT + 1) 1660 1679 #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) 1661 1680 ··· 1783 1776 1784 1777 static __always_inline bool is_percpu_thread(void) 1785 1778 { 1786 - #ifdef CONFIG_SMP 1787 1779 return (current->flags & PF_NO_SETAFFINITY) && 1788 1780 (current->nr_cpus_allowed == 1); 1789 - #else 
1790 - return true; 1791 - #endif 1792 1781 } 1793 1782 1794 1783 /* Per-process atomic flags. */ ··· 1849 1846 extern int task_can_attach(struct task_struct *p); 1850 1847 extern int dl_bw_alloc(int cpu, u64 dl_bw); 1851 1848 extern void dl_bw_free(int cpu, u64 dl_bw); 1852 - #ifdef CONFIG_SMP 1853 1849 1854 1850 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ 1855 1851 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ··· 1866 1864 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); 1867 1865 extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); 1868 1866 extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); 1869 - #else 1870 - static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1871 - { 1872 - } 1873 - static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1874 - { 1875 - /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ 1876 - if ((*cpumask_bits(new_mask) & 1) == 0) 1877 - return -EINVAL; 1878 - return 0; 1879 - } 1880 - static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) 1881 - { 1882 - if (src->user_cpus_ptr) 1883 - return -EINVAL; 1884 - return 0; 1885 - } 1886 - static inline void release_user_cpus_ptr(struct task_struct *p) 1887 - { 1888 - WARN_ON(p->user_cpus_ptr); 1889 - } 1890 - 1891 - static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) 1892 - { 1893 - return 0; 1894 - } 1895 - #endif 1896 1867 1897 1868 extern int yield_to(struct task_struct *p, bool preempt); 1898 1869 extern void set_user_nice(struct task_struct *p, long nice); ··· 1954 1979 extern int wake_up_process(struct task_struct *tsk); 1955 1980 extern void wake_up_new_task(struct task_struct *tsk); 1956 1981 1957 - #ifdef CONFIG_SMP 1958 1982 extern void 
kick_process(struct task_struct *tsk); 1959 - #else 1960 - static inline void kick_process(struct task_struct *tsk) { } 1961 - #endif 1962 1983 1963 1984 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); 1964 1985 #define set_task_comm(tsk, from) ({ \ ··· 1981 2010 buf; \ 1982 2011 }) 1983 2012 1984 - #ifdef CONFIG_SMP 1985 2013 static __always_inline void scheduler_ipi(void) 1986 2014 { 1987 2015 /* ··· 1990 2020 */ 1991 2021 preempt_fold_need_resched(); 1992 2022 } 1993 - #else 1994 - static inline void scheduler_ipi(void) { } 1995 - #endif 1996 2023 1997 2024 extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); 1998 2025 ··· 2132 2165 __cond_resched_rwlock_write(lock); \ 2133 2166 }) 2134 2167 2168 + #ifndef CONFIG_PREEMPT_RT 2169 + static inline struct mutex *__get_task_blocked_on(struct task_struct *p) 2170 + { 2171 + struct mutex *m = p->blocked_on; 2172 + 2173 + if (m) 2174 + lockdep_assert_held_once(&m->wait_lock); 2175 + return m; 2176 + } 2177 + 2178 + static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) 2179 + { 2180 + WARN_ON_ONCE(!m); 2181 + /* The task should only be setting itself as blocked */ 2182 + WARN_ON_ONCE(p != current); 2183 + /* Currently we serialize blocked_on under the mutex::wait_lock */ 2184 + lockdep_assert_held_once(&m->wait_lock); 2185 + /* 2186 + * Check ensure we don't overwrite existing mutex value 2187 + * with a different mutex. Note, setting it to the same 2188 + * lock repeatedly is ok. 
2189 + */ 2190 + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); 2191 + p->blocked_on = m; 2192 + } 2193 + 2194 + static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) 2195 + { 2196 + guard(raw_spinlock_irqsave)(&m->wait_lock); 2197 + __set_task_blocked_on(p, m); 2198 + } 2199 + 2200 + static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) 2201 + { 2202 + WARN_ON_ONCE(!m); 2203 + /* Currently we serialize blocked_on under the mutex::wait_lock */ 2204 + lockdep_assert_held_once(&m->wait_lock); 2205 + /* 2206 + * There may be cases where we re-clear already cleared 2207 + * blocked_on relationships, but make sure we are not 2208 + * clearing the relationship with a different lock. 2209 + */ 2210 + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); 2211 + p->blocked_on = NULL; 2212 + } 2213 + 2214 + static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) 2215 + { 2216 + guard(raw_spinlock_irqsave)(&m->wait_lock); 2217 + __clear_task_blocked_on(p, m); 2218 + } 2219 + #else 2220 + static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) 2221 + { 2222 + } 2223 + 2224 + static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) 2225 + { 2226 + } 2227 + #endif /* !CONFIG_PREEMPT_RT */ 2228 + 2135 2229 static __always_inline bool need_resched(void) 2136 2230 { 2137 2231 return unlikely(tif_need_resched()); ··· 2232 2204 extern unsigned long get_wchan(struct task_struct *p); 2233 2205 extern struct task_struct *cpu_curr_snapshot(int cpu); 2234 2206 2235 - #include <linux/spinlock.h> 2236 - 2237 2207 /* 2238 2208 * In order to reduce various lock holder preemption latencies provide an 2239 2209 * interface to see if a vCPU is currently running or not. 
··· 2254 2228 #define TASK_SIZE_OF(tsk) TASK_SIZE 2255 2229 #endif 2256 2230 2257 - #ifdef CONFIG_SMP 2258 2231 static inline bool owner_on_cpu(struct task_struct *owner) 2259 2232 { 2260 2233 /* ··· 2265 2240 2266 2241 /* Returns effective CPU energy utilization, as seen by the scheduler */ 2267 2242 unsigned long sched_cpu_util(int cpu); 2268 - #endif /* CONFIG_SMP */ 2269 2243 2270 2244 #ifdef CONFIG_SCHED_CORE 2271 2245 extern void sched_core_free(struct task_struct *tsk);

-4

include/linux/sched/deadline.h

··· 29 29 return (s64)(a - b) < 0; 30 30 } 31 31 32 - #ifdef CONFIG_SMP 33 - 34 32 struct root_domain; 35 33 extern void dl_add_task_root_domain(struct task_struct *p); 36 34 extern void dl_clear_root_domain(struct root_domain *rd); 37 35 extern void dl_clear_root_domain_cpu(int cpu); 38 - 39 - #endif /* CONFIG_SMP */ 40 36 41 37 extern u64 dl_cookie; 42 38 extern bool dl_bw_visited(int cpu, u64 cookie);

-4

include/linux/sched/idle.h

··· 11 11 CPU_MAX_IDLE_TYPES 12 12 }; 13 13 14 - #ifdef CONFIG_SMP 15 14 extern void wake_up_if_idle(int cpu); 16 - #else 17 - static inline void wake_up_if_idle(int cpu) { } 18 - #endif 19 15 20 16 /* 21 17 * Idle thread specific functions to determine the need_resched

+2 -2

include/linux/sched/nohz.h

··· 6 6 * This is the interface between the scheduler and nohz/dynticks: 7 7 */ 8 8 9 - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 9 + #ifdef CONFIG_NO_HZ_COMMON 10 10 extern void nohz_balance_enter_idle(int cpu); 11 11 extern int get_nohz_timer_target(void); 12 12 #else ··· 23 23 static inline void calc_load_nohz_stop(void) { } 24 24 #endif /* CONFIG_NO_HZ_COMMON */ 25 25 26 - #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) 26 + #ifdef CONFIG_NO_HZ_COMMON 27 27 extern void wake_up_nohz_cpu(int cpu); 28 28 #else 29 29 static inline void wake_up_nohz_cpu(int cpu) { }

-8

include/linux/sched/sd_flags.h

··· 154 154 SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) 155 155 156 156 /* 157 - * sched_groups of this level overlap 158 - * 159 - * SHARED_PARENT: Set for all NUMA levels above NODE. 160 - * NEEDS_GROUPS: Overlaps can only exist with more than one group. 161 - */ 162 - SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) 163 - 164 - /* 165 157 * Cross-node balancing 166 158 * 167 159 * SHARED_PARENT: Set for all NUMA levels above NODE.

+10 -21

include/linux/sched/task.h

··· 109 109 extern void free_task(struct task_struct *tsk); 110 110 111 111 /* sched_exec is called by processes performing an exec */ 112 - #ifdef CONFIG_SMP 113 112 extern void sched_exec(void); 114 - #else 115 - #define sched_exec() {} 116 - #endif 117 113 118 114 static inline struct task_struct *get_task_struct(struct task_struct *t) 119 115 { ··· 131 135 return; 132 136 133 137 /* 134 - * In !RT, it is always safe to call __put_task_struct(). 135 - * Under RT, we can only call it in preemptible context. 136 - */ 137 - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { 138 - static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); 139 - 140 - lock_map_acquire_try(&put_task_map); 141 - __put_task_struct(t); 142 - lock_map_release(&put_task_map); 143 - return; 144 - } 145 - 146 - /* 147 - * under PREEMPT_RT, we can't call put_task_struct 138 + * Under PREEMPT_RT, we can't call __put_task_struct 148 139 * in atomic context because it will indirectly 149 - * acquire sleeping locks. 140 + * acquire sleeping locks. The same is true if the 141 + * current process has a mutex enqueued (blocked on 142 + * a PI chain). 150 143 * 151 - * call_rcu() will schedule delayed_put_task_struct_rcu() 144 + * In !RT, it is always safe to call __put_task_struct(). 145 + * Though, in order to simplify the code, resort to the 146 + * deferred call too. 147 + * 148 + * call_rcu() will schedule __put_task_struct_rcu_cb() 152 149 * to be called in process context. 153 150 * 154 151 * __put_task_struct() is called when ··· 154 165 * 155 166 * delayed_free_task() also uses ->rcu, but it is only called 156 167 * when it fails to fork a process. Therefore, there is no 157 - * way it can conflict with put_task_struct(). 168 + * way it can conflict with __put_task_struct(). 158 169 */ 159 170 call_rcu(&t->rcu, __put_task_struct_rcu_cb); 160 171 }

+2 -37

include/linux/sched/topology.h

··· 9 9 /* 10 10 * sched-domains (multiprocessor balancing) declarations: 11 11 */ 12 - #ifdef CONFIG_SMP 13 12 14 13 /* Generate SD flag indexes */ 15 14 #define SD_FLAG(name, mflags) __##name, ··· 175 176 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 176 177 typedef int (*sched_domain_flags_f)(void); 177 178 178 - #define SDTL_OVERLAP 0x01 179 - 180 179 struct sd_data { 181 180 struct sched_domain *__percpu *sd; 182 181 struct sched_domain_shared *__percpu *sds; ··· 185 188 struct sched_domain_topology_level { 186 189 sched_domain_mask_f mask; 187 190 sched_domain_flags_f sd_flags; 188 - int flags; 189 191 int numa_level; 190 192 struct sd_data data; 191 193 char *name; ··· 193 197 extern void __init set_sched_topology(struct sched_domain_topology_level *tl); 194 198 extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); 195 199 196 - 197 - # define SD_INIT_NAME(type) .name = #type 198 - 199 - #else /* CONFIG_SMP */ 200 - 201 - struct sched_domain_attr; 202 - 203 - static inline void 204 - partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 205 - struct sched_domain_attr *dattr_new) 206 - { 207 - } 208 - 209 - static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) 210 - { 211 - return true; 212 - } 213 - 214 - static inline bool cpus_share_cache(int this_cpu, int that_cpu) 215 - { 216 - return true; 217 - } 218 - 219 - static inline bool cpus_share_resources(int this_cpu, int that_cpu) 220 - { 221 - return true; 222 - } 223 - 224 - static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) 225 - { 226 - } 227 - 228 - #endif /* !CONFIG_SMP */ 200 + #define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ 201 + { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) 229 202 230 203 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 231 204 extern void rebuild_sched_domains_energy(void);

+15

init/Kconfig

··· 142 142 config RUSTC_HAS_UNNECESSARY_TRANSMUTES 143 143 def_bool RUSTC_VERSION >= 108800 144 144 145 + config RUSTC_HAS_FILE_WITH_NUL 146 + def_bool RUSTC_VERSION >= 108900 147 + 145 148 config PAHOLE_VERSION 146 149 int 147 150 default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) ··· 877 874 precision. 878 875 879 876 If in doubt, use the default value. 877 + 878 + config SCHED_PROXY_EXEC 879 + bool "Proxy Execution" 880 + # Avoid some build failures w/ PREEMPT_RT until it can be fixed 881 + depends on !PREEMPT_RT 882 + # Need to investigate how to inform sched_ext of split contexts 883 + depends on !SCHED_CLASS_EXT 884 + # Not particularly useful until we get to multi-rq proxying 885 + depends on EXPERT 886 + help 887 + This option enables proxy execution, a mechanism for mutex-owning 888 + tasks to inherit the scheduling context of higher priority waiters. 880 889 881 890 endmenu 882 891

+1 -2

kernel/fork.c

··· 2127 2127 lockdep_init_task(p); 2128 2128 #endif 2129 2129 2130 - #ifdef CONFIG_DEBUG_MUTEXES 2131 2130 p->blocked_on = NULL; /* not blocked yet */ 2132 - #endif 2131 + 2133 2132 #ifdef CONFIG_BCACHE 2134 2133 p->sequential_io = 0; 2135 2134 p->sequential_io_avg = 0;

+5 -4

kernel/locking/mutex-debug.c

··· 53 53 { 54 54 lockdep_assert_held(&lock->wait_lock); 55 55 56 - /* Mark the current thread as blocked on the lock: */ 57 - task->blocked_on = waiter; 56 + /* Current thread can't be already blocked (since it's executing!) */ 57 + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); 58 58 } 59 59 60 60 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 61 61 struct task_struct *task) 62 62 { 63 + struct mutex *blocked_on = __get_task_blocked_on(task); 64 + 63 65 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); 64 66 DEBUG_LOCKS_WARN_ON(waiter->task != task); 65 - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); 66 - task->blocked_on = NULL; 67 + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); 67 68 68 69 INIT_LIST_HEAD(&waiter->list); 69 70 waiter->task = NULL;

+18

kernel/locking/mutex.c

··· 644 644 goto err_early_kill; 645 645 } 646 646 647 + __set_task_blocked_on(current, lock); 647 648 set_current_state(state); 648 649 trace_contention_begin(lock, LCB_F_MUTEX); 649 650 for (;;) { ··· 681 680 682 681 first = __mutex_waiter_is_first(lock, &waiter); 683 682 683 + /* 684 + * As we likely have been woken up by task 685 + * that has cleared our blocked_on state, re-set 686 + * it to the lock we are trying to acquire. 687 + */ 688 + set_task_blocked_on(current, lock); 684 689 set_current_state(state); 685 690 /* 686 691 * Here we order against unlock; we must either see it change ··· 698 691 699 692 if (first) { 700 693 trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); 694 + /* 695 + * mutex_optimistic_spin() can call schedule(), so 696 + * clear blocked on so we don't become unselectable 697 + * to run. 698 + */ 699 + clear_task_blocked_on(current, lock); 701 700 if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) 702 701 break; 702 + set_task_blocked_on(current, lock); 703 703 trace_contention_begin(lock, LCB_F_MUTEX); 704 704 } 705 705 ··· 714 700 } 715 701 raw_spin_lock_irqsave(&lock->wait_lock, flags); 716 702 acquired: 703 + __clear_task_blocked_on(current, lock); 717 704 __set_current_state(TASK_RUNNING); 718 705 719 706 if (ww_ctx) { ··· 744 729 return 0; 745 730 746 731 err: 732 + __clear_task_blocked_on(current, lock); 747 733 __set_current_state(TASK_RUNNING); 748 734 __mutex_remove_waiter(lock, &waiter); 749 735 err_early_kill: 736 + WARN_ON(__get_task_blocked_on(current)); 750 737 trace_contention_end(lock, ret); 751 738 raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); 752 739 debug_mutex_free_waiter(&waiter); ··· 959 942 next = waiter->task; 960 943 961 944 debug_mutex_wake_waiter(lock, waiter); 945 + __clear_task_blocked_on(next, lock); 962 946 wake_q_add(&wake_q, next); 963 947 } 964 948

+2 -1

kernel/locking/mutex.h

··· 6 6 * 7 7 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 8 */ 9 - 9 + #ifndef CONFIG_PREEMPT_RT 10 10 /* 11 11 * This is the control structure for tasks blocked on mutex, which resides 12 12 * on the blocked task's kernel stack: ··· 70 70 # define debug_mutex_unlock(lock) do { } while (0) 71 71 # define debug_mutex_init(lock, name, key) do { } while (0) 72 72 #endif /* !CONFIG_DEBUG_MUTEXES */ 73 + #endif /* CONFIG_PREEMPT_RT */

+14 -2

kernel/locking/ww_mutex.h

··· 284 284 #ifndef WW_RT 285 285 debug_mutex_wake_waiter(lock, waiter); 286 286 #endif 287 + /* 288 + * When waking up the task to die, be sure to clear the 289 + * blocked_on pointer. Otherwise we can see circular 290 + * blocked_on relationships that can't resolve. 291 + */ 292 + __clear_task_blocked_on(waiter->task, lock); 287 293 wake_q_add(wake_q, waiter->task); 288 294 } 289 295 ··· 337 331 * it's wounded in __ww_mutex_check_kill() or has a 338 332 * wakeup pending to re-read the wounded state. 339 333 */ 340 - if (owner != current) 334 + if (owner != current) { 335 + /* 336 + * When waking up the task to wound, be sure to clear the 337 + * blocked_on pointer. Otherwise we can see circular 338 + * blocked_on relationships that can't resolve. 339 + */ 340 + __clear_task_blocked_on(owner, lock); 341 341 wake_q_add(wake_q, owner); 342 - 342 + } 343 343 return true; 344 344 } 345 345

+6 -3

kernel/sched/autogroup.c

··· 4 4 * Auto-group scheduling implementation: 5 5 */ 6 6 7 + #include "autogroup.h" 8 + #include "sched.h" 9 + 7 10 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 8 11 static struct autogroup autogroup_default; 9 12 static atomic_t autogroup_seq_nr; ··· 28 25 { 29 26 register_sysctl_init("kernel", sched_autogroup_sysctls); 30 27 } 31 - #else 28 + #else /* !CONFIG_SYSCTL: */ 32 29 #define sched_autogroup_sysctl_init() do { } while (0) 33 - #endif 30 + #endif /* !CONFIG_SYSCTL */ 34 31 35 32 void __init autogroup_init(struct task_struct *init_task) 36 33 { ··· 111 108 free_rt_sched_group(tg); 112 109 tg->rt_se = root_task_group.rt_se; 113 110 tg->rt_rq = root_task_group.rt_rq; 114 - #endif 111 + #endif /* CONFIG_RT_GROUP_SCHED */ 115 112 tg->autogroup = ag; 116 113 117 114 sched_online_group(tg, &root_task_group);

+4 -2

kernel/sched/autogroup.h

··· 2 2 #ifndef _KERNEL_SCHED_AUTOGROUP_H 3 3 #define _KERNEL_SCHED_AUTOGROUP_H 4 4 5 + #include "sched.h" 6 + 5 7 #ifdef CONFIG_SCHED_AUTOGROUP 6 8 7 9 struct autogroup { ··· 43 41 44 42 extern int autogroup_path(struct task_group *tg, char *buf, int buflen); 45 43 46 - #else /* !CONFIG_SCHED_AUTOGROUP */ 44 + #else /* !CONFIG_SCHED_AUTOGROUP: */ 47 45 48 46 static inline void autogroup_init(struct task_struct *init_task) { } 49 47 static inline void autogroup_free(struct task_group *tg) { } ··· 63 61 return 0; 64 62 } 65 63 66 - #endif /* CONFIG_SCHED_AUTOGROUP */ 64 + #endif /* !CONFIG_SCHED_AUTOGROUP */ 67 65 68 66 #endif /* _KERNEL_SCHED_AUTOGROUP_H */

+2 -4

kernel/sched/build_policy.c

··· 50 50 #include "idle.c" 51 51 52 52 #include "rt.c" 53 + #include "cpudeadline.c" 53 54 54 - #ifdef CONFIG_SMP 55 - # include "cpudeadline.c" 56 - # include "pelt.c" 57 - #endif 55 + #include "pelt.c" 58 56 59 57 #include "cputime.c" 60 58 #include "deadline.c"

+4 -5

kernel/sched/build_utility.c

··· 80 80 #include "wait_bit.c" 81 81 #include "wait.c" 82 82 83 - #ifdef CONFIG_SMP 84 - # include "cpupri.c" 85 - # include "stop_task.c" 86 - # include "topology.c" 87 - #endif 83 + #include "cpupri.c" 84 + #include "stop_task.c" 85 + 86 + #include "topology.c" 88 87 89 88 #ifdef CONFIG_SCHED_CORE 90 89 # include "core_sched.c"

+5 -2

kernel/sched/clock.c

··· 54 54 * 55 55 */ 56 56 57 + #include <linux/sched/clock.h> 58 + #include "sched.h" 59 + 57 60 /* 58 61 * Scheduler clock - returns current time in nanosec units. 59 62 * This is default implementation. ··· 474 471 } 475 472 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 476 473 477 - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 474 + #else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ 478 475 479 476 void __init sched_clock_init(void) 480 477 { ··· 492 489 return sched_clock(); 493 490 } 494 491 495 - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 492 + #endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 496 493 497 494 /* 498 495 * Running clock - returns the time that has elapsed while a guest has been

+5

kernel/sched/completion.c

··· 13 13 * Waiting for completion is a typically sync point, but not an exclusion point. 14 14 */ 15 15 16 + #include <linux/linkage.h> 17 + #include <linux/sched/debug.h> 18 + #include <linux/completion.h> 19 + #include "sched.h" 20 + 16 21 static void complete_with_flags(struct completion *x, int wake_flags) 17 22 { 18 23 unsigned long flags;

+518 -353

kernel/sched/core.c

··· 96 96 #include "../workqueue_internal.h" 97 97 #include "../../io_uring/io-wq.h" 98 98 #include "../smpboot.h" 99 + #include "../locking/mutex.h" 99 100 100 101 EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); 101 102 EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); ··· 119 118 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); 120 119 121 120 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 121 + 122 + #ifdef CONFIG_SCHED_PROXY_EXEC 123 + DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); 124 + static int __init setup_proxy_exec(char *str) 125 + { 126 + bool proxy_enable = true; 127 + 128 + if (*str && kstrtobool(str + 1, &proxy_enable)) { 129 + pr_warn("Unable to parse sched_proxy_exec=\n"); 130 + return 0; 131 + } 132 + 133 + if (proxy_enable) { 134 + pr_info("sched_proxy_exec enabled via boot arg\n"); 135 + static_branch_enable(&__sched_proxy_exec); 136 + } else { 137 + pr_info("sched_proxy_exec disabled via boot arg\n"); 138 + static_branch_disable(&__sched_proxy_exec); 139 + } 140 + return 1; 141 + } 142 + #else 143 + static int __init setup_proxy_exec(char *str) 144 + { 145 + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); 146 + return 0; 147 + } 148 + #endif 149 + __setup("sched_proxy_exec", setup_proxy_exec); 122 150 123 151 /* 124 152 * Debugging: various feature bits ··· 511 481 schedule_work(&_work); 512 482 } 513 483 514 - #else /* !CONFIG_SCHED_CORE */ 484 + #else /* !CONFIG_SCHED_CORE: */ 515 485 516 486 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } 517 487 static inline void 518 488 sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } 519 489 520 - #endif /* CONFIG_SCHED_CORE */ 490 + #endif /* !CONFIG_SCHED_CORE */ 521 491 522 492 /* need a wrapper since we may need to trace from modules */ 523 493 EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); ··· 680 650 raw_spin_unlock(rq_lockp(rq)); 681 651 } 682 652 683 - #ifdef CONFIG_SMP 684 653 /* 685 654 * 
double_rq_lock - safely lock two runqueues 686 655 */ ··· 696 667 697 668 double_rq_clock_clear_update(rq1, rq2); 698 669 } 699 - #endif 700 670 701 671 /* 702 672 * __task_rq_lock - lock the rq @p resides on. ··· 881 853 return HRTIMER_NORESTART; 882 854 } 883 855 884 - #ifdef CONFIG_SMP 885 - 886 856 static void __hrtick_restart(struct rq *rq) 887 857 { 888 858 struct hrtimer *timer = &rq->hrtick_timer; ··· 925 899 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 926 900 } 927 901 928 - #else 929 - /* 930 - * Called to set the hrtick timer state. 931 - * 932 - * called with rq->lock held and IRQs disabled 933 - */ 934 - void hrtick_start(struct rq *rq, u64 delay) 935 - { 936 - /* 937 - * Don't schedule slices shorter than 10000ns, that just 938 - * doesn't make sense. Rely on vruntime for fairness. 939 - */ 940 - delay = max_t(u64, delay, 10000LL); 941 - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), 942 - HRTIMER_MODE_REL_PINNED_HARD); 943 - } 944 - 945 - #endif /* CONFIG_SMP */ 946 - 947 902 static void hrtick_rq_init(struct rq *rq) 948 903 { 949 - #ifdef CONFIG_SMP 950 904 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); 951 - #endif 952 905 hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); 953 906 } 954 - #else /* CONFIG_SCHED_HRTICK */ 907 + #else /* !CONFIG_SCHED_HRTICK: */ 955 908 static inline void hrtick_clear(struct rq *rq) 956 909 { 957 910 } ··· 938 933 static inline void hrtick_rq_init(struct rq *rq) 939 934 { 940 935 } 941 - #endif /* CONFIG_SCHED_HRTICK */ 936 + #endif /* !CONFIG_SCHED_HRTICK */ 942 937 943 938 /* 944 939 * try_cmpxchg based fetch_or() macro so it works for different integer types: ··· 954 949 _val; \ 955 950 }) 956 951 957 - #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) 952 + #ifdef TIF_POLLING_NRFLAG 958 953 /* 959 954 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 960 955 * this avoids any races wrt polling state changes and thereby avoids ··· 993 988 
return true; 994 989 } 995 990 996 - #ifdef CONFIG_SMP 997 991 static inline bool set_nr_if_polling(struct task_struct *p) 998 992 { 999 993 return false; 1000 994 } 1001 - #endif 1002 995 #endif 1003 996 1004 997 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ··· 1170 1167 raw_spin_rq_unlock_irqrestore(rq, flags); 1171 1168 } 1172 1169 1173 - #ifdef CONFIG_SMP 1174 1170 #ifdef CONFIG_NO_HZ_COMMON 1175 1171 /* 1176 1172 * In the semi idle case, use the nearest busy CPU for migrating timers ··· 1376 1374 return true; 1377 1375 } 1378 1376 #endif /* CONFIG_NO_HZ_FULL */ 1379 - #endif /* CONFIG_SMP */ 1380 1377 1381 - #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 1382 - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 1378 + #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) 1383 1379 /* 1384 1380 * Iterate task_group tree rooted at *from, calling @down when first entering a 1385 1381 * node and @up when leaving it for the final time. 
··· 1971 1971 sysctl_sched_uclamp_util_min_rt_default = old_min_rt; 1972 1972 return result; 1973 1973 } 1974 - #endif 1974 + #endif /* CONFIG_SYSCTL */ 1975 1975 1976 1976 static void uclamp_fork(struct task_struct *p) 1977 1977 { ··· 2037 2037 } 2038 2038 } 2039 2039 2040 - #else /* !CONFIG_UCLAMP_TASK */ 2040 + #else /* !CONFIG_UCLAMP_TASK: */ 2041 2041 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } 2042 2042 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } 2043 2043 static inline void uclamp_fork(struct task_struct *p) { } 2044 2044 static inline void uclamp_post_fork(struct task_struct *p) { } 2045 2045 static inline void init_uclamp(void) { } 2046 - #endif /* CONFIG_UCLAMP_TASK */ 2046 + #endif /* !CONFIG_UCLAMP_TASK */ 2047 2047 2048 2048 bool sched_task_on_rq(struct task_struct *p) 2049 2049 { ··· 2352 2352 2353 2353 return ncsw; 2354 2354 } 2355 - 2356 - #ifdef CONFIG_SMP 2357 2355 2358 2356 static void 2359 2357 __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); ··· 2934 2936 struct set_affinity_pending my_pending = { }, *pending = NULL; 2935 2937 bool stop_pending, complete = false; 2936 2938 2937 - /* Can the task run on the task's current CPU? If so, we're done */ 2938 - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { 2939 + /* 2940 + * Can the task run on the task's current CPU? 
If so, we're done 2941 + * 2942 + * We are also done if the task is the current donor, boosting a lock- 2943 + * holding proxy, (and potentially has been migrated outside its 2944 + * current or previous affinity mask) 2945 + */ 2946 + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || 2947 + (task_current_donor(rq, p) && !task_current(rq, p))) { 2939 2948 struct task_struct *push_task = NULL; 2940 2949 2941 2950 if ((flags & SCA_MIGRATE_ENABLE) && ··· 3310 3305 WARN_ON_ONCE(ret); 3311 3306 } 3312 3307 3308 + #ifdef CONFIG_SMP 3309 + 3313 3310 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 3314 3311 { 3315 3312 unsigned int state = READ_ONCE(p->__state); ··· 3365 3358 3366 3359 __set_task_cpu(p, new_cpu); 3367 3360 } 3361 + #endif /* CONFIG_SMP */ 3368 3362 3369 3363 #ifdef CONFIG_NUMA_BALANCING 3370 3364 static void __migrate_swap_task(struct task_struct *p, int cpu) ··· 3665 3657 } 3666 3658 } 3667 3659 3668 - #else /* CONFIG_SMP */ 3669 - 3670 - static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } 3671 - 3672 - static inline bool rq_has_pinned_tasks(struct rq *rq) 3673 - { 3674 - return false; 3675 - } 3676 - 3677 - #endif /* !CONFIG_SMP */ 3678 - 3679 3660 static void 3680 3661 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 3681 3662 { ··· 3675 3678 3676 3679 rq = this_rq(); 3677 3680 3678 - #ifdef CONFIG_SMP 3679 3681 if (cpu == rq->cpu) { 3680 3682 __schedstat_inc(rq->ttwu_local); 3681 3683 __schedstat_inc(p->stats.nr_wakeups_local); ··· 3694 3698 3695 3699 if (wake_flags & WF_MIGRATED) 3696 3700 __schedstat_inc(p->stats.nr_wakeups_migrate); 3697 - #endif /* CONFIG_SMP */ 3698 3701 3699 3702 __schedstat_inc(rq->ttwu_count); 3700 3703 __schedstat_inc(p->stats.nr_wakeups); ··· 3722 3727 if (p->sched_contributes_to_load) 3723 3728 rq->nr_uninterruptible--; 3724 3729 3725 - #ifdef CONFIG_SMP 3726 3730 if (wake_flags & WF_RQ_SELECTED) 3727 3731 en_flags |= ENQUEUE_RQ_SELECTED; 3728 3732 if (wake_flags & 
WF_MIGRATED) 3729 3733 en_flags |= ENQUEUE_MIGRATED; 3730 3734 else 3731 - #endif 3732 3735 if (p->in_iowait) { 3733 3736 delayacct_blkio_end(p); 3734 3737 atomic_dec(&task_rq(p)->nr_iowait); ··· 3737 3744 3738 3745 ttwu_do_wakeup(p); 3739 3746 3740 - #ifdef CONFIG_SMP 3741 3747 if (p->sched_class->task_woken) { 3742 3748 /* 3743 3749 * Our task @p is fully woken up and running; so it's safe to ··· 3758 3766 3759 3767 rq->idle_stamp = 0; 3760 3768 } 3761 - #endif 3762 3769 } 3763 3770 3764 3771 /* ··· 3811 3820 return ret; 3812 3821 } 3813 3822 3814 - #ifdef CONFIG_SMP 3815 3823 void sched_ttwu_pending(void *arg) 3816 3824 { 3817 3825 struct llist_node *llist = arg; ··· 3877 3887 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); 3878 3888 3879 3889 WRITE_ONCE(rq->ttwu_pending, 1); 3890 + #ifdef CONFIG_SMP 3880 3891 __smp_call_single_queue(cpu, &p->wake_entry.llist); 3892 + #endif 3881 3893 } 3882 3894 3883 3895 void wake_up_if_idle(int cpu) ··· 3984 3992 3985 3993 return false; 3986 3994 } 3987 - 3988 - #else /* !CONFIG_SMP */ 3989 - 3990 - static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) 3991 - { 3992 - return false; 3993 - } 3994 - 3995 - #endif /* CONFIG_SMP */ 3996 3995 3997 3996 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 3998 3997 { ··· 4240 4257 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) 4241 4258 break; 4242 4259 4243 - #ifdef CONFIG_SMP 4244 4260 /* 4245 4261 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be 4246 4262 * possible to, falsely, observe p->on_cpu == 0. 
··· 4318 4336 psi_ttwu_dequeue(p); 4319 4337 set_task_cpu(p, cpu); 4320 4338 } 4321 - #else 4322 - cpu = task_cpu(p); 4323 - #endif /* CONFIG_SMP */ 4324 4339 4325 4340 ttwu_queue(p, cpu, wake_flags); 4326 4341 } ··· 4350 4371 if (p->on_rq) 4351 4372 return true; 4352 4373 4353 - #ifdef CONFIG_SMP 4354 4374 /* 4355 4375 * Ensure the task has finished __schedule() and will not be referenced 4356 4376 * anymore. Again, see try_to_wake_up() for a longer comment. 4357 4377 */ 4358 4378 smp_rmb(); 4359 4379 smp_cond_load_acquire(&p->on_cpu, !VAL); 4360 - #endif 4361 4380 4362 4381 return false; 4363 4382 } ··· 4511 4534 p->capture_control = NULL; 4512 4535 #endif 4513 4536 init_numa_balancing(clone_flags, p); 4514 - #ifdef CONFIG_SMP 4515 4537 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4516 4538 p->migration_pending = NULL; 4517 - #endif 4518 4539 init_sched_mm_cid(p); 4519 4540 } 4520 4541 ··· 4575 4600 } 4576 4601 return err; 4577 4602 } 4578 - #endif 4579 - #endif 4603 + #endif /* CONFIG_PROC_SYSCTL */ 4604 + #endif /* CONFIG_NUMA_BALANCING */ 4580 4605 4581 4606 #ifdef CONFIG_SCHEDSTATS 4582 4607 ··· 4763 4788 if (likely(sched_info_on())) 4764 4789 memset(&p->sched_info, 0, sizeof(p->sched_info)); 4765 4790 #endif 4766 - #if defined(CONFIG_SMP) 4767 4791 p->on_cpu = 0; 4768 - #endif 4769 4792 init_task_preempt_count(p); 4770 - #ifdef CONFIG_SMP 4771 4793 plist_node_init(&p->pushable_tasks, MAX_PRIO); 4772 4794 RB_CLEAR_NODE(&p->pushable_dl_tasks); 4773 - #endif 4795 + 4774 4796 return 0; 4775 4797 } 4776 4798 ··· 4844 4872 4845 4873 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 4846 4874 WRITE_ONCE(p->__state, TASK_RUNNING); 4847 - #ifdef CONFIG_SMP 4848 4875 /* 4849 4876 * Fork balancing, do it here and not earlier because: 4850 4877 * - cpus_ptr can change in the fork path ··· 4855 4884 p->recent_used_cpu = task_cpu(p); 4856 4885 rseq_migrate(p); 4857 4886 __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4858 - #endif 4859 4887 rq = __task_rq_lock(p, 
&rf); 4860 4888 update_rq_clock(rq); 4861 4889 post_init_entity_util_avg(p); ··· 4862 4892 activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); 4863 4893 trace_sched_wakeup_new(p); 4864 4894 wakeup_preempt(rq, p, wake_flags); 4865 - #ifdef CONFIG_SMP 4866 4895 if (p->sched_class->task_woken) { 4867 4896 /* 4868 4897 * Nothing relies on rq->lock after this, so it's fine to ··· 4871 4902 p->sched_class->task_woken(rq, p); 4872 4903 rq_repin_lock(rq, &rf); 4873 4904 } 4874 - #endif 4875 4905 task_rq_unlock(rq, p, &rf); 4876 4906 } 4877 4907 ··· 4947 4979 __fire_sched_out_preempt_notifiers(curr, next); 4948 4980 } 4949 4981 4950 - #else /* !CONFIG_PREEMPT_NOTIFIERS */ 4982 + #else /* !CONFIG_PREEMPT_NOTIFIERS: */ 4951 4983 4952 4984 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 4953 4985 { ··· 4959 4991 { 4960 4992 } 4961 4993 4962 - #endif /* CONFIG_PREEMPT_NOTIFIERS */ 4994 + #endif /* !CONFIG_PREEMPT_NOTIFIERS */ 4963 4995 4964 4996 static inline void prepare_task(struct task_struct *next) 4965 4997 { 4966 - #ifdef CONFIG_SMP 4967 4998 /* 4968 4999 * Claim the task as running, we do this before switching to it 4969 5000 * such that any running task will have this set. ··· 4971 5004 * its ordering comment. 4972 5005 */ 4973 5006 WRITE_ONCE(next->on_cpu, 1); 4974 - #endif 4975 5007 } 4976 5008 4977 5009 static inline void finish_task(struct task_struct *prev) 4978 5010 { 4979 - #ifdef CONFIG_SMP 4980 5011 /* 4981 5012 * This must be the very last reference to @prev from this CPU. After 4982 5013 * p->on_cpu is cleared, the task can be moved to a different CPU. We ··· 4987 5022 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
4988 5023 */ 4989 5024 smp_store_release(&prev->on_cpu, 0); 4990 - #endif 4991 5025 } 4992 - 4993 - #ifdef CONFIG_SMP 4994 5026 4995 5027 static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) 4996 5028 { ··· 5069 5107 raw_spin_rq_unlock_irqrestore(rq, flags); 5070 5108 } 5071 5109 } 5072 - 5073 - #else 5074 - 5075 - static inline void __balance_callbacks(struct rq *rq) 5076 - { 5077 - } 5078 - 5079 - #endif 5080 5110 5081 5111 static inline void 5082 5112 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) ··· 5457 5503 return sum; 5458 5504 } 5459 5505 5460 - #ifdef CONFIG_SMP 5461 - 5462 5506 /* 5463 5507 * sched_exec - execve() is a valuable balancing opportunity, because at 5464 5508 * this point the task has the smallest effective memory and cache footprint. ··· 5479 5527 } 5480 5528 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 5481 5529 } 5482 - 5483 - #endif 5484 5530 5485 5531 DEFINE_PER_CPU(struct kernel_stat, kstat); 5486 5532 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ··· 5514 5564 struct rq *rq; 5515 5565 u64 ns; 5516 5566 5517 - #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 5567 + #ifdef CONFIG_64BIT 5518 5568 /* 5519 5569 * 64-bit doesn't need locks to atomically read a 64-bit value. 5520 5570 * So we have a optimization chance when the task's delta_exec is 0. 
··· 5641 5691 if (donor->flags & PF_WQ_WORKER) 5642 5692 wq_worker_tick(donor); 5643 5693 5644 - #ifdef CONFIG_SMP 5645 5694 if (!scx_switched_all()) { 5646 5695 rq->idle_balance = idle_cpu(cpu); 5647 5696 sched_balance_trigger(rq); 5648 5697 } 5649 - #endif 5650 5698 } 5651 5699 5652 5700 #ifdef CONFIG_NO_HZ_FULL ··· 5784 5836 return 0; 5785 5837 } 5786 5838 5787 - #else /* !CONFIG_NO_HZ_FULL */ 5839 + #else /* !CONFIG_NO_HZ_FULL: */ 5788 5840 static inline void sched_tick_start(int cpu) { } 5789 5841 static inline void sched_tick_stop(int cpu) { } 5790 - #endif 5842 + #endif /* !CONFIG_NO_HZ_FULL */ 5791 5843 5792 5844 #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5793 5845 defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ··· 6502 6554 rq->core = rq; 6503 6555 } 6504 6556 6505 - #else /* !CONFIG_SCHED_CORE */ 6557 + #else /* !CONFIG_SCHED_CORE: */ 6506 6558 6507 6559 static inline void sched_core_cpu_starting(unsigned int cpu) {} 6508 6560 static inline void sched_core_cpu_deactivate(unsigned int cpu) {} ··· 6514 6566 return __pick_next_task(rq, prev, rf); 6515 6567 } 6516 6568 6517 - #endif /* CONFIG_SCHED_CORE */ 6569 + #endif /* !CONFIG_SCHED_CORE */ 6518 6570 6519 6571 /* 6520 6572 * Constants for the sched_mode argument of __schedule(). ··· 6530 6582 /* 6531 6583 * Helper function for __schedule() 6532 6584 * 6533 - * If a task does not have signals pending, deactivate it 6534 - * Otherwise marks the task's __state as RUNNING 6585 + * Tries to deactivate the task, unless the should_block arg 6586 + * is false or if a signal is pending. In the case a signal 6587 + * is pending, marks the task's __state as RUNNING (and clear 6588 + * blocked_on). 
6535 6589 */ 6536 6590 static bool try_to_block_task(struct rq *rq, struct task_struct *p, 6537 - unsigned long *task_state_p) 6591 + unsigned long *task_state_p, bool should_block) 6538 6592 { 6539 6593 unsigned long task_state = *task_state_p; 6540 6594 int flags = DEQUEUE_NOCLOCK; ··· 6546 6596 *task_state_p = TASK_RUNNING; 6547 6597 return false; 6548 6598 } 6599 + 6600 + /* 6601 + * We check should_block after signal_pending because we 6602 + * will want to wake the task in that case. But if 6603 + * should_block is false, it's likely due to the task being 6604 + * blocked on a mutex, and we want to keep it on the runqueue 6605 + * to be selectable for proxy-execution. 6606 + */ 6607 + if (!should_block) 6608 + return false; 6549 6609 6550 6610 p->sched_contributes_to_load = 6551 6611 (task_state & TASK_UNINTERRUPTIBLE) && ··· 6578 6618 */ 6579 6619 block_task(rq, p, flags); 6580 6620 return true; 6621 + } 6622 + 6623 + #ifdef CONFIG_SCHED_PROXY_EXEC 6624 + static inline struct task_struct *proxy_resched_idle(struct rq *rq) 6625 + { 6626 + put_prev_set_next_task(rq, rq->donor, rq->idle); 6627 + rq_set_donor(rq, rq->idle); 6628 + set_tsk_need_resched(rq->idle); 6629 + return rq->idle; 6630 + } 6631 + 6632 + static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) 6633 + { 6634 + unsigned long state = READ_ONCE(donor->__state); 6635 + 6636 + /* Don't deactivate if the state has been changed to TASK_RUNNING */ 6637 + if (state == TASK_RUNNING) 6638 + return false; 6639 + /* 6640 + * Because we got donor from pick_next_task(), it is *crucial* 6641 + * that we call proxy_resched_idle() before we deactivate it. 6642 + * As once we deactivate donor, donor->on_rq is set to zero, 6643 + * which allows ttwu() to immediately try to wake the task on 6644 + * another rq. So we cannot use *any* references to donor 6645 + * after that point. So things like cfs_rq->curr or rq->donor 6646 + * need to be changed from next *before* we deactivate.
6647 + */ 6648 + proxy_resched_idle(rq); 6649 + return try_to_block_task(rq, donor, &state, true); 6650 + } 6651 + 6652 + static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) 6653 + { 6654 + if (!__proxy_deactivate(rq, donor)) { 6655 + /* 6656 + * XXX: For now, if deactivation failed, set donor 6657 + * as unblocked, as we aren't doing proxy-migrations 6658 + * yet (more logic will be needed then). 6659 + */ 6660 + donor->blocked_on = NULL; 6661 + } 6662 + return NULL; 6663 + } 6664 + 6665 + /* 6666 + * Find runnable lock owner to proxy for mutex blocked donor 6667 + * 6668 + * Follow the blocked-on relation: 6669 + * task->blocked_on -> mutex->owner -> task... 6670 + * 6671 + * Lock order: 6672 + * 6673 + * p->pi_lock 6674 + * rq->lock 6675 + * mutex->wait_lock 6676 + * 6677 + * Returns the task that is going to be used as execution context (the one 6678 + * that is actually going to be run on cpu_of(rq)). 6679 + */ 6680 + static struct task_struct * 6681 + find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) 6682 + { 6683 + struct task_struct *owner = NULL; 6684 + int this_cpu = cpu_of(rq); 6685 + struct task_struct *p; 6686 + struct mutex *mutex; 6687 + 6688 + /* Follow blocked_on chain. */ 6689 + for (p = donor; task_is_blocked(p); p = owner) { 6690 + mutex = p->blocked_on; 6691 + /* Something changed in the chain, so pick again */ 6692 + if (!mutex) 6693 + return NULL; 6694 + /* 6695 + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() 6696 + * and ensure @owner sticks around. 6697 + */ 6698 + guard(raw_spinlock)(&mutex->wait_lock); 6699 + 6700 + /* Check again that p is blocked with wait_lock held */ 6701 + if (mutex != __get_task_blocked_on(p)) { 6702 + /* 6703 + * Something changed in the blocked_on chain and 6704 + * we don't know if only at this level. So, let's 6705 + * just bail out completely and let __schedule() 6706 + * figure things out (pick_again loop). 
6707 + */ 6708 + return NULL; 6709 + } 6710 + 6711 + owner = __mutex_owner(mutex); 6712 + if (!owner) { 6713 + __clear_task_blocked_on(p, mutex); 6714 + return p; 6715 + } 6716 + 6717 + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { 6718 + /* XXX Don't handle blocked owners/delayed dequeue yet */ 6719 + return proxy_deactivate(rq, donor); 6720 + } 6721 + 6722 + if (task_cpu(owner) != this_cpu) { 6723 + /* XXX Don't handle migrations yet */ 6724 + return proxy_deactivate(rq, donor); 6725 + } 6726 + 6727 + if (task_on_rq_migrating(owner)) { 6728 + /* 6729 + * One of the chain of mutex owners is currently migrating to this 6730 + * CPU, but has not yet been enqueued because we are holding the 6731 + * rq lock. As a simple solution, just schedule rq->idle to give 6732 + * the migration a chance to complete. Much like the migrate_task 6733 + * case we should end up back in find_proxy_task(), this time 6734 + * hopefully with all relevant tasks already enqueued. 6735 + */ 6736 + return proxy_resched_idle(rq); 6737 + } 6738 + 6739 + /* 6740 + * It's possible to race where after we check owner->on_rq 6741 + * but before we check (owner_cpu != this_cpu) that the 6742 + * task on another cpu was migrated back to this cpu. In 6743 + * that case it could slip by our checks. So double check 6744 + * we are still on this cpu and not migrating. If we get 6745 + * inconsistent results, try again. 6746 + */ 6747 + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) 6748 + return NULL; 6749 + 6750 + if (owner == p) { 6751 + /* 6752 + * It's possible we interleave with mutex_unlock like: 6753 + * 6754 + * lock(&rq->lock); 6755 + * find_proxy_task() 6756 + * mutex_unlock() 6757 + * lock(&wait_lock); 6758 + * donor(owner) = current->blocked_donor; 6759 + * unlock(&wait_lock); 6760 + * 6761 + * wake_up_q(); 6762 + * ...
6763 + * ttwu_runnable() 6764 + * __task_rq_lock() 6765 + * lock(&wait_lock); 6766 + * owner == p 6767 + * 6768 + * Which leaves us to finish the ttwu_runnable() and make it go. 6769 + * 6770 + * So schedule rq->idle so that ttwu_runnable() can get the rq 6771 + * lock and mark owner as running. 6772 + */ 6773 + return proxy_resched_idle(rq); 6774 + } 6775 + /* 6776 + * OK, now we're absolutely sure @owner is on this 6777 + * rq, therefore holding @rq->lock is sufficient to 6778 + * guarantee its existence, as per ttwu_remote(). 6779 + */ 6780 + } 6781 + 6782 + WARN_ON_ONCE(owner && !owner->on_rq); 6783 + return owner; 6784 + } 6785 + #else /* SCHED_PROXY_EXEC */ 6786 + static struct task_struct * 6787 + find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) 6788 + { 6789 + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); 6790 + return donor; 6791 + } 6792 + #endif /* SCHED_PROXY_EXEC */ 6793 + 6794 + static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) 6795 + { 6796 + if (!sched_proxy_exec()) 6797 + return; 6798 + /* 6799 + * pick_next_task() calls set_next_task() on the chosen task 6800 + * at some point, which ensures it is not push/pullable. 6801 + * However, the chosen/donor task *and* the mutex owner form an 6802 + * atomic pair wrt push/pull. 6803 + * 6804 + * Make sure owner we run is not pushable. Unfortunately we can 6805 + * only deal with that by means of a dequeue/enqueue cycle. 
:-/ 6806 + */ 6807 + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); 6808 + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); 6581 6809 } 6582 6810 6583 6811 /* ··· 6880 6732 goto picked; 6881 6733 } 6882 6734 } else if (!preempt && prev_state) { 6883 - try_to_block_task(rq, prev, &prev_state); 6735 + /* 6736 + * We pass !task_is_blocked() as the should_block arg 6737 + * in order to keep mutex-blocked tasks on the runqueue 6738 + * for selection with proxy-exec (without proxy-exec 6739 + * task_is_blocked() will always be false). 6740 + */ 6741 + try_to_block_task(rq, prev, &prev_state, 6742 + !task_is_blocked(prev)); 6884 6743 switch_count = &prev->nvcsw; 6885 6744 } 6886 6745 6887 - next = pick_next_task(rq, prev, &rf); 6746 + pick_again: 6747 + next = pick_next_task(rq, rq->donor, &rf); 6888 6748 rq_set_donor(rq, next); 6749 + if (unlikely(task_is_blocked(next))) { 6750 + next = find_proxy_task(rq, next, &rf); 6751 + if (!next) 6752 + goto pick_again; 6753 + if (next == rq->idle) 6754 + goto keep_resched; 6755 + } 6889 6756 picked: 6890 6757 clear_tsk_need_resched(prev); 6891 6758 clear_preempt_need_resched(); 6759 + keep_resched: 6892 6760 rq->last_seen_need_resched_ns = 0; 6893 6761 6894 6762 is_switch = prev != next; ··· 6915 6751 * changes to task_struct made by pick_next_task().
6916 6752 */ 6917 6753 RCU_INIT_POINTER(rq->curr, next); 6754 + 6755 + if (!task_current_donor(rq, next)) 6756 + proxy_tag_curr(rq, next); 6757 + 6918 6758 /* 6919 6759 * The membarrier system call requires each architecture 6920 6760 * to have a full memory barrier after updating ··· 6953 6785 /* Also unlocks the rq: */ 6954 6786 rq = context_switch(rq, prev, next, &rf); 6955 6787 } else { 6788 + /* In case next was already curr but just got blocked_donor */ 6789 + if (!task_current_donor(rq, next)) 6790 + proxy_tag_curr(rq, next); 6791 + 6956 6792 rq_unpin_lock(rq, &rf); 6957 6793 __balance_callbacks(rq); 6958 6794 raw_spin_rq_unlock_irq(rq); ··· 7165 6993 EXPORT_SYMBOL(preempt_schedule); 7166 6994 7167 6995 #ifdef CONFIG_PREEMPT_DYNAMIC 7168 - #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7169 - #ifndef preempt_schedule_dynamic_enabled 7170 - #define preempt_schedule_dynamic_enabled preempt_schedule 7171 - #define preempt_schedule_dynamic_disabled NULL 7172 - #endif 6996 + # ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 6997 + # ifndef preempt_schedule_dynamic_enabled 6998 + # define preempt_schedule_dynamic_enabled preempt_schedule 6999 + # define preempt_schedule_dynamic_disabled NULL 7000 + # endif 7173 7001 DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); 7174 7002 EXPORT_STATIC_CALL_TRAMP(preempt_schedule); 7175 - #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7003 + # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7176 7004 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); 7177 7005 void __sched notrace dynamic_preempt_schedule(void) 7178 7006 { ··· 7182 7010 } 7183 7011 NOKPROBE_SYMBOL(dynamic_preempt_schedule); 7184 7012 EXPORT_SYMBOL(dynamic_preempt_schedule); 7185 - #endif 7186 - #endif 7013 + # endif 7014 + #endif /* CONFIG_PREEMPT_DYNAMIC */ 7187 7015 7188 7016 /** 7189 7017 * preempt_schedule_notrace - preempt_schedule called by tracing ··· 7238 7066 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 7239 7067 7240 7068 #ifdef 
CONFIG_PREEMPT_DYNAMIC 7241 - #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7242 - #ifndef preempt_schedule_notrace_dynamic_enabled 7243 - #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace 7244 - #define preempt_schedule_notrace_dynamic_disabled NULL 7245 - #endif 7069 + # if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7070 + # ifndef preempt_schedule_notrace_dynamic_enabled 7071 + # define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace 7072 + # define preempt_schedule_notrace_dynamic_disabled NULL 7073 + # endif 7246 7074 DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); 7247 7075 EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); 7248 - #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7076 + # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7249 7077 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); 7250 7078 void __sched notrace dynamic_preempt_schedule_notrace(void) 7251 7079 { ··· 7255 7083 } 7256 7084 NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); 7257 7085 EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); 7258 - #endif 7086 + # endif 7259 7087 #endif 7260 7088 7261 7089 #endif /* CONFIG_PREEMPTION */ ··· 7474 7302 7475 7303 preempt_enable(); 7476 7304 } 7477 - #endif 7305 + #endif /* CONFIG_RT_MUTEXES */ 7478 7306 7479 7307 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) 7480 7308 int __sched __cond_resched(void) ··· 7505 7333 #endif 7506 7334 7507 7335 #ifdef CONFIG_PREEMPT_DYNAMIC 7508 - #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7509 - #define cond_resched_dynamic_enabled __cond_resched 7510 - #define cond_resched_dynamic_disabled ((void *)&__static_call_return0) 7336 + # ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 7337 + # define cond_resched_dynamic_enabled __cond_resched 7338 + # define cond_resched_dynamic_disabled ((void *)&__static_call_return0) 7511 7339 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); 7512 7340 
EXPORT_STATIC_CALL_TRAMP(cond_resched); 7513 7341 7514 - #define might_resched_dynamic_enabled __cond_resched 7515 - #define might_resched_dynamic_disabled ((void *)&__static_call_return0) 7342 + # define might_resched_dynamic_enabled __cond_resched 7343 + # define might_resched_dynamic_disabled ((void *)&__static_call_return0) 7516 7344 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); 7517 7345 EXPORT_STATIC_CALL_TRAMP(might_resched); 7518 - #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7346 + # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7519 7347 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); 7520 7348 int __sched dynamic_cond_resched(void) 7521 7349 { ··· 7533 7361 return __cond_resched(); 7534 7362 } 7535 7363 EXPORT_SYMBOL(dynamic_might_resched); 7536 - #endif 7537 - #endif 7364 + # endif 7365 + #endif /* CONFIG_PREEMPT_DYNAMIC */ 7538 7366 7539 7367 /* 7540 7368 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ··· 7600 7428 7601 7429 #ifdef CONFIG_PREEMPT_DYNAMIC 7602 7430 7603 - #ifdef CONFIG_GENERIC_IRQ_ENTRY 7604 - #include <linux/irq-entry-common.h> 7605 - #endif 7431 + # ifdef CONFIG_GENERIC_IRQ_ENTRY 7432 + # include <linux/irq-entry-common.h> 7433 + # endif 7606 7434 7607 7435 /* 7608 7436 * SC:cond_resched ··· 7657 7485 7658 7486 int sched_dynamic_mode(const char *str) 7659 7487 { 7660 - #ifndef CONFIG_PREEMPT_RT 7488 + # ifndef CONFIG_PREEMPT_RT 7661 7489 if (!strcmp(str, "none")) 7662 7490 return preempt_dynamic_none; 7663 7491 7664 7492 if (!strcmp(str, "voluntary")) 7665 7493 return preempt_dynamic_voluntary; 7666 - #endif 7494 + # endif 7667 7495 7668 7496 if (!strcmp(str, "full")) 7669 7497 return preempt_dynamic_full; 7670 7498 7671 - #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY 7499 + # ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY 7672 7500 if (!strcmp(str, "lazy")) 7673 7501 return preempt_dynamic_lazy; 7674 - #endif 7502 + # endif 7675 7503 7676 7504 return -EINVAL; 7677 7505 } 7678 7506 7679 - #define 
preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) 7680 - #define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) 7507 + # define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) 7508 + # define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) 7681 7509 7682 - #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7683 - #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) 7684 - #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) 7685 - #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7686 - #define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) 7687 - #define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) 7688 - #else 7689 - #error "Unsupported PREEMPT_DYNAMIC mechanism" 7690 - #endif 7510 + # if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7511 + # define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) 7512 + # define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) 7513 + # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7514 + # define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) 7515 + # define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) 7516 + # else 7517 + # error "Unsupported PREEMPT_DYNAMIC mechanism" 7518 + # endif 7691 7519 7692 7520 static DEFINE_MUTEX(sched_dynamic_mutex); 7693 7521 ··· 7791 7619 } 7792 7620 } 7793 7621 7794 - #define PREEMPT_MODEL_ACCESSOR(mode) \ 7622 + # define PREEMPT_MODEL_ACCESSOR(mode) \ 7795 7623 bool preempt_model_##mode(void) \ 7796 7624 { \ 7797 7625 WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ ··· 7992 7820 */ 7993 7821 void __init init_idle(struct task_struct *idle, int cpu) 7994 7822 { 7995 - #ifdef CONFIG_SMP 7996 7823 struct affinity_context ac = (struct affinity_context) { 7997 7824 .new_mask = cpumask_of(cpu), 7998 7825 .flags = 0, 7999 7826 }; 8000 - #endif 8001 7827 struct rq *rq = 
cpu_rq(cpu); 8002 7828 unsigned long flags; 8003 7829 ··· 8011 7841 idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; 8012 7842 kthread_set_per_cpu(idle, cpu); 8013 7843 8014 - #ifdef CONFIG_SMP 8015 7844 /* 8016 7845 * No validation and serialization required at boot time and for 8017 7846 * setting up the idle tasks of not yet online CPUs. 8018 7847 */ 8019 7848 set_cpus_allowed_common(idle, &ac); 8020 - #endif 8021 7849 /* 8022 7850 * We're having a chicken and egg problem, even though we are 8023 7851 * holding rq->lock, the CPU isn't yet set to this CPU so the ··· 8034 7866 rq_set_donor(rq, idle); 8035 7867 rcu_assign_pointer(rq->curr, idle); 8036 7868 idle->on_rq = TASK_ON_RQ_QUEUED; 8037 - #ifdef CONFIG_SMP 8038 7869 idle->on_cpu = 1; 8039 - #endif 8040 7870 raw_spin_rq_unlock(rq); 8041 7871 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 8042 7872 ··· 8047 7881 idle->sched_class = &idle_sched_class; 8048 7882 ftrace_graph_init_idle_task(idle, cpu); 8049 7883 vtime_init_idle(idle, cpu); 8050 - #ifdef CONFIG_SMP 8051 7884 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 8052 - #endif 8053 7885 } 8054 - 8055 - #ifdef CONFIG_SMP 8056 7886 8057 7887 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 8058 7888 const struct cpumask *trial) ··· 8285 8123 TASK_UNINTERRUPTIBLE); 8286 8124 } 8287 8125 8288 - #else 8126 + #else /* !CONFIG_HOTPLUG_CPU: */ 8289 8127 8290 8128 static inline void balance_push(struct rq *rq) 8291 8129 { ··· 8299 8137 { 8300 8138 } 8301 8139 8302 - #endif /* CONFIG_HOTPLUG_CPU */ 8140 + #endif /* !CONFIG_HOTPLUG_CPU */ 8303 8141 8304 8142 void set_rq_online(struct rq *rq) 8305 8143 { ··· 8608 8446 sched_core_cpu_dying(cpu); 8609 8447 return 0; 8610 8448 } 8611 - #endif 8449 + #endif /* CONFIG_HOTPLUG_CPU */ 8612 8450 8613 8451 void __init sched_init_smp(void) 8614 8452 { ··· 8632 8470 init_sched_rt_class(); 8633 8471 init_sched_dl_class(); 8634 8472 8473 + sched_init_dl_servers(); 8474 + 8635 8475 sched_smp_initialized = true; 8636 
8476 } 8637 8477 ··· 8643 8479 return 0; 8644 8480 } 8645 8481 early_initcall(migration_init); 8646 - 8647 - #else 8648 - void __init sched_init_smp(void) 8649 - { 8650 - sched_init_granularity(); 8651 - } 8652 - #endif /* CONFIG_SMP */ 8653 8482 8654 8483 int in_sched_functions(unsigned long addr) 8655 8484 { ··· 8669 8512 int i; 8670 8513 8671 8514 /* Make sure the linker didn't screw up */ 8672 - #ifdef CONFIG_SMP 8673 8515 BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); 8674 - #endif 8675 8516 BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); 8676 8517 BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); 8677 8518 BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); ··· 8712 8557 #endif /* CONFIG_RT_GROUP_SCHED */ 8713 8558 } 8714 8559 8715 - #ifdef CONFIG_SMP 8716 8560 init_defrootdomain(); 8717 - #endif 8718 8561 8719 8562 #ifdef CONFIG_RT_GROUP_SCHED 8720 8563 init_rt_bandwidth(&root_task_group.rt_bandwidth, ··· 8773 8620 rq->rt.rt_runtime = global_rt_runtime(); 8774 8621 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 8775 8622 #endif 8776 - #ifdef CONFIG_SMP 8777 8623 rq->sd = NULL; 8778 8624 rq->rd = NULL; 8779 8625 rq->cpu_capacity = SCHED_CAPACITY_SCALE; ··· 8798 8646 #ifdef CONFIG_HOTPLUG_CPU 8799 8647 rcuwait_init(&rq->hotplug_wait); 8800 8648 #endif 8801 - #endif /* CONFIG_SMP */ 8802 8649 hrtick_rq_init(rq); 8803 8650 atomic_set(&rq->nr_iowait, 0); 8804 8651 fair_server_init(rq); ··· 8845 8694 8846 8695 calc_load_update = jiffies + LOAD_FREQ; 8847 8696 8848 - #ifdef CONFIG_SMP 8849 8697 idle_thread_set_boot_cpu(); 8698 + 8850 8699 balance_push_set(smp_processor_id(), false); 8851 - #endif 8852 8700 init_sched_fair_class(); 8853 8701 init_sched_ext_class(); 8854 8702 ··· 8980 8830 } 8981 8831 EXPORT_SYMBOL_GPL(__cant_sleep); 8982 8832 8983 - #ifdef CONFIG_SMP 8833 + # ifdef CONFIG_SMP 8984 8834 void __cant_migrate(const char *file, int line) 8985 8835 { 8986 8836 static unsigned long 
prev_jiffy; ··· 9011 8861 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 9012 8862 } 9013 8863 EXPORT_SYMBOL_GPL(__cant_migrate); 9014 - #endif 9015 - #endif 8864 + # endif /* CONFIG_SMP */ 8865 + #endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ 9016 8866 9017 8867 #ifdef CONFIG_MAGIC_SYSRQ 9018 8868 void normalize_rt_tasks(void) ··· 9052 8902 9053 8903 #endif /* CONFIG_MAGIC_SYSRQ */ 9054 8904 9055 - #if defined(CONFIG_KGDB_KDB) 8905 + #ifdef CONFIG_KGDB_KDB 9056 8906 /* 9057 8907 * These functions are only useful for KDB. 9058 8908 * ··· 9076 8926 return cpu_curr(cpu); 9077 8927 } 9078 8928 9079 - #endif /* defined(CONFIG_KGDB_KDB) */ 8929 + #endif /* CONFIG_KGDB_KDB */ 9080 8930 9081 8931 #ifdef CONFIG_CGROUP_SCHED 9082 8932 /* task_group_lock serializes the addition/removal of task groups */ ··· 9572 9422 #ifdef CONFIG_CFS_BANDWIDTH 9573 9423 static DEFINE_MUTEX(cfs_constraints_mutex); 9574 9424 9575 - const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 9576 - static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 9577 - /* More than 203 days if BW_SHIFT equals 20. */ 9578 - static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; 9579 - 9580 9425 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 9581 9426 9582 - static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, 9583 - u64 burst) 9427 + static int tg_set_cfs_bandwidth(struct task_group *tg, 9428 + u64 period_us, u64 quota_us, u64 burst_us) 9584 9429 { 9585 9430 int i, ret = 0, runtime_enabled, runtime_was_enabled; 9586 9431 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 9432 + u64 period, quota, burst; 9587 9433 9588 - if (tg == &root_task_group) 9589 - return -EINVAL; 9434 + period = (u64)period_us * NSEC_PER_USEC; 9590 9435 9591 - /* 9592 - * Ensure we have at some amount of bandwidth every period. This is 9593 - * to prevent reaching a state of large arrears when throttled via 9594 - * entity_tick() resulting in prolonged exit starvation. 
9595 - */ 9596 - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 9597 - return -EINVAL; 9436 + if (quota_us == RUNTIME_INF) 9437 + quota = RUNTIME_INF; 9438 + else 9439 + quota = (u64)quota_us * NSEC_PER_USEC; 9598 9440 9599 - /* 9600 - * Likewise, bound things on the other side by preventing insane quota 9601 - * periods. This also allows us to normalize in computing quota 9602 - * feasibility. 9603 - */ 9604 - if (period > max_cfs_quota_period) 9605 - return -EINVAL; 9606 - 9607 - /* 9608 - * Bound quota to defend quota against overflow during bandwidth shift. 9609 - */ 9610 - if (quota != RUNTIME_INF && quota > max_cfs_runtime) 9611 - return -EINVAL; 9612 - 9613 - if (quota != RUNTIME_INF && (burst > quota || 9614 - burst + quota > max_cfs_runtime)) 9615 - return -EINVAL; 9441 + burst = (u64)burst_us * NSEC_PER_USEC; 9616 9442 9617 9443 /* 9618 9444 * Prevent race between setting of cfs_rq->runtime_enabled and ··· 9643 9517 return 0; 9644 9518 } 9645 9519 9646 - static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 9647 - { 9648 - u64 quota, period, burst; 9649 - 9650 - period = ktime_to_ns(tg->cfs_bandwidth.period); 9651 - burst = tg->cfs_bandwidth.burst; 9652 - if (cfs_quota_us < 0) 9653 - quota = RUNTIME_INF; 9654 - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) 9655 - quota = (u64)cfs_quota_us * NSEC_PER_USEC; 9656 - else 9657 - return -EINVAL; 9658 - 9659 - return tg_set_cfs_bandwidth(tg, period, quota, burst); 9660 - } 9661 - 9662 - static long tg_get_cfs_quota(struct task_group *tg) 9663 - { 9664 - u64 quota_us; 9665 - 9666 - if (tg->cfs_bandwidth.quota == RUNTIME_INF) 9667 - return -1; 9668 - 9669 - quota_us = tg->cfs_bandwidth.quota; 9670 - do_div(quota_us, NSEC_PER_USEC); 9671 - 9672 - return quota_us; 9673 - } 9674 - 9675 - static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 9676 - { 9677 - u64 quota, period, burst; 9678 - 9679 - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) 9680 - 
return -EINVAL; 9681 - 9682 - period = (u64)cfs_period_us * NSEC_PER_USEC; 9683 - quota = tg->cfs_bandwidth.quota; 9684 - burst = tg->cfs_bandwidth.burst; 9685 - 9686 - return tg_set_cfs_bandwidth(tg, period, quota, burst); 9687 - } 9688 - 9689 - static long tg_get_cfs_period(struct task_group *tg) 9520 + static u64 tg_get_cfs_period(struct task_group *tg) 9690 9521 { 9691 9522 u64 cfs_period_us; 9692 9523 ··· 9653 9570 return cfs_period_us; 9654 9571 } 9655 9572 9656 - static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) 9573 + static u64 tg_get_cfs_quota(struct task_group *tg) 9657 9574 { 9658 - u64 quota, period, burst; 9575 + u64 quota_us; 9659 9576 9660 - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) 9661 - return -EINVAL; 9577 + if (tg->cfs_bandwidth.quota == RUNTIME_INF) 9578 + return RUNTIME_INF; 9662 9579 9663 - burst = (u64)cfs_burst_us * NSEC_PER_USEC; 9664 - period = ktime_to_ns(tg->cfs_bandwidth.period); 9665 - quota = tg->cfs_bandwidth.quota; 9580 + quota_us = tg->cfs_bandwidth.quota; 9581 + do_div(quota_us, NSEC_PER_USEC); 9666 9582 9667 - return tg_set_cfs_bandwidth(tg, period, quota, burst); 9583 + return quota_us; 9668 9584 } 9669 9585 9670 - static long tg_get_cfs_burst(struct task_group *tg) 9586 + static u64 tg_get_cfs_burst(struct task_group *tg) 9671 9587 { 9672 9588 u64 burst_us; 9673 9589 ··· 9674 9592 do_div(burst_us, NSEC_PER_USEC); 9675 9593 9676 9594 return burst_us; 9677 - } 9678 - 9679 - static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 9680 - struct cftype *cft) 9681 - { 9682 - return tg_get_cfs_quota(css_tg(css)); 9683 - } 9684 - 9685 - static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 9686 - struct cftype *cftype, s64 cfs_quota_us) 9687 - { 9688 - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 9689 - } 9690 - 9691 - static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 9692 - struct cftype *cft) 9693 - { 9694 - return tg_get_cfs_period(css_tg(css)); 9695 - } 
9696 - 9697 - static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 9698 - struct cftype *cftype, u64 cfs_period_us) 9699 - { 9700 - return tg_set_cfs_period(css_tg(css), cfs_period_us); 9701 - } 9702 - 9703 - static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, 9704 - struct cftype *cft) 9705 - { 9706 - return tg_get_cfs_burst(css_tg(css)); 9707 - } 9708 - 9709 - static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, 9710 - struct cftype *cftype, u64 cfs_burst_us) 9711 - { 9712 - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); 9713 9595 } 9714 9596 9715 9597 struct cfs_schedulable_data { ··· 9808 9762 9809 9763 return 0; 9810 9764 } 9765 + 9766 + const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ 9767 + static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ 9768 + /* More than 203 days if BW_SHIFT equals 20. */ 9769 + static const u64 max_bw_runtime_us = MAX_BW; 9770 + 9771 + static void tg_bandwidth(struct task_group *tg, 9772 + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) 9773 + { 9774 + if (period_us_p) 9775 + *period_us_p = tg_get_cfs_period(tg); 9776 + if (quota_us_p) 9777 + *quota_us_p = tg_get_cfs_quota(tg); 9778 + if (burst_us_p) 9779 + *burst_us_p = tg_get_cfs_burst(tg); 9780 + } 9781 + 9782 + static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, 9783 + struct cftype *cft) 9784 + { 9785 + u64 period_us; 9786 + 9787 + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); 9788 + return period_us; 9789 + } 9790 + 9791 + static int tg_set_bandwidth(struct task_group *tg, 9792 + u64 period_us, u64 quota_us, u64 burst_us) 9793 + { 9794 + const u64 max_usec = U64_MAX / NSEC_PER_USEC; 9795 + 9796 + if (tg == &root_task_group) 9797 + return -EINVAL; 9798 + 9799 + /* Values should survive translation to nsec */ 9800 + if (period_us > max_usec || 9801 + (quota_us != RUNTIME_INF && quota_us > max_usec) || 9802 + burst_us > max_usec) 9803 + return -EINVAL; 9804 + 9805 + /* 9806 + * 
Ensure we have some amount of bandwidth every period. This is to 9807 + * prevent reaching a state of large arrears when throttled via 9808 + * entity_tick() resulting in prolonged exit starvation. 9809 + */ 9810 + if (quota_us < min_bw_quota_period_us || 9811 + period_us < min_bw_quota_period_us) 9812 + return -EINVAL; 9813 + 9814 + /* 9815 + * Likewise, bound things on the other side by preventing insane quota 9816 + * periods. This also allows us to normalize in computing quota 9817 + * feasibility. 9818 + */ 9819 + if (period_us > max_bw_quota_period_us) 9820 + return -EINVAL; 9821 + 9822 + /* 9823 + * Bound quota to defend quota against overflow during bandwidth shift. 9824 + */ 9825 + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) 9826 + return -EINVAL; 9827 + 9828 + if (quota_us != RUNTIME_INF && (burst_us > quota_us || 9829 + burst_us + quota_us > max_bw_runtime_us)) 9830 + return -EINVAL; 9831 + 9832 + return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); 9833 + } 9834 + 9835 + static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, 9836 + struct cftype *cft) 9837 + { 9838 + u64 quota_us; 9839 + 9840 + tg_bandwidth(css_tg(css), NULL, &quota_us, NULL); 9841 + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ 9842 + } 9843 + 9844 + static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, 9845 + struct cftype *cft) 9846 + { 9847 + u64 burst_us; 9848 + 9849 + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); 9850 + return burst_us; 9851 + } 9852 + 9853 + static int cpu_period_write_u64(struct cgroup_subsys_state *css, 9854 + struct cftype *cftype, u64 period_us) 9855 + { 9856 + struct task_group *tg = css_tg(css); 9857 + u64 quota_us, burst_us; 9858 + 9859 + tg_bandwidth(tg, NULL, &quota_us, &burst_us); 9860 + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 9861 + } 9862 + 9863 + static int cpu_quota_write_s64(struct cgroup_subsys_state *css, 9864 + struct cftype *cftype, s64 quota_us) 9865 + { 9866 + struct 
task_group *tg = css_tg(css); 9867 + u64 period_us, burst_us; 9868 + 9869 + if (quota_us < 0) 9870 + quota_us = RUNTIME_INF; 9871 + 9872 + tg_bandwidth(tg, &period_us, NULL, &burst_us); 9873 + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 9874 + } 9875 + 9876 + static int cpu_burst_write_u64(struct cgroup_subsys_state *css, 9877 + struct cftype *cftype, u64 burst_us) 9878 + { 9879 + struct task_group *tg = css_tg(css); 9880 + u64 period_us, quota_us; 9881 + 9882 + tg_bandwidth(tg, &period_us, &quota_us, NULL); 9883 + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 9884 + } 9811 9885 #endif /* CONFIG_CFS_BANDWIDTH */ 9812 9886 9813 9887 #ifdef CONFIG_RT_GROUP_SCHED ··· 9973 9807 scx_group_set_idle(css_tg(css), idle); 9974 9808 return ret; 9975 9809 } 9976 - #endif 9810 + #endif /* CONFIG_GROUP_SCHED_WEIGHT */ 9977 9811 9978 9812 static struct cftype cpu_legacy_files[] = { 9979 9813 #ifdef CONFIG_GROUP_SCHED_WEIGHT ··· 9990 9824 #endif 9991 9825 #ifdef CONFIG_CFS_BANDWIDTH 9992 9826 { 9993 - .name = "cfs_quota_us", 9994 - .read_s64 = cpu_cfs_quota_read_s64, 9995 - .write_s64 = cpu_cfs_quota_write_s64, 9827 + .name = "cfs_period_us", 9828 + .read_u64 = cpu_period_read_u64, 9829 + .write_u64 = cpu_period_write_u64, 9996 9830 }, 9997 9831 { 9998 - .name = "cfs_period_us", 9999 - .read_u64 = cpu_cfs_period_read_u64, 10000 - .write_u64 = cpu_cfs_period_write_u64, 9832 + .name = "cfs_quota_us", 9833 + .read_s64 = cpu_quota_read_s64, 9834 + .write_s64 = cpu_quota_write_s64, 10001 9835 }, 10002 9836 { 10003 9837 .name = "cfs_burst_us", 10004 - .read_u64 = cpu_cfs_burst_read_u64, 10005 - .write_u64 = cpu_cfs_burst_write_u64, 9838 + .read_u64 = cpu_burst_read_u64, 9839 + .write_u64 = cpu_burst_write_u64, 10006 9840 }, 10007 9841 { 10008 9842 .name = "stat", ··· 10101 9935 cfs_b->nr_periods, cfs_b->nr_throttled, 10102 9936 throttled_usec, cfs_b->nr_burst, burst_usec); 10103 9937 } 10104 - #endif 9938 + #endif /* CONFIG_CFS_BANDWIDTH */ 10105 9939 
return 0; 10106 9940 } 10107 9941 ··· 10199 10033 } 10200 10034 10201 10035 /* caller should put the current value in *@periodp before calling */ 10202 - static int __maybe_unused cpu_period_quota_parse(char *buf, 10203 - u64 *periodp, u64 *quotap) 10036 + static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, 10037 + u64 *quota_us_p) 10204 10038 { 10205 10039 char tok[21]; /* U64_MAX */ 10206 10040 10207 - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) 10041 + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) 10208 10042 return -EINVAL; 10209 10043 10210 - *periodp *= NSEC_PER_USEC; 10211 - 10212 - if (sscanf(tok, "%llu", quotap)) 10213 - *quotap *= NSEC_PER_USEC; 10214 - else if (!strcmp(tok, "max")) 10215 - *quotap = RUNTIME_INF; 10216 - else 10217 - return -EINVAL; 10044 + if (sscanf(tok, "%llu", quota_us_p) < 1) { 10045 + if (!strcmp(tok, "max")) 10046 + *quota_us_p = RUNTIME_INF; 10047 + else 10048 + return -EINVAL; 10049 + } 10218 10050 10219 10051 return 0; 10220 10052 } ··· 10221 10057 static int cpu_max_show(struct seq_file *sf, void *v) 10222 10058 { 10223 10059 struct task_group *tg = css_tg(seq_css(sf)); 10060 + u64 period_us, quota_us; 10224 10061 10225 - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); 10062 + tg_bandwidth(tg, &period_us, &quota_us, NULL); 10063 + cpu_period_quota_print(sf, period_us, quota_us); 10226 10064 return 0; 10227 10065 } 10228 10066 ··· 10232 10066 char *buf, size_t nbytes, loff_t off) 10233 10067 { 10234 10068 struct task_group *tg = css_tg(of_css(of)); 10235 - u64 period = tg_get_cfs_period(tg); 10236 - u64 burst = tg->cfs_bandwidth.burst; 10237 - u64 quota; 10069 + u64 period_us, quota_us, burst_us; 10238 10070 int ret; 10239 10071 10240 - ret = cpu_period_quota_parse(buf, &period, &quota); 10072 + tg_bandwidth(tg, &period_us, NULL, &burst_us); 10073 + ret = cpu_period_quota_parse(buf, &period_us, &quota_us); 10241 10074 if (!ret) 10242 - ret = tg_set_cfs_bandwidth(tg, 
period, quota, burst); 10075 + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); 10243 10076 return ret ?: nbytes; 10244 10077 } 10245 - #endif 10078 + #endif /* CONFIG_CFS_BANDWIDTH */ 10246 10079 10247 10080 static struct cftype cpu_files[] = { 10248 10081 #ifdef CONFIG_GROUP_SCHED_WEIGHT ··· 10274 10109 { 10275 10110 .name = "max.burst", 10276 10111 .flags = CFTYPE_NOT_ON_ROOT, 10277 - .read_u64 = cpu_cfs_burst_read_u64, 10278 - .write_u64 = cpu_cfs_burst_write_u64, 10112 + .read_u64 = cpu_burst_read_u64, 10113 + .write_u64 = cpu_burst_write_u64, 10279 10114 }, 10280 - #endif 10115 + #endif /* CONFIG_CFS_BANDWIDTH */ 10281 10116 #ifdef CONFIG_UCLAMP_TASK_GROUP 10282 10117 { 10283 10118 .name = "uclamp.min", ··· 10291 10126 .seq_show = cpu_uclamp_max_show, 10292 10127 .write = cpu_uclamp_max_write, 10293 10128 }, 10294 - #endif 10129 + #endif /* CONFIG_UCLAMP_TASK_GROUP */ 10295 10130 { } /* terminate */ 10296 10131 }; 10297 10132 ··· 10312 10147 .threaded = true, 10313 10148 }; 10314 10149 10315 - #endif /* CONFIG_CGROUP_SCHED */ 10150 + #endif /* CONFIG_CGROUP_SCHED */ 10316 10151 10317 10152 void dump_cpu_task(int cpu) 10318 10153 { ··· 10898 10733 WARN_ON_ONCE(!t->mm || t->mm_cid != -1); 10899 10734 t->mm_cid_active = 1; 10900 10735 } 10901 - #endif 10736 + #endif /* CONFIG_SCHED_MM_CID */ 10902 10737 10903 10738 #ifdef CONFIG_SCHED_CLASS_EXT 10904 10739 void sched_deq_and_put_task(struct task_struct *p, int queue_flags, ··· 10933 10768 if (ctx->running) 10934 10769 set_next_task(rq, ctx->p); 10935 10770 } 10936 - #endif /* CONFIG_SCHED_CLASS_EXT */ 10771 + #endif /* CONFIG_SCHED_CLASS_EXT */

+2

kernel/sched/core_sched.c

··· 4 4 * A simple wrapper around refcount. An allocated sched_core_cookie's 5 5 * address is used to compute the cookie of the task. 6 6 */ 7 + #include "sched.h" 8 + 7 9 struct sched_core_cookie { 8 10 refcount_t refcnt; 9 11 };

+2

kernel/sched/cpuacct.c

··· 6 6 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7 7 * (balbir@in.ibm.com). 8 8 */ 9 + #include <linux/sched/cputime.h> 10 + #include "sched.h" 9 11 10 12 /* Time spent by the tasks of the CPU accounting group executing in ... */ 11 13 enum cpuacct_stat_index {

+1

kernel/sched/cpudeadline.c

··· 6 6 * 7 7 * Author: Juri Lelli <j.lelli@sssup.it> 8 8 */ 9 + #include "sched.h" 9 10 10 11 static inline int parent(int i) 11 12 {

+2 -2

kernel/sched/cpudeadline.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 + #include <linux/types.h> 3 + #include <linux/spinlock.h> 2 4 3 5 #define IDX_INVALID -1 4 6 ··· 17 15 struct cpudl_item *elements; 18 16 }; 19 17 20 - #ifdef CONFIG_SMP 21 18 int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); 22 19 void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 23 20 void cpudl_clear(struct cpudl *cp, int cpu); ··· 24 23 void cpudl_set_freecpu(struct cpudl *cp, int cpu); 25 24 void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 26 25 void cpudl_cleanup(struct cpudl *cp); 27 - #endif /* CONFIG_SMP */

+1

kernel/sched/cpufreq.c

··· 5 5 * Copyright (C) 2016, Intel Corporation 6 6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 7 7 */ 8 + #include "sched.h" 8 9 9 10 DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); 10 11

+4 -2

kernel/sched/cpufreq_schedutil.c

··· 5 5 * Copyright (C) 2016, Intel Corporation 6 6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 7 7 */ 8 + #include <uapi/linux/sched/types.h> 9 + #include "sched.h" 8 10 9 11 #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) 10 12 ··· 382 380 sg_cpu->saved_idle_calls = idle_calls; 383 381 return ret; 384 382 } 385 - #else 383 + #else /* !CONFIG_NO_HZ_COMMON: */ 386 384 static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } 387 - #endif /* CONFIG_NO_HZ_COMMON */ 385 + #endif /* !CONFIG_NO_HZ_COMMON */ 388 386 389 387 /* 390 388 * Make sugov_should_update_freq() ignore the rate limit when DL

+1

kernel/sched/cpupri.c

··· 22 22 * worst case complexity of O(min(101, nr_domcpus)), though the scenario that 23 23 * yields the worst case search is fairly contrived. 24 24 */ 25 + #include "sched.h" 25 26 26 27 /* 27 28 * p->rt_priority p->prio newpri cpupri

+3 -2

kernel/sched/cpupri.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 + #include <linux/atomic.h> 3 + #include <linux/cpumask.h> 4 + #include <linux/sched/rt.h> 2 5 3 6 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) 4 7 ··· 20 17 int *cpu_to_pri; 21 18 }; 22 19 23 - #ifdef CONFIG_SMP 24 20 int cpupri_find(struct cpupri *cp, struct task_struct *p, 25 21 struct cpumask *lowest_mask); 26 22 int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, ··· 28 26 void cpupri_set(struct cpupri *cp, int cpu, int pri); 29 27 int cpupri_init(struct cpupri *cp); 30 28 void cpupri_cleanup(struct cpupri *cp); 31 - #endif

+10 -7

kernel/sched/cputime.c

··· 2 2 /* 3 3 * Simple CPU accounting cgroup controller 4 4 */ 5 + #include <linux/sched/cputime.h> 6 + #include <linux/tsacct_kern.h> 7 + #include "sched.h" 5 8 6 9 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 7 10 #include <asm/cputime.h> ··· 91 88 return delta; 92 89 } 93 90 94 - #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 91 + #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 95 92 96 93 static u64 irqtime_tick_accounted(u64 dummy) 97 94 { ··· 244 241 245 242 task_group_account_field(p, CPUTIME_FORCEIDLE, delta); 246 243 } 247 - #endif 244 + #endif /* CONFIG_SCHED_CORE */ 248 245 249 246 /* 250 247 * When a guest is interrupted for a longer amount of time, missed clock ··· 265 262 266 263 return steal; 267 264 } 268 - #endif 265 + #endif /* CONFIG_PARAVIRT */ 269 266 return 0; 270 267 } 271 268 ··· 291 288 { 292 289 return t->se.sum_exec_runtime; 293 290 } 294 - #else 291 + #else /* !CONFIG_64BIT: */ 295 292 static u64 read_sum_exec_runtime(struct task_struct *t) 296 293 { 297 294 u64 ns; ··· 304 301 305 302 return ns; 306 303 } 307 - #endif 304 + #endif /* !CONFIG_64BIT */ 308 305 309 306 /* 310 307 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live ··· 414 411 { 415 412 irqtime_account_process_tick(current, 0, ticks); 416 413 } 417 - #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 414 + #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 418 415 static inline void irqtime_account_idle_ticks(int ticks) { } 419 416 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 420 417 int nr_ticks) { } 421 - #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 418 + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 422 419 423 420 /* 424 421 * Use precise platform statistics if available:

+78 -130

kernel/sched/deadline.c

··· 17 17 */ 18 18 19 19 #include <linux/cpuset.h> 20 + #include <linux/sched/clock.h> 21 + #include <uapi/linux/sched/types.h> 22 + #include "sched.h" 23 + #include "pelt.h" 20 24 21 25 /* 22 26 * Default limits for DL period; on the top end we guard against small util ··· 55 51 return 0; 56 52 } 57 53 late_initcall(sched_dl_sysctl_init); 58 - #endif 54 + #endif /* CONFIG_SYSCTL */ 59 55 60 56 static bool dl_server(struct sched_dl_entity *dl_se) 61 57 { ··· 103 99 { 104 100 return pi_of(dl_se) != dl_se; 105 101 } 106 - #else 102 + #else /* !CONFIG_RT_MUTEXES: */ 107 103 static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) 108 104 { 109 105 return dl_se; ··· 113 109 { 114 110 return false; 115 111 } 116 - #endif 112 + #endif /* !CONFIG_RT_MUTEXES */ 117 113 118 - #ifdef CONFIG_SMP 119 114 static inline struct dl_bw *dl_bw_of(int i) 120 115 { 121 116 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), ··· 194 191 rq->dl.extra_bw += bw; 195 192 } 196 193 } 197 - #else 198 - static inline struct dl_bw *dl_bw_of(int i) 199 - { 200 - return &cpu_rq(i)->dl.dl_bw; 201 - } 202 - 203 - static inline int dl_bw_cpus(int i) 204 - { 205 - return 1; 206 - } 207 - 208 - static inline unsigned long dl_bw_capacity(int i) 209 - { 210 - return SCHED_CAPACITY_SCALE; 211 - } 212 - 213 - bool dl_bw_visited(int cpu, u64 cookie) 214 - { 215 - return false; 216 - } 217 - 218 - static inline 219 - void __dl_update(struct dl_bw *dl_b, s64 bw) 220 - { 221 - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); 222 - 223 - dl->extra_bw += bw; 224 - } 225 - #endif 226 194 227 195 static inline 228 196 void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) ··· 526 552 { 527 553 dl_rq->root = RB_ROOT_CACHED; 528 554 529 - #ifdef CONFIG_SMP 530 555 /* zero means no -deadline tasks */ 531 556 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; 532 557 533 558 dl_rq->overloaded = 0; 534 559 dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; 535 - #else 536 - 
init_dl_bw(&dl_rq->dl_bw); 537 - #endif 538 560 539 561 dl_rq->running_bw = 0; 540 562 dl_rq->this_bw = 0; 541 563 init_dl_rq_bw_ratio(dl_rq); 542 564 } 543 - 544 - #ifdef CONFIG_SMP 545 565 546 566 static inline int dl_overloaded(struct rq *rq) 547 567 { ··· 721 753 return later_rq; 722 754 } 723 755 724 - #else 725 - 726 - static inline 727 - void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) 728 - { 729 - } 730 - 731 - static inline 732 - void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) 733 - { 734 - } 735 - 736 - static inline 737 - void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 738 - { 739 - } 740 - 741 - static inline 742 - void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) 743 - { 744 - } 745 - 746 - static inline void deadline_queue_push_tasks(struct rq *rq) 747 - { 748 - } 749 - 750 - static inline void deadline_queue_pull_task(struct rq *rq) 751 - { 752 - } 753 - #endif /* CONFIG_SMP */ 754 - 755 756 static void 756 757 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); 757 758 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); ··· 760 823 { 761 824 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 762 825 struct rq *rq = rq_of_dl_rq(dl_rq); 826 + 827 + update_rq_clock(rq); 763 828 764 829 WARN_ON(is_dl_boosted(dl_se)); 765 830 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); ··· 1134 1195 1135 1196 static void __push_dl_task(struct rq *rq, struct rq_flags *rf) 1136 1197 { 1137 - #ifdef CONFIG_SMP 1138 1198 /* 1139 1199 * Queueing this task back might have overloaded rq, check if we need 1140 1200 * to kick someone away. 
··· 1147 1209 push_dl_task(rq); 1148 1210 rq_repin_lock(rq, rf); 1149 1211 } 1150 - #endif 1151 1212 } 1152 1213 1153 1214 /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ 1154 1215 static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; 1216 + 1217 + static bool dl_server_stopped(struct sched_dl_entity *dl_se); 1155 1218 1156 1219 static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) 1157 1220 { ··· 1173 1234 1174 1235 if (!dl_se->server_has_tasks(dl_se)) { 1175 1236 replenish_dl_entity(dl_se); 1237 + dl_server_stopped(dl_se); 1176 1238 return HRTIMER_NORESTART; 1177 1239 } 1178 1240 ··· 1279 1339 goto unlock; 1280 1340 } 1281 1341 1282 - #ifdef CONFIG_SMP 1283 1342 if (unlikely(!rq->online)) { 1284 1343 /* 1285 1344 * If the runqueue is no longer available, migrate the ··· 1295 1356 * there. 1296 1357 */ 1297 1358 } 1298 - #endif 1299 1359 1300 1360 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 1301 1361 if (dl_task(rq->donor)) ··· 1538 1600 rt_rq->rt_time += delta_exec; 1539 1601 raw_spin_unlock(&rt_rq->rt_runtime_lock); 1540 1602 } 1541 - #endif 1603 + #endif /* CONFIG_RT_GROUP_SCHED */ 1542 1604 } 1543 1605 1544 1606 /* ··· 1577 1639 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) 1578 1640 { 1579 1641 /* 0 runtime = fair server disabled */ 1580 - if (dl_se->dl_runtime) 1642 + if (dl_se->dl_runtime) { 1643 + dl_se->dl_server_idle = 0; 1581 1644 update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1645 + } 1582 1646 } 1583 1647 1584 1648 void dl_server_start(struct sched_dl_entity *dl_se) 1585 1649 { 1586 1650 struct rq *rq = dl_se->rq; 1587 1651 1588 - /* 1589 - * XXX: the apply do not work fine at the init phase for the 1590 - * fair server because things are not yet set. We need to improve 1591 - * this before getting generic. 
1592 - */ 1593 - if (!dl_server(dl_se)) { 1594 - u64 runtime = 50 * NSEC_PER_MSEC; 1595 - u64 period = 1000 * NSEC_PER_MSEC; 1596 - 1597 - dl_server_apply_params(dl_se, runtime, period, 1); 1598 - 1599 - dl_se->dl_server = 1; 1600 - dl_se->dl_defer = 1; 1601 - setup_new_dl_entity(dl_se); 1602 - } 1603 - 1604 - if (!dl_se->dl_runtime) 1652 + if (!dl_server(dl_se) || dl_se->dl_server_active) 1605 1653 return; 1606 1654 1607 1655 dl_se->dl_server_active = 1; ··· 1598 1674 1599 1675 void dl_server_stop(struct sched_dl_entity *dl_se) 1600 1676 { 1601 - if (!dl_se->dl_runtime) 1677 + if (!dl_server(dl_se) || !dl_server_active(dl_se)) 1602 1678 return; 1603 1679 1604 1680 dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); ··· 1608 1684 dl_se->dl_server_active = 0; 1609 1685 } 1610 1686 1687 + static bool dl_server_stopped(struct sched_dl_entity *dl_se) 1688 + { 1689 + if (!dl_se->dl_server_active) 1690 + return false; 1691 + 1692 + if (dl_se->dl_server_idle) { 1693 + dl_server_stop(dl_se); 1694 + return true; 1695 + } 1696 + 1697 + dl_se->dl_server_idle = 1; 1698 + return false; 1699 + } 1700 + 1611 1701 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, 1612 1702 dl_server_has_tasks_f has_tasks, 1613 1703 dl_server_pick_f pick_task) ··· 1629 1691 dl_se->rq = rq; 1630 1692 dl_se->server_has_tasks = has_tasks; 1631 1693 dl_se->server_pick_task = pick_task; 1694 + } 1695 + 1696 + void sched_init_dl_servers(void) 1697 + { 1698 + int cpu; 1699 + struct rq *rq; 1700 + struct sched_dl_entity *dl_se; 1701 + 1702 + for_each_online_cpu(cpu) { 1703 + u64 runtime = 50 * NSEC_PER_MSEC; 1704 + u64 period = 1000 * NSEC_PER_MSEC; 1705 + 1706 + rq = cpu_rq(cpu); 1707 + 1708 + guard(rq_lock_irq)(rq); 1709 + 1710 + dl_se = &rq->fair_server; 1711 + 1712 + WARN_ON(dl_server(dl_se)); 1713 + 1714 + dl_server_apply_params(dl_se, runtime, period, 1); 1715 + 1716 + dl_se->dl_server = 1; 1717 + dl_se->dl_defer = 1; 1718 + setup_new_dl_entity(dl_se); 1719 + } 1632 1720 } 1633 1721 1634 1722 
void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) ··· 1808 1844 #define __node_2_dle(node) \ 1809 1845 rb_entry((node), struct sched_dl_entity, rb_node) 1810 1846 1811 - #ifdef CONFIG_SMP 1812 - 1813 1847 static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) 1814 1848 { 1815 1849 struct rq *rq = rq_of_dl_rq(dl_rq); ··· 1842 1880 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); 1843 1881 } 1844 1882 } 1845 - 1846 - #else 1847 - 1848 - static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} 1849 - static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} 1850 - 1851 - #endif /* CONFIG_SMP */ 1852 1883 1853 1884 static inline 1854 1885 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) ··· 2121 2166 if (dl_server(&p->dl)) 2122 2167 return; 2123 2168 2169 + if (task_is_blocked(p)) 2170 + return; 2171 + 2124 2172 if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) 2125 2173 enqueue_pushable_dl_task(rq, p); 2126 2174 } ··· 2171 2213 */ 2172 2214 rq_clock_skip_update(rq); 2173 2215 } 2174 - 2175 - #ifdef CONFIG_SMP 2176 2216 2177 2217 static inline bool dl_task_is_earliest_deadline(struct task_struct *p, 2178 2218 struct rq *rq) ··· 2301 2345 2302 2346 return sched_stop_runnable(rq) || sched_dl_runnable(rq); 2303 2347 } 2304 - #endif /* CONFIG_SMP */ 2305 2348 2306 2349 /* 2307 2350 * Only called when both the current and waking task are -deadline ··· 2314 2359 return; 2315 2360 } 2316 2361 2317 - #ifdef CONFIG_SMP 2318 2362 /* 2319 2363 * In the unlikely case current and p have the same deadline 2320 2364 * let us try to decide what's the best thing to do... 
··· 2321 2367 if ((p->dl.deadline == rq->donor->dl.deadline) && 2322 2368 !test_tsk_need_resched(rq->curr)) 2323 2369 check_preempt_equal_dl(rq, p); 2324 - #endif /* CONFIG_SMP */ 2325 2370 } 2326 2371 2327 2372 #ifdef CONFIG_SCHED_HRTICK ··· 2328 2375 { 2329 2376 hrtick_start(rq, dl_se->runtime); 2330 2377 } 2331 - #else /* !CONFIG_SCHED_HRTICK */ 2378 + #else /* !CONFIG_SCHED_HRTICK: */ 2332 2379 static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) 2333 2380 { 2334 2381 } 2335 - #endif 2382 + #endif /* !CONFIG_SCHED_HRTICK */ 2336 2383 2337 2384 static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) 2338 2385 { ··· 2388 2435 if (dl_server(dl_se)) { 2389 2436 p = dl_se->server_pick_task(dl_se); 2390 2437 if (!p) { 2391 - if (dl_server_active(dl_se)) { 2438 + if (!dl_server_stopped(dl_se)) { 2392 2439 dl_se->dl_yielded = 1; 2393 2440 update_curr_dl_se(rq, dl_se, 0); 2394 2441 } ··· 2418 2465 update_curr_dl(rq); 2419 2466 2420 2467 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); 2468 + 2469 + if (task_is_blocked(p)) 2470 + return; 2471 + 2421 2472 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) 2422 2473 enqueue_pushable_dl_task(rq, p); 2423 2474 } ··· 2456 2499 * sched_fork() 2457 2500 */ 2458 2501 } 2459 - 2460 - #ifdef CONFIG_SMP 2461 2502 2462 2503 /* Only try algorithms three times */ 2463 2504 #define DL_MAX_TRIES 3 ··· 2931 2976 int i; 2932 2977 2933 2978 guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); 2979 + 2980 + /* 2981 + * Reset total_bw to zero and extra_bw to max_bw so that next 2982 + * loop will add dl-servers contributions back properly, 2983 + */ 2934 2984 rd->dl_bw.total_bw = 0; 2985 + for_each_cpu(i, rd->span) 2986 + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; 2935 2987 2936 2988 /* 2937 2989 * dl_servers are not tasks. 
Since dl_add_task_root_domain ignores ··· 2956 2994 { 2957 2995 dl_clear_root_domain(cpu_rq(cpu)->rd); 2958 2996 } 2959 - 2960 - #endif /* CONFIG_SMP */ 2961 2997 2962 2998 static void switched_from_dl(struct rq *rq, struct task_struct *p) 2963 2999 { ··· 3029 3069 } 3030 3070 3031 3071 if (rq->donor != p) { 3032 - #ifdef CONFIG_SMP 3033 3072 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 3034 3073 deadline_queue_push_tasks(rq); 3035 - #endif 3036 3074 if (dl_task(rq->donor)) 3037 3075 wakeup_preempt_dl(rq, p, 0); 3038 3076 else ··· 3050 3092 if (!task_on_rq_queued(p)) 3051 3093 return; 3052 3094 3053 - #ifdef CONFIG_SMP 3054 3095 /* 3055 3096 * This might be too much, but unfortunately 3056 3097 * we don't have the old deadline value, and ··· 3078 3121 dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) 3079 3122 resched_curr(rq); 3080 3123 } 3081 - #else 3082 - /* 3083 - * We don't know if p has a earlier or later deadline, so let's blindly 3084 - * set a (maybe not needed) rescheduling point. 
3085 - */ 3086 - resched_curr(rq); 3087 - #endif 3088 3124 } 3089 3125 3090 3126 #ifdef CONFIG_SCHED_CORE ··· 3099 3149 .put_prev_task = put_prev_task_dl, 3100 3150 .set_next_task = set_next_task_dl, 3101 3151 3102 - #ifdef CONFIG_SMP 3103 3152 .balance = balance_dl, 3104 3153 .select_task_rq = select_task_rq_dl, 3105 3154 .migrate_task_rq = migrate_task_rq_dl, ··· 3107 3158 .rq_offline = rq_offline_dl, 3108 3159 .task_woken = task_woken_dl, 3109 3160 .find_lock_rq = find_lock_later_rq, 3110 - #endif 3111 3161 3112 3162 .task_tick = task_tick_dl, 3113 3163 .task_fork = task_fork_dl, ··· 3190 3242 if (global_rt_runtime() != RUNTIME_INF) 3191 3243 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 3192 3244 3245 + for_each_possible_cpu(cpu) 3246 + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); 3247 + 3193 3248 for_each_possible_cpu(cpu) { 3194 3249 rcu_read_lock_sched(); 3195 3250 ··· 3208 3257 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 3209 3258 3210 3259 rcu_read_unlock_sched(); 3211 - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); 3212 3260 } 3213 3261 } 3214 3262 ··· 3408 3458 return false; 3409 3459 } 3410 3460 3411 - #ifdef CONFIG_SMP 3412 3461 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, 3413 3462 const struct cpumask *trial) 3414 3463 { ··· 3519 3570 { 3520 3571 dl_bw_manage(dl_bw_req_free, cpu, dl_bw); 3521 3572 } 3522 - #endif 3523 3573 3524 3574 void print_dl_stats(struct seq_file *m, int cpu) 3525 3575 {

+12 -35

kernel/sched/debug.c

··· 6 6 * 7 7 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 8 8 */ 9 + #include <linux/debugfs.h> 10 + #include <linux/nmi.h> 11 + #include "sched.h" 9 12 10 13 /* 11 14 * This allows printing both to /sys/kernel/debug/sched/debug and ··· 93 90 { 94 91 static_key_enable_cpuslocked(&sched_feat_keys[i]); 95 92 } 96 - #else 93 + #else /* !CONFIG_JUMP_LABEL: */ 97 94 static void sched_feat_disable(int i) { }; 98 95 static void sched_feat_enable(int i) { }; 99 - #endif /* CONFIG_JUMP_LABEL */ 96 + #endif /* !CONFIG_JUMP_LABEL */ 100 97 101 98 static int sched_feat_set(char *cmp) 102 99 { ··· 169 166 .release = single_release, 170 167 }; 171 168 172 - #ifdef CONFIG_SMP 173 - 174 169 static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, 175 170 size_t cnt, loff_t *ppos) 176 171 { ··· 214 213 .llseek = seq_lseek, 215 214 .release = single_release, 216 215 }; 217 - 218 - #endif /* SMP */ 219 216 220 217 #ifdef CONFIG_PREEMPT_DYNAMIC 221 218 ··· 282 283 283 284 __read_mostly bool sched_debug_verbose; 284 285 285 - #ifdef CONFIG_SMP 286 286 static struct dentry *sd_dentry; 287 287 288 288 ··· 309 311 310 312 return result; 311 313 } 312 - #else 313 - #define sched_verbose_write debugfs_write_file_bool 314 - #endif 315 314 316 315 static const struct file_operations sched_verbose_fops = { 317 316 .read = debugfs_read_file_bool, ··· 507 512 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); 508 513 debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); 509 514 510 - #ifdef CONFIG_SMP 511 515 debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); 512 516 debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); 513 517 debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); ··· 514 520 sched_domains_mutex_lock(); 515 521 update_sched_domain_debugfs(); 516 522 
sched_domains_mutex_unlock(); 517 - #endif 518 523 519 524 #ifdef CONFIG_NUMA_BALANCING 520 525 numa = debugfs_create_dir("numa_balancing", debugfs_sched); ··· 523 530 debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); 524 531 debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); 525 532 debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); 526 - #endif 533 + #endif /* CONFIG_NUMA_BALANCING */ 527 534 528 535 debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); 529 536 ··· 532 539 return 0; 533 540 } 534 541 late_initcall(sched_init_debug); 535 - 536 - #ifdef CONFIG_SMP 537 542 538 543 static cpumask_var_t sd_sysctl_cpus; 539 544 ··· 643 652 __cpumask_set_cpu(cpu, sd_sysctl_cpus); 644 653 } 645 654 646 - #endif /* CONFIG_SMP */ 647 - 648 655 #ifdef CONFIG_FAIR_GROUP_SCHED 649 656 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 650 657 { ··· 679 690 } 680 691 681 692 P(se->load.weight); 682 - #ifdef CONFIG_SMP 683 693 P(se->avg.load_avg); 684 694 P(se->avg.util_avg); 685 695 P(se->avg.runnable_avg); 686 - #endif 687 696 688 697 #undef PN_SCHEDSTAT 689 698 #undef PN 690 699 #undef P_SCHEDSTAT 691 700 #undef P 692 701 } 693 - #endif 702 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 694 703 695 704 #ifdef CONFIG_CGROUP_SCHED 696 705 static DEFINE_SPINLOCK(sched_debug_lock); ··· 841 854 SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); 842 855 SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); 843 856 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 844 - #ifdef CONFIG_SMP 845 857 SEQ_printf(m, " .%-30s: %lu\n", "load_avg", 846 858 cfs_rq->avg.load_avg); 847 859 SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", ··· 860 874 cfs_rq->tg_load_avg_contrib); 861 875 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", 862 876 atomic_long_read(&cfs_rq->tg->load_avg)); 863 - #endif 
864 - #endif 877 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 865 878 #ifdef CONFIG_CFS_BANDWIDTH 866 879 SEQ_printf(m, " .%-30s: %d\n", "throttled", 867 880 cfs_rq->throttled); ··· 914 929 SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) 915 930 916 931 PU(dl_nr_running); 917 - #ifdef CONFIG_SMP 918 932 dl_bw = &cpu_rq(cpu)->rd->dl_bw; 919 - #else 920 - dl_bw = &dl_rq->dl_bw; 921 - #endif 922 933 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); 923 934 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); 924 935 ··· 932 951 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", 933 952 cpu, freq / 1000, (freq % 1000)); 934 953 } 935 - #else 954 + #else /* !CONFIG_X86: */ 936 955 SEQ_printf(m, "cpu#%d\n", cpu); 937 - #endif 956 + #endif /* !CONFIG_X86 */ 938 957 939 958 #define P(x) \ 940 959 do { \ ··· 957 976 #undef P 958 977 #undef PN 959 978 960 - #ifdef CONFIG_SMP 961 979 #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); 962 980 P64(avg_idle); 963 981 P64(max_idle_balance_cost); 964 982 #undef P64 965 - #endif 966 983 967 984 #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); 968 985 if (schedstat_enabled()) { ··· 1142 1163 SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", 1143 1164 task_node(p), task_numa_group_id(p)); 1144 1165 show_numa_stats(p, m); 1145 - #endif 1166 + #endif /* CONFIG_NUMA_BALANCING */ 1146 1167 } 1147 1168 1148 1169 void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ··· 1226 1247 __PS("nr_involuntary_switches", p->nivcsw); 1227 1248 1228 1249 P(se.load.weight); 1229 - #ifdef CONFIG_SMP 1230 1250 P(se.avg.load_sum); 1231 1251 P(se.avg.runnable_sum); 1232 1252 P(se.avg.util_sum); ··· 1234 1256 P(se.avg.util_avg); 1235 1257 P(se.avg.last_update_time); 1236 1258 PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); 1237 - #endif 1238 1259 #ifdef CONFIG_UCLAMP_TASK 1239 1260 __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); 1240 1261 __PS("uclamp.max", 
p->uclamp_req[UCLAMP_MAX].value); 1241 1262 __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); 1242 1263 __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); 1243 - #endif 1264 + #endif /* CONFIG_UCLAMP_TASK */ 1244 1265 P(policy); 1245 1266 P(prio); 1246 1267 if (task_has_dl_policy(p)) {

+149 -259

kernel/sched/fair.c

··· 88 88 } 89 89 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); 90 90 91 - #ifdef CONFIG_SMP 92 91 /* 93 92 * For asym packing, by default the lower numbered CPU has higher priority. 94 93 */ ··· 110 111 * (default: ~5%) 111 112 */ 112 113 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) 113 - #endif 114 114 115 115 #ifdef CONFIG_CFS_BANDWIDTH 116 116 /* ··· 160 162 return 0; 161 163 } 162 164 late_initcall(sched_fair_sysctl_init); 163 - #endif 165 + #endif /* CONFIG_SYSCTL */ 164 166 165 167 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 166 168 { ··· 469 471 return cfs_rq_is_idle(group_cfs_rq(se)); 470 472 } 471 473 472 - #else /* !CONFIG_FAIR_GROUP_SCHED */ 474 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 473 475 474 476 #define for_each_sched_entity(se) \ 475 477 for (; se; se = NULL) ··· 515 517 return task_has_idle_policy(task_of(se)); 516 518 } 517 519 518 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 520 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 519 521 520 522 static __always_inline 521 523 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); ··· 882 884 } 883 885 884 886 /* 885 - * HACK, stash a copy of deadline at the point of pick in vlag, 886 - * which isn't used until dequeue. 887 + * Set the vruntime up to which an entity can run before looking 888 + * for another entity to pick. 889 + * In case of run to parity, we use the shortest slice of the enqueued 890 + * entities to set the protected period. 891 + * When run to parity is disabled, we give a minimum quantum to the running 892 + * entity to ensure progress. 
887 893 */ 888 - static inline void set_protect_slice(struct sched_entity *se) 894 + static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 889 895 { 890 - se->vlag = se->deadline; 896 + u64 slice = normalized_sysctl_sched_base_slice; 897 + u64 vprot = se->deadline; 898 + 899 + if (sched_feat(RUN_TO_PARITY)) 900 + slice = cfs_rq_min_slice(cfs_rq); 901 + 902 + slice = min(slice, se->slice); 903 + if (slice != se->slice) 904 + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); 905 + 906 + se->vprot = vprot; 907 + } 908 + 909 + static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 910 + { 911 + u64 slice = cfs_rq_min_slice(cfs_rq); 912 + 913 + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); 891 914 } 892 915 893 916 static inline bool protect_slice(struct sched_entity *se) 894 917 { 895 - return se->vlag == se->deadline; 918 + return ((s64)(se->vprot - se->vruntime) > 0); 896 919 } 897 920 898 921 static inline void cancel_protect_slice(struct sched_entity *se) 899 922 { 900 923 if (protect_slice(se)) 901 - se->vlag = se->deadline + 1; 924 + se->vprot = se->vruntime; 902 925 } 903 926 904 927 /* ··· 941 922 * 942 923 * Which allows tree pruning through eligibility. 
943 924 */ 944 - static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) 925 + static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) 945 926 { 946 927 struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; 947 928 struct sched_entity *se = __pick_first_entity(cfs_rq); ··· 958 939 if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) 959 940 curr = NULL; 960 941 961 - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) 942 + if (curr && protect && protect_slice(curr)) 962 943 return curr; 963 944 964 945 /* Pick the leftmost entity if it's eligible */ ··· 1002 983 return best; 1003 984 } 1004 985 986 + static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) 987 + { 988 + return __pick_eevdf(cfs_rq, true); 989 + } 990 + 1005 991 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 1006 992 { 1007 993 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); ··· 1020 996 /************************************************************** 1021 997 * Scheduling class statistics methods: 1022 998 */ 1023 - #ifdef CONFIG_SMP 1024 999 int sched_update_scaling(void) 1025 1000 { 1026 1001 unsigned int factor = get_update_sysctl_factor(); ··· 1031 1008 1032 1009 return 0; 1033 1010 } 1034 - #endif 1035 1011 1036 1012 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); 1037 1013 ··· 1063 1041 } 1064 1042 1065 1043 #include "pelt.h" 1066 - #ifdef CONFIG_SMP 1067 1044 1068 1045 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 1069 1046 static unsigned long task_h_load(struct task_struct *p); ··· 1152 1131 sa->runnable_avg = sa->util_avg; 1153 1132 } 1154 1133 1155 - #else /* !CONFIG_SMP */ 1156 - void init_entity_runnable_average(struct sched_entity *se) 1157 - { 1158 - } 1159 - void post_init_entity_util_avg(struct task_struct *p) 1160 - { 1161 - } 1162 - static void update_tg_load_avg(struct cfs_rq *cfs_rq) 1163 - { 1164 - } 1165 - #endif /* CONFIG_SMP 
*/ 1166 - 1167 - static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) 1134 + static s64 update_se(struct rq *rq, struct sched_entity *se) 1168 1135 { 1169 1136 u64 now = rq_clock_task(rq); 1170 1137 s64 delta_exec; 1171 1138 1172 - delta_exec = now - curr->exec_start; 1139 + delta_exec = now - se->exec_start; 1173 1140 if (unlikely(delta_exec <= 0)) 1174 1141 return delta_exec; 1175 1142 1176 - curr->exec_start = now; 1177 - curr->sum_exec_runtime += delta_exec; 1143 + se->exec_start = now; 1144 + if (entity_is_task(se)) { 1145 + struct task_struct *donor = task_of(se); 1146 + struct task_struct *running = rq->curr; 1147 + /* 1148 + * If se is a task, we account the time against the running 1149 + * task, as w/ proxy-exec they may not be the same. 1150 + */ 1151 + running->se.exec_start = now; 1152 + running->se.sum_exec_runtime += delta_exec; 1153 + 1154 + trace_sched_stat_runtime(running, delta_exec); 1155 + account_group_exec_runtime(running, delta_exec); 1156 + 1157 + /* cgroup time is always accounted against the donor */ 1158 + cgroup_account_cputime(donor, delta_exec); 1159 + } else { 1160 + /* If not task, account the time against donor se */ 1161 + se->sum_exec_runtime += delta_exec; 1162 + } 1178 1163 1179 1164 if (schedstat_enabled()) { 1180 1165 struct sched_statistics *stats; 1181 1166 1182 - stats = __schedstats_from_se(curr); 1167 + stats = __schedstats_from_se(se); 1183 1168 __schedstat_set(stats->exec_max, 1184 1169 max(delta_exec, stats->exec_max)); 1185 1170 } ··· 1193 1166 return delta_exec; 1194 1167 } 1195 1168 1196 - static inline void update_curr_task(struct task_struct *p, s64 delta_exec) 1197 - { 1198 - trace_sched_stat_runtime(p, delta_exec); 1199 - account_group_exec_runtime(p, delta_exec); 1200 - cgroup_account_cputime(p, delta_exec); 1201 - } 1202 - 1203 - static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1204 - { 1205 - if (!sched_feat(PREEMPT_SHORT)) 1206 - return false; 1207 - 1208 
- if (curr->vlag == curr->deadline) 1209 - return false; 1210 - 1211 - return !entity_eligible(cfs_rq, curr); 1212 - } 1213 - 1214 - static inline bool do_preempt_short(struct cfs_rq *cfs_rq, 1215 - struct sched_entity *pse, struct sched_entity *se) 1216 - { 1217 - if (!sched_feat(PREEMPT_SHORT)) 1218 - return false; 1219 - 1220 - if (pse->slice >= se->slice) 1221 - return false; 1222 - 1223 - if (!entity_eligible(cfs_rq, pse)) 1224 - return false; 1225 - 1226 - if (entity_before(pse, se)) 1227 - return true; 1228 - 1229 - if (!entity_eligible(cfs_rq, se)) 1230 - return true; 1231 - 1232 - return false; 1233 - } 1234 - 1235 1169 /* 1236 1170 * Used by other classes to account runtime. 1237 1171 */ 1238 1172 s64 update_curr_common(struct rq *rq) 1239 1173 { 1240 - struct task_struct *donor = rq->donor; 1241 - s64 delta_exec; 1242 - 1243 - delta_exec = update_curr_se(rq, &donor->se); 1244 - if (likely(delta_exec > 0)) 1245 - update_curr_task(donor, delta_exec); 1246 - 1247 - return delta_exec; 1174 + return update_se(rq, &rq->donor->se); 1248 1175 } 1249 1176 1250 1177 /* ··· 1206 1225 */ 1207 1226 static void update_curr(struct cfs_rq *cfs_rq) 1208 1227 { 1228 + /* 1229 + * Note: cfs_rq->curr corresponds to the task picked to 1230 + * run (ie: rq->donor.se) which due to proxy-exec may 1231 + * not necessarily be the actual task running 1232 + * (rq->curr.se). This is easy to confuse! 
1233 + */ 1209 1234 struct sched_entity *curr = cfs_rq->curr; 1210 1235 struct rq *rq = rq_of(cfs_rq); 1211 1236 s64 delta_exec; ··· 1220 1233 if (unlikely(!curr)) 1221 1234 return; 1222 1235 1223 - delta_exec = update_curr_se(rq, curr); 1236 + delta_exec = update_se(rq, curr); 1224 1237 if (unlikely(delta_exec <= 0)) 1225 1238 return; 1226 1239 ··· 1229 1242 update_min_vruntime(cfs_rq); 1230 1243 1231 1244 if (entity_is_task(curr)) { 1232 - struct task_struct *p = task_of(curr); 1233 - 1234 - update_curr_task(p, delta_exec); 1235 - 1236 1245 /* 1237 1246 * If the fair_server is active, we need to account for the 1238 1247 * fair_server time whether or not the task is running on ··· 1248 1265 if (cfs_rq->nr_queued == 1) 1249 1266 return; 1250 1267 1251 - if (resched || did_preempt_short(cfs_rq, curr)) { 1268 + if (resched || !protect_slice(curr)) { 1252 1269 resched_curr_lazy(rq); 1253 1270 clear_buddies(cfs_rq, curr); 1254 1271 } ··· 2097 2114 2098 2115 return idle_core; 2099 2116 } 2100 - #else 2117 + #else /* !CONFIG_SCHED_SMT: */ 2101 2118 static inline int numa_idle_core(int idle_core, int cpu) 2102 2119 { 2103 2120 return idle_core; 2104 2121 } 2105 - #endif 2122 + #endif /* !CONFIG_SCHED_SMT */ 2106 2123 2107 2124 /* 2108 2125 * Gather all necessary information to make NUMA balancing placement ··· 3656 3673 p->numa_scan_period = task_scan_start(p); 3657 3674 } 3658 3675 3659 - #else 3676 + #else /* !CONFIG_NUMA_BALANCING: */ 3677 + 3660 3678 static void task_tick_numa(struct rq *rq, struct task_struct *curr) 3661 3679 { 3662 3680 } ··· 3674 3690 { 3675 3691 } 3676 3692 3677 - #endif /* CONFIG_NUMA_BALANCING */ 3693 + #endif /* !CONFIG_NUMA_BALANCING */ 3678 3694 3679 3695 static void 3680 3696 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 3681 3697 { 3682 3698 update_load_add(&cfs_rq->load, se->load.weight); 3683 - #ifdef CONFIG_SMP 3684 3699 if (entity_is_task(se)) { 3685 3700 struct rq *rq = rq_of(cfs_rq); 3686 3701 3687 3702 
account_numa_enqueue(rq, task_of(se)); 3688 3703 list_add(&se->group_node, &rq->cfs_tasks); 3689 3704 } 3690 - #endif 3691 3705 cfs_rq->nr_queued++; 3692 3706 } 3693 3707 ··· 3693 3711 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 3694 3712 { 3695 3713 update_load_sub(&cfs_rq->load, se->load.weight); 3696 - #ifdef CONFIG_SMP 3697 3714 if (entity_is_task(se)) { 3698 3715 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 3699 3716 list_del_init(&se->group_node); 3700 3717 } 3701 - #endif 3702 3718 cfs_rq->nr_queued--; 3703 3719 } 3704 3720 ··· 3748 3768 *ptr -= min_t(typeof(*ptr), *ptr, _val); \ 3749 3769 } while (0) 3750 3770 3751 - #ifdef CONFIG_SMP 3752 3771 static inline void 3753 3772 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3754 3773 { ··· 3764 3785 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, 3765 3786 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); 3766 3787 } 3767 - #else 3768 - static inline void 3769 - enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3770 - static inline void 3771 - dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3772 - #endif 3773 3788 3774 3789 static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); 3775 3790 ··· 3795 3822 3796 3823 update_load_set(&se->load, weight); 3797 3824 3798 - #ifdef CONFIG_SMP 3799 3825 do { 3800 3826 u32 divider = get_pelt_divider(&se->avg); 3801 3827 3802 3828 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 3803 3829 } while (0); 3804 - #endif 3805 3830 3806 3831 enqueue_load_avg(cfs_rq, se); 3807 3832 if (se->on_rq) { ··· 3834 3863 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 3835 3864 3836 3865 #ifdef CONFIG_FAIR_GROUP_SCHED 3837 - #ifdef CONFIG_SMP 3838 3866 /* 3839 3867 * All this does is approximate the hierarchical proportion which includes that 3840 3868 * global sum we all love to hate. 
··· 3940 3970 */ 3941 3971 return clamp_t(long, shares, MIN_SHARES, tg_shares); 3942 3972 } 3943 - #endif /* CONFIG_SMP */ 3944 3973 3945 3974 /* 3946 3975 * Recomputes the group entity based on the current state of its group ··· 3960 3991 if (throttled_hierarchy(gcfs_rq)) 3961 3992 return; 3962 3993 3963 - #ifndef CONFIG_SMP 3964 - shares = READ_ONCE(gcfs_rq->tg->shares); 3965 - #else 3966 3994 shares = calc_group_shares(gcfs_rq); 3967 - #endif 3968 3995 if (unlikely(se->load.weight != shares)) 3969 3996 reweight_entity(cfs_rq_of(se), se, shares); 3970 3997 } 3971 3998 3972 - #else /* CONFIG_FAIR_GROUP_SCHED */ 3999 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 3973 4000 static inline void update_cfs_group(struct sched_entity *se) 3974 4001 { 3975 4002 } 3976 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 4003 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 3977 4004 3978 4005 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) 3979 4006 { ··· 3994 4029 } 3995 4030 } 3996 4031 3997 - #ifdef CONFIG_SMP 3998 4032 static inline bool load_avg_is_decayed(struct sched_avg *sa) 3999 4033 { 4000 4034 if (sa->load_sum) ··· 4445 4481 return true; 4446 4482 } 4447 4483 4448 - #else /* CONFIG_FAIR_GROUP_SCHED */ 4484 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 4449 4485 4450 4486 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} 4451 4487 ··· 4458 4494 4459 4495 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} 4460 4496 4461 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 4497 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 4462 4498 4463 4499 #ifdef CONFIG_NO_HZ_COMMON 4464 4500 static inline void migrate_se_pelt_lag(struct sched_entity *se) ··· 4539 4575 4540 4576 __update_load_avg_blocked_se(now, se); 4541 4577 } 4542 - #else 4578 + #else /* !CONFIG_NO_HZ_COMMON: */ 4543 4579 static void migrate_se_pelt_lag(struct sched_entity *se) {} 4544 - #endif 4580 + #endif /* !CONFIG_NO_HZ_COMMON */ 4545 4581 4546 4582 /** 4547 4583 * update_cfs_rq_load_avg 
- update the cfs_rq's load/util averages ··· 5108 5144 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); 5109 5145 } 5110 5146 5111 - #else /* CONFIG_SMP */ 5112 - 5113 - static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 5114 - { 5115 - return !cfs_rq->nr_queued; 5116 - } 5117 - 5118 - #define UPDATE_TG 0x0 5119 - #define SKIP_AGE_LOAD 0x0 5120 - #define DO_ATTACH 0x0 5121 - #define DO_DETACH 0x0 5122 - 5123 - static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 5124 - { 5125 - cfs_rq_util_change(cfs_rq, 0); 5126 - } 5127 - 5128 - static inline void remove_entity_load_avg(struct sched_entity *se) {} 5129 - 5130 - static inline void 5131 - attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 5132 - static inline void 5133 - detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 5134 - 5135 - static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) 5136 - { 5137 - return 0; 5138 - } 5139 - 5140 - static inline void 5141 - util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} 5142 - 5143 - static inline void 5144 - util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} 5145 - 5146 - static inline void 5147 - util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, 5148 - bool task_sleep) {} 5149 - static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} 5150 - 5151 - #endif /* CONFIG_SMP */ 5152 - 5153 5147 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) 5154 5148 { 5155 5149 struct sched_entity *se = &p->se; ··· 5175 5253 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) 5176 5254 * = (W*V + w_i*(V - vl_i)) / (W + w_i) 5177 5255 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) 5178 - * = (V*(W + w_i) - w_i*l) / (W + w_i) 5256 + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) 5179 5257 * = V - w_i*vl_i / (W + w_i) 5180 5258 * 5181 5259 * And the actual lag after adding an entity with 
vl_i is: ··· 5472 5550 __dequeue_entity(cfs_rq, se); 5473 5551 update_load_avg(cfs_rq, se, UPDATE_TG); 5474 5552 5475 - set_protect_slice(se); 5553 + set_protect_slice(cfs_rq, se); 5476 5554 } 5477 5555 5478 5556 update_stats_curr_start(cfs_rq, se); ··· 5607 5685 { 5608 5686 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); 5609 5687 } 5610 - #else /* CONFIG_JUMP_LABEL */ 5688 + #else /* !CONFIG_JUMP_LABEL: */ 5611 5689 static bool cfs_bandwidth_used(void) 5612 5690 { 5613 5691 return true; ··· 5615 5693 5616 5694 void cfs_bandwidth_usage_inc(void) {} 5617 5695 void cfs_bandwidth_usage_dec(void) {} 5618 - #endif /* CONFIG_JUMP_LABEL */ 5619 - 5620 - /* 5621 - * default period for cfs group bandwidth. 5622 - * default: 0.1s, units: nanoseconds 5623 - */ 5624 - static inline u64 default_cfs_period(void) 5625 - { 5626 - return 100000000ULL; 5627 - } 5696 + #endif /* !CONFIG_JUMP_LABEL */ 5628 5697 5629 5698 static inline u64 sched_cfs_bandwidth_slice(void) 5630 5699 { ··· 5802 5889 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5803 5890 struct sched_entity *se; 5804 5891 long queued_delta, runnable_delta, idle_delta, dequeue = 1; 5805 - long rq_h_nr_queued = rq->cfs.h_nr_queued; 5806 5892 5807 5893 raw_spin_lock(&cfs_b->lock); 5808 5894 /* This will start the period timer if necessary */ ··· 5885 5973 5886 5974 /* At this point se is NULL and we are at root level*/ 5887 5975 sub_nr_running(rq, queued_delta); 5888 - 5889 - /* Stop the fair server if throttling resulted in no runnable tasks */ 5890 - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) 5891 - dl_server_stop(&rq->fair_server); 5892 5976 done: 5893 5977 /* 5894 5978 * Note: distribution will already see us throttled via the ··· 5996 6088 resched_curr(rq); 5997 6089 } 5998 6090 5999 - #ifdef CONFIG_SMP 6000 6091 static void __cfsb_csd_unthrottle(void *arg) 6001 6092 { 6002 6093 struct cfs_rq *cursor, *tmp; ··· 6054 6147 if (first) 6055 6148 smp_call_function_single_async(cpu_of(rq), 
&rq->cfsb_csd); 6056 6149 } 6057 - #else 6058 - static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) 6059 - { 6060 - unthrottle_cfs_rq(cfs_rq); 6061 - } 6062 - #endif 6063 6150 6064 6151 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) 6065 6152 { ··· 6391 6490 return HRTIMER_NORESTART; 6392 6491 } 6393 6492 6394 - extern const u64 max_cfs_quota_period; 6395 - 6396 6493 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 6397 6494 { 6398 6495 struct cfs_bandwidth *cfs_b = ··· 6417 6518 * to fail. 6418 6519 */ 6419 6520 new = old * 2; 6420 - if (new < max_cfs_quota_period) { 6521 + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { 6421 6522 cfs_b->period = ns_to_ktime(new); 6422 6523 cfs_b->quota *= 2; 6423 6524 cfs_b->burst *= 2; ··· 6451 6552 raw_spin_lock_init(&cfs_b->lock); 6452 6553 cfs_b->runtime = 0; 6453 6554 cfs_b->quota = RUNTIME_INF; 6454 - cfs_b->period = ns_to_ktime(default_cfs_period()); 6555 + cfs_b->period = us_to_ktime(default_bw_period_us()); 6455 6556 cfs_b->burst = 0; 6456 6557 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; 6457 6558 ··· 6507 6608 * guaranteed at this point that no additional cfs_rq of this group can 6508 6609 * join a CSD list. 
6509 6610 */ 6510 - #ifdef CONFIG_SMP 6511 6611 for_each_possible_cpu(i) { 6512 6612 struct rq *rq = cpu_rq(i); 6513 6613 unsigned long flags; ··· 6518 6620 __cfsb_csd_unthrottle(rq); 6519 6621 local_irq_restore(flags); 6520 6622 } 6521 - #endif 6522 6623 } 6523 6624 6524 6625 /* ··· 6630 6733 if (cfs_task_bw_constrained(p)) 6631 6734 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 6632 6735 } 6633 - #endif 6736 + #endif /* CONFIG_NO_HZ_FULL */ 6634 6737 6635 - #else /* CONFIG_CFS_BANDWIDTH */ 6738 + #else /* !CONFIG_CFS_BANDWIDTH: */ 6636 6739 6637 6740 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 6638 6741 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } ··· 6674 6777 return false; 6675 6778 } 6676 6779 #endif 6677 - #endif /* CONFIG_CFS_BANDWIDTH */ 6780 + #endif /* !CONFIG_CFS_BANDWIDTH */ 6678 6781 6679 6782 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) 6680 6783 static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} ··· 6719 6822 6720 6823 hrtick_start_fair(rq, donor); 6721 6824 } 6722 - #else /* !CONFIG_SCHED_HRTICK */ 6825 + #else /* !CONFIG_SCHED_HRTICK: */ 6723 6826 static inline void 6724 6827 hrtick_start_fair(struct rq *rq, struct task_struct *p) 6725 6828 { ··· 6728 6831 static inline void hrtick_update(struct rq *rq) 6729 6832 { 6730 6833 } 6731 - #endif 6834 + #endif /* !CONFIG_SCHED_HRTICK */ 6732 6835 6733 - #ifdef CONFIG_SMP 6734 6836 static inline bool cpu_overutilized(int cpu) 6735 6837 { 6736 6838 unsigned long rq_util_min, rq_util_max; ··· 6771 6875 if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) 6772 6876 set_rd_overutilized(rq->rd, 1); 6773 6877 } 6774 - #else 6775 - static inline void check_update_overutilized_status(struct rq *rq) { } 6776 - #endif 6777 6878 6778 6879 /* Runqueue only has SCHED_IDLE tasks enqueued */ 6779 6880 static int sched_idle_rq(struct rq *rq) ··· 6779 6886 rq->nr_running); 6780 6887 } 6781 
6888 6782 - #ifdef CONFIG_SMP 6783 6889 static int sched_idle_cpu(int cpu) 6784 6890 { 6785 6891 return sched_idle_rq(cpu_rq(cpu)); 6786 6892 } 6787 - #endif 6788 6893 6789 6894 static void 6790 6895 requeue_delayed_entity(struct sched_entity *se) ··· 6961 7070 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) 6962 7071 { 6963 7072 bool was_sched_idle = sched_idle_rq(rq); 6964 - int rq_h_nr_queued = rq->cfs.h_nr_queued; 6965 7073 bool task_sleep = flags & DEQUEUE_SLEEP; 6966 7074 bool task_delayed = flags & DEQUEUE_DELAYED; 6967 7075 struct task_struct *p = NULL; ··· 7044 7154 7045 7155 sub_nr_running(rq, h_nr_queued); 7046 7156 7047 - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) 7048 - dl_server_stop(&rq->fair_server); 7049 - 7050 7157 /* balance early to pull high priority tasks */ 7051 7158 if (unlikely(!was_sched_idle && sched_idle_rq(rq))) 7052 7159 rq->next_balance = jiffies; ··· 7092 7205 { 7093 7206 return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); 7094 7207 } 7095 - 7096 - #ifdef CONFIG_SMP 7097 7208 7098 7209 /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). 
*/ 7099 7210 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); ··· 7562 7677 return -1; 7563 7678 } 7564 7679 7565 - #else /* CONFIG_SCHED_SMT */ 7680 + #else /* !CONFIG_SCHED_SMT: */ 7566 7681 7567 7682 static inline void set_idle_cores(int cpu, int val) 7568 7683 { ··· 7583 7698 return -1; 7584 7699 } 7585 7700 7586 - #endif /* CONFIG_SCHED_SMT */ 7701 + #endif /* !CONFIG_SCHED_SMT */ 7587 7702 7588 7703 /* 7589 7704 * Scan the LLC domain for idle CPUs; this is dynamically regulated by ··· 8628 8743 8629 8744 return sched_balance_newidle(rq, rf) != 0; 8630 8745 } 8631 - #else 8632 - static inline void set_task_max_allowed_capacity(struct task_struct *p) {} 8633 - #endif /* CONFIG_SMP */ 8634 8746 8635 8747 static void set_next_buddy(struct sched_entity *se) 8636 8748 { ··· 8649 8767 struct sched_entity *se = &donor->se, *pse = &p->se; 8650 8768 struct cfs_rq *cfs_rq = task_cfs_rq(donor); 8651 8769 int cse_is_idle, pse_is_idle; 8770 + bool do_preempt_short = false; 8652 8771 8653 8772 if (unlikely(se == pse)) 8654 8773 return; ··· 8698 8815 * When non-idle entity preempt an idle entity, 8699 8816 * don't give idle entity slice protection. 8700 8817 */ 8701 - cancel_protect_slice(se); 8818 + do_preempt_short = true; 8702 8819 goto preempt; 8703 8820 } 8704 8821 ··· 8716 8833 /* 8717 8834 * If @p has a shorter slice than current and @p is eligible, override 8718 8835 * current's slice protection in order to allow preemption. 8719 - * 8720 - * Note that even if @p does not turn out to be the most eligible 8721 - * task at this moment, current's slice protection will be lost. 8722 8836 */ 8723 - if (do_preempt_short(cfs_rq, pse, se)) 8724 - cancel_protect_slice(se); 8837 + do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); 8725 8838 8726 8839 /* 8727 8840 * If @p has become the most eligible task, force preemption. 
8728 8841 */ 8729 - if (pick_eevdf(cfs_rq) == pse) 8842 + if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) 8730 8843 goto preempt; 8844 + 8845 + if (sched_feat(RUN_TO_PARITY) && do_preempt_short) 8846 + update_protect_slice(cfs_rq, se); 8731 8847 8732 8848 return; 8733 8849 8734 8850 preempt: 8851 + if (do_preempt_short) 8852 + cancel_protect_slice(se); 8853 + 8735 8854 resched_curr_lazy(rq); 8736 8855 } 8737 8856 ··· 8824 8939 return p; 8825 8940 8826 8941 simple: 8827 - #endif 8942 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 8828 8943 put_prev_set_next_task(rq, prev, p); 8829 8944 return p; 8830 8945 ··· 8940 9055 return true; 8941 9056 } 8942 9057 8943 - #ifdef CONFIG_SMP 8944 9058 /************************************************** 8945 9059 * Fair scheduling class load-balancing methods. 8946 9060 * ··· 9241 9357 return src_weight - dst_weight; 9242 9358 } 9243 9359 9244 - #else 9360 + #else /* !CONFIG_NUMA_BALANCING: */ 9245 9361 static inline long migrate_degrades_locality(struct task_struct *p, 9246 9362 struct lb_env *env) 9247 9363 { 9248 9364 return 0; 9249 9365 } 9250 - #endif 9366 + #endif /* !CONFIG_NUMA_BALANCING */ 9251 9367 9252 9368 /* 9253 9369 * Check whether the task is ineligible on the destination cpu ··· 9291 9407 * 2) throttled_lb_pair, or 9292 9408 * 3) cannot be migrated to this CPU due to cpus_ptr, or 9293 9409 * 4) running (obviously), or 9294 - * 5) are cache-hot on their current CPU. 9410 + * 5) are cache-hot on their current CPU, or 9411 + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) 9295 9412 */ 9296 9413 if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) 9297 9414 return 0; ··· 9312 9427 9313 9428 /* Disregard percpu kthreads; they are where they need to be. 
*/ 9314 9429 if (kthread_is_per_cpu(p)) 9430 + return 0; 9431 + 9432 + if (task_is_blocked(p)) 9315 9433 return 0; 9316 9434 9317 9435 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { ··· 9352 9464 /* Record that we found at least one task that could run on dst_cpu */ 9353 9465 env->flags &= ~LBF_ALL_PINNED; 9354 9466 9355 - if (task_on_cpu(env->src_rq, p)) { 9467 + if (task_on_cpu(env->src_rq, p) || 9468 + task_current_donor(env->src_rq, p)) { 9356 9469 schedstat_inc(p->stats.nr_failed_migrations_running); 9357 9470 return 0; 9358 9471 } ··· 9396 9507 schedstat_inc(env->sd->lb_hot_gained[env->idle]); 9397 9508 schedstat_inc(p->stats.nr_forced_migrations); 9398 9509 } 9510 + 9511 + WARN_ON(task_current(env->src_rq, p)); 9512 + WARN_ON(task_current_donor(env->src_rq, p)); 9399 9513 9400 9514 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 9401 9515 set_task_cpu(p, env->dst_cpu); ··· 9664 9772 if (!has_blocked) 9665 9773 rq->has_blocked_load = 0; 9666 9774 } 9667 - #else 9775 + #else /* !CONFIG_NO_HZ_COMMON: */ 9668 9776 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } 9669 9777 static inline bool others_have_blocked(struct rq *rq) { return false; } 9670 9778 static inline void update_blocked_load_tick(struct rq *rq) {} 9671 9779 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} 9672 - #endif 9780 + #endif /* !CONFIG_NO_HZ_COMMON */ 9673 9781 9674 9782 static bool __update_blocked_others(struct rq *rq, bool *done) 9675 9783 { ··· 9778 9886 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, 9779 9887 cfs_rq_load_avg(cfs_rq) + 1); 9780 9888 } 9781 - #else 9889 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 9782 9890 static bool __update_blocked_fair(struct rq *rq, bool *done) 9783 9891 { 9784 9892 struct cfs_rq *cfs_rq = &rq->cfs; ··· 9795 9903 { 9796 9904 return p->se.avg.load_avg; 9797 9905 } 9798 - #endif 9906 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 9799 9907 9800 9908 static void 
sched_balance_update_blocked_averages(int cpu) 9801 9909 { ··· 9940 10048 min_capacity = ULONG_MAX; 9941 10049 max_capacity = 0; 9942 10050 9943 - if (child->flags & SD_OVERLAP) { 10051 + if (child->flags & SD_NUMA) { 9944 10052 /* 9945 - * SD_OVERLAP domains cannot assume that child groups 10053 + * SD_NUMA domains cannot assume that child groups 9946 10054 * span the current group. 9947 10055 */ 9948 10056 ··· 9955 10063 } 9956 10064 } else { 9957 10065 /* 9958 - * !SD_OVERLAP domains can assume that child groups 10066 + * !SD_NUMA domains can assume that child groups 9959 10067 * span the current group. 9960 10068 */ 9961 10069 ··· 10508 10616 return remote; 10509 10617 return all; 10510 10618 } 10511 - #else 10619 + #else /* !CONFIG_NUMA_BALANCING: */ 10512 10620 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) 10513 10621 { 10514 10622 return all; ··· 10518 10626 { 10519 10627 return regular; 10520 10628 } 10521 - #endif /* CONFIG_NUMA_BALANCING */ 10629 + #endif /* !CONFIG_NUMA_BALANCING */ 10522 10630 10523 10631 10524 10632 struct sg_lb_stats; ··· 12066 12174 /* 12067 12175 * Track max cost of a domain to make sure to not delay the 12068 12176 * next wakeup on the CPU. 12177 + * 12178 + * sched_balance_newidle() bumps the cost whenever newidle 12179 + * balance fails, and we don't want things to grow out of 12180 + * control. Use the sysctl_sched_migration_cost as the upper 12181 + * limit, plus a little extra to avoid off by ones. 
12069 12182 */ 12070 - sd->max_newidle_lb_cost = cost; 12183 + sd->max_newidle_lb_cost = 12184 + min(cost, sysctl_sched_migration_cost + 200); 12071 12185 sd->last_decay_max_lb_cost = jiffies; 12072 12186 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { 12073 12187 /* ··· 12670 12772 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); 12671 12773 } 12672 12774 12673 - #else /* !CONFIG_NO_HZ_COMMON */ 12775 + #else /* !CONFIG_NO_HZ_COMMON: */ 12674 12776 static inline void nohz_balancer_kick(struct rq *rq) { } 12675 12777 12676 12778 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) ··· 12679 12781 } 12680 12782 12681 12783 static inline void nohz_newidle_balance(struct rq *this_rq) { } 12682 - #endif /* CONFIG_NO_HZ_COMMON */ 12784 + #endif /* !CONFIG_NO_HZ_COMMON */ 12683 12785 12684 12786 /* 12685 12787 * sched_balance_newidle is called by schedule() if this_cpu is about to become ··· 12765 12867 12766 12868 t1 = sched_clock_cpu(this_cpu); 12767 12869 domain_cost = t1 - t0; 12768 - update_newidle_cost(sd, domain_cost); 12769 - 12770 12870 curr_cost += domain_cost; 12771 12871 t0 = t1; 12872 + 12873 + /* 12874 + * Failing newidle means it is not effective; 12875 + * bump the cost so we end up doing less of it. 
12876 + */ 12877 + if (!pulled_task) 12878 + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; 12879 + 12880 + update_newidle_cost(sd, domain_cost); 12772 12881 } 12773 12882 12774 12883 /* ··· 12883 12978 clear_tg_offline_cfs_rqs(rq); 12884 12979 } 12885 12980 12886 - #endif /* CONFIG_SMP */ 12887 - 12888 12981 #ifdef CONFIG_SCHED_CORE 12889 12982 static inline bool 12890 12983 __entity_slice_used(struct sched_entity *se, int min_nr_tasks) ··· 12979 13076 12980 13077 cfs_rqa = sea->cfs_rq; 12981 13078 cfs_rqb = seb->cfs_rq; 12982 - #else 13079 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 12983 13080 cfs_rqa = &task_rq(a)->cfs; 12984 13081 cfs_rqb = &task_rq(b)->cfs; 12985 - #endif 13082 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 12986 13083 12987 13084 /* 12988 13085 * Find delta after normalizing se's vruntime with its cfs_rq's ··· 13006 13103 #endif 13007 13104 return throttled_hierarchy(cfs_rq); 13008 13105 } 13009 - #else 13106 + #else /* !CONFIG_SCHED_CORE: */ 13010 13107 static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} 13011 - #endif 13108 + #endif /* !CONFIG_SCHED_CORE */ 13012 13109 13013 13110 /* 13014 13111 * scheduler tick hitting a task of our scheduling class. ··· 13102 13199 list_add_leaf_cfs_rq(cfs_rq); 13103 13200 } 13104 13201 } 13105 - #else 13202 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 13106 13203 static void propagate_entity_cfs_rq(struct sched_entity *se) { } 13107 - #endif 13204 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 13108 13205 13109 13206 static void detach_entity_cfs_rq(struct sched_entity *se) 13110 13207 { 13111 13208 struct cfs_rq *cfs_rq = cfs_rq_of(se); 13112 13209 13113 - #ifdef CONFIG_SMP 13114 13210 /* 13115 13211 * In case the task sched_avg hasn't been attached: 13116 13212 * - A forked task which hasn't been woken up by wake_up_new_task(). 
··· 13118 13216 */ 13119 13217 if (!se->avg.last_update_time) 13120 13218 return; 13121 - #endif 13122 13219 13123 13220 /* Catch up with the cfs_rq and remove our load when we leave */ 13124 13221 update_load_avg(cfs_rq, se, 0); ··· 13181 13280 { 13182 13281 struct sched_entity *se = &p->se; 13183 13282 13184 - #ifdef CONFIG_SMP 13185 13283 if (task_on_rq_queued(p)) { 13186 13284 /* 13187 13285 * Move the next running task to the front of the list, so our ··· 13188 13288 */ 13189 13289 list_move(&se->group_node, &rq->cfs_tasks); 13190 13290 } 13191 - #endif 13192 13291 if (!first) 13193 13292 return; 13194 13293 ··· 13225 13326 { 13226 13327 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 13227 13328 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 13228 - #ifdef CONFIG_SMP 13229 13329 raw_spin_lock_init(&cfs_rq->removed.lock); 13230 - #endif 13231 13330 } 13232 13331 13233 13332 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 13240 13343 13241 13344 detach_task_cfs_rq(p); 13242 13345 13243 - #ifdef CONFIG_SMP 13244 13346 /* Tell se's cfs_rq has been changed -- migrated */ 13245 13347 p->se.avg.last_update_time = 0; 13246 - #endif 13247 13348 set_task_rq(p, task_cpu(p)); 13248 13349 attach_task_cfs_rq(p); 13249 13350 } ··· 13537 13642 .put_prev_task = put_prev_task_fair, 13538 13643 .set_next_task = set_next_task_fair, 13539 13644 13540 - #ifdef CONFIG_SMP 13541 13645 .balance = balance_fair, 13542 13646 .select_task_rq = select_task_rq_fair, 13543 13647 .migrate_task_rq = migrate_task_rq_fair, ··· 13546 13652 13547 13653 .task_dead = task_dead_fair, 13548 13654 .set_cpus_allowed = set_cpus_allowed_fair, 13549 - #endif 13550 13655 13551 13656 .task_tick = task_tick_fair, 13552 13657 .task_fork = task_fork_fair, ··· 13608 13715 13609 13716 __init void init_sched_fair_class(void) 13610 13717 { 13611 - #ifdef CONFIG_SMP 13612 13718 int i; 13613 13719 13614 13720 for_each_possible_cpu(i) { ··· 13629 13737 nohz.next_blocked = jiffies; 13630 13738 zalloc_cpumask_var(&nohz.idle_cpus_mask, 
GFP_NOWAIT); 13631 13739 #endif 13632 - #endif /* SMP */ 13633 - 13634 13740 }

+8 -7

kernel/sched/idle.c

··· 6 6 * (NOTE: these are not related to SCHED_IDLE batch scheduled 7 7 * tasks which are handled in sched/fair.c ) 8 8 */ 9 + #include <linux/cpuidle.h> 10 + #include <linux/suspend.h> 11 + #include <linux/livepatch.h> 12 + #include "sched.h" 13 + #include "smp.h" 9 14 10 15 /* Linker adds these: start and end of __cpuidle functions */ 11 16 extern char __cpuidle_text_start[], __cpuidle_text_end[]; ··· 52 47 return 1; 53 48 } 54 49 __setup("hlt", cpu_idle_nopoll_setup); 55 - #endif 50 + #endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ 56 51 57 52 static noinline int __cpuidle cpu_idle_poll(void) 58 53 { ··· 100 95 if (static_branch_unlikely(&arch_needs_tick_broadcast)) 101 96 tick_broadcast_exit(); 102 97 } 103 - #else 98 + #else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ 104 99 static inline void cond_tick_broadcast_enter(void) { } 105 100 static inline void cond_tick_broadcast_exit(void) { } 106 - #endif 101 + #endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ 107 102 108 103 /** 109 104 * default_idle_call - Default CPU idle routine. ··· 432 427 * idle-task scheduling class. 433 428 */ 434 429 435 - #ifdef CONFIG_SMP 436 430 static int 437 431 select_task_rq_idle(struct task_struct *p, int cpu, int flags) 438 432 { ··· 443 439 { 444 440 return WARN_ON_ONCE(1); 445 441 } 446 - #endif 447 442 448 443 /* 449 444 * Idle tasks are unconditionally rescheduled: ··· 529 526 .put_prev_task = put_prev_task_idle, 530 527 .set_next_task = set_next_task_idle, 531 528 532 - #ifdef CONFIG_SMP 533 529 .balance = balance_idle, 534 530 .select_task_rq = select_task_rq_idle, 535 531 .set_cpus_allowed = set_cpus_allowed_common, 536 - #endif 537 532 538 533 .task_tick = task_tick_idle, 539 534

+2

kernel/sched/isolation.c

··· 7 7 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker 8 8 * 9 9 */ 10 + #include <linux/sched/isolation.h> 11 + #include "sched.h" 10 12 11 13 enum hk_flags { 12 14 HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),

+4 -2

kernel/sched/loadavg.c

··· 6 6 * figure. It's a silly number but people think it's important. We go through 7 7 * great pains to make it work on big machines and tickless kernels. 8 8 */ 9 + #include <linux/sched/nohz.h> 10 + #include "sched.h" 9 11 10 12 /* 11 13 * Global load-average calculations ··· 335 333 smp_wmb(); 336 334 calc_load_idx++; 337 335 } 338 - #else /* !CONFIG_NO_HZ_COMMON */ 336 + #else /* !CONFIG_NO_HZ_COMMON: */ 339 337 340 338 static inline long calc_load_nohz_read(void) { return 0; } 341 339 static inline void calc_global_nohz(void) { } 342 340 343 - #endif /* CONFIG_NO_HZ_COMMON */ 341 + #endif /* !CONFIG_NO_HZ_COMMON */ 344 342 345 343 /* 346 344 * calc_load - update the avenrun load estimates 10 ticks after the

+2

kernel/sched/membarrier.c

··· 4 4 * 5 5 * membarrier system call 6 6 */ 7 + #include <uapi/linux/membarrier.h> 8 + #include "sched.h" 7 9 8 10 /* 9 11 * For documentation purposes, here are some membarrier ordering

+3 -2

kernel/sched/pelt.c

··· 23 23 * Move PELT related code from fair.c into this pelt.c file 24 24 * Author: Vincent Guittot <vincent.guittot@linaro.org> 25 25 */ 26 + #include "pelt.h" 26 27 27 28 /* 28 29 * Approximate: ··· 414 413 415 414 return 0; 416 415 } 417 - #endif 416 + #endif /* CONFIG_SCHED_HW_PRESSURE */ 418 417 419 418 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 420 419 /* ··· 467 466 468 467 return ret; 469 468 } 470 - #endif 469 + #endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ 471 470 472 471 /* 473 472 * Load avg and utilization metrics need to be updated periodically and before

+10 -57

kernel/sched/pelt.h

··· 1 - #ifdef CONFIG_SMP 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef _KERNEL_SCHED_PELT_H 3 + #define _KERNEL_SCHED_PELT_H 4 + #include "sched.h" 5 + 2 6 #include "sched-pelt.h" 3 7 4 8 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); ··· 19 15 { 20 16 return READ_ONCE(rq->avg_hw.load_avg); 21 17 } 22 - #else 18 + #else /* !CONFIG_SCHED_HW_PRESSURE: */ 23 19 static inline int 24 20 update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) 25 21 { ··· 30 26 { 31 27 return 0; 32 28 } 33 - #endif 29 + #endif /* !CONFIG_SCHED_HW_PRESSURE */ 34 30 35 31 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 36 32 int update_irq_load_avg(struct rq *rq, u64 running); ··· 178 174 179 175 return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; 180 176 } 181 - #else 177 + #else /* !CONFIG_CFS_BANDWIDTH: */ 182 178 static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } 183 179 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) 184 180 { 185 181 return rq_clock_pelt(rq_of(cfs_rq)); 186 182 } 187 - #endif 183 + #endif /* !CONFIG_CFS_BANDWIDTH */ 188 184 189 - #else 190 - 191 - static inline int 192 - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 193 - { 194 - return 0; 195 - } 196 - 197 - static inline int 198 - update_rt_rq_load_avg(u64 now, struct rq *rq, int running) 199 - { 200 - return 0; 201 - } 202 - 203 - static inline int 204 - update_dl_rq_load_avg(u64 now, struct rq *rq, int running) 205 - { 206 - return 0; 207 - } 208 - 209 - static inline int 210 - update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) 211 - { 212 - return 0; 213 - } 214 - 215 - static inline u64 hw_load_avg(struct rq *rq) 216 - { 217 - return 0; 218 - } 219 - 220 - static inline int 221 - update_irq_load_avg(struct rq *rq, u64 running) 222 - { 223 - return 0; 224 - } 225 - 226 - static inline u64 rq_clock_pelt(struct rq *rq) 227 - { 228 - return rq_clock_task(rq); 229 - } 230 - 231 - static inline void 232 - update_rq_clock_pelt(struct 
rq *rq, s64 delta) { } 233 - 234 - static inline void 235 - update_idle_rq_clock_pelt(struct rq *rq) { } 236 - 237 - static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } 238 - #endif 239 - 240 - 185 + #endif /* _KERNEL_SCHED_PELT_H */

+72 -57

kernel/sched/psi.c

··· 136 136 * cost-wise, yet way more sensitive and accurate than periodic 137 137 * sampling of the aggregate task states would be. 138 138 */ 139 + #include <linux/sched/clock.h> 140 + #include <linux/workqueue.h> 141 + #include <linux/psi.h> 142 + #include "sched.h" 139 143 140 144 static int psi_bug __read_mostly; 141 145 ··· 176 172 .pcpu = &system_group_pcpu, 177 173 }; 178 174 175 + static DEFINE_PER_CPU(seqcount_t, psi_seq); 176 + 177 + static inline void psi_write_begin(int cpu) 178 + { 179 + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); 180 + } 181 + 182 + static inline void psi_write_end(int cpu) 183 + { 184 + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); 185 + } 186 + 187 + static inline u32 psi_read_begin(int cpu) 188 + { 189 + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); 190 + } 191 + 192 + static inline bool psi_read_retry(int cpu, u32 seq) 193 + { 194 + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); 195 + } 196 + 179 197 static void psi_avgs_work(struct work_struct *work); 180 198 181 199 static void poll_timer_fn(struct timer_list *t); ··· 208 182 209 183 group->enabled = true; 210 184 for_each_possible_cpu(cpu) 211 - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); 185 + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); 212 186 group->avg_last_update = sched_clock(); 213 187 group->avg_next_update = group->avg_last_update + psi_period; 214 188 mutex_init(&group->avgs_lock); ··· 288 262 289 263 /* Snapshot a coherent view of the CPU state */ 290 264 do { 291 - seq = read_seqcount_begin(&groupc->seq); 265 + seq = psi_read_begin(cpu); 292 266 now = cpu_clock(cpu); 293 267 memcpy(times, groupc->times, sizeof(groupc->times)); 294 268 state_mask = groupc->state_mask; 295 269 state_start = groupc->state_start; 296 270 if (cpu == current_cpu) 297 271 memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); 298 - } while (read_seqcount_retry(&groupc->seq, seq)); 272 + } while (psi_read_retry(cpu, seq)); 299 273 300 274 /* Calculate 
state time deltas against the previous snapshot */ 301 275 for (s = 0; s < NR_PSI_STATES; s++) { ··· 794 768 groupc->times[PSI_NONIDLE] += delta; 795 769 } 796 770 771 + #define for_each_group(iter, group) \ 772 + for (typeof(group) iter = group; iter; iter = iter->parent) 773 + 797 774 static void psi_group_change(struct psi_group *group, int cpu, 798 775 unsigned int clear, unsigned int set, 799 - bool wake_clock) 776 + u64 now, bool wake_clock) 800 777 { 801 778 struct psi_group_cpu *groupc; 802 779 unsigned int t, m; 803 780 u32 state_mask; 804 - u64 now; 805 781 806 782 lockdep_assert_rq_held(cpu_rq(cpu)); 807 783 groupc = per_cpu_ptr(group->pcpu, cpu); 808 - 809 - /* 810 - * First we update the task counts according to the state 811 - * change requested through the @clear and @set bits. 812 - * 813 - * Then if the cgroup PSI stats accounting enabled, we 814 - * assess the aggregate resource states this CPU's tasks 815 - * have been in since the last change, and account any 816 - * SOME and FULL time these may have resulted in. 
817 - */ 818 - write_seqcount_begin(&groupc->seq); 819 - now = cpu_clock(cpu); 820 784 821 785 /* 822 786 * Start with TSK_ONCPU, which doesn't have a corresponding ··· 859 843 860 844 groupc->state_mask = state_mask; 861 845 862 - write_seqcount_end(&groupc->seq); 863 846 return; 864 847 } 865 848 ··· 878 863 record_times(groupc, now); 879 864 880 865 groupc->state_mask = state_mask; 881 - 882 - write_seqcount_end(&groupc->seq); 883 866 884 867 if (state_mask & group->rtpoll_states) 885 868 psi_schedule_rtpoll_work(group, 1, false); ··· 913 900 void psi_task_change(struct task_struct *task, int clear, int set) 914 901 { 915 902 int cpu = task_cpu(task); 916 - struct psi_group *group; 903 + u64 now; 917 904 918 905 if (!task->pid) 919 906 return; 920 907 921 908 psi_flags_change(task, clear, set); 922 909 923 - group = task_psi_group(task); 924 - do { 925 - psi_group_change(group, cpu, clear, set, true); 926 - } while ((group = group->parent)); 910 + psi_write_begin(cpu); 911 + now = cpu_clock(cpu); 912 + for_each_group(group, task_psi_group(task)) 913 + psi_group_change(group, cpu, clear, set, now, true); 914 + psi_write_end(cpu); 927 915 } 928 916 929 917 void psi_task_switch(struct task_struct *prev, struct task_struct *next, 930 918 bool sleep) 931 919 { 932 - struct psi_group *group, *common = NULL; 920 + struct psi_group *common = NULL; 933 921 int cpu = task_cpu(prev); 922 + u64 now; 923 + 924 + psi_write_begin(cpu); 925 + now = cpu_clock(cpu); 934 926 935 927 if (next->pid) { 936 928 psi_flags_change(next, 0, TSK_ONCPU); ··· 944 926 * ancestors with @prev, those will already have @prev's 945 927 * TSK_ONCPU bit set, and we can stop the iteration there. 
946 928 */ 947 - group = task_psi_group(next); 948 - do { 949 - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & 950 - PSI_ONCPU) { 929 + for_each_group(group, task_psi_group(next)) { 930 + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); 931 + 932 + if (groupc->state_mask & PSI_ONCPU) { 951 933 common = group; 952 934 break; 953 935 } 954 - 955 - psi_group_change(group, cpu, 0, TSK_ONCPU, true); 956 - } while ((group = group->parent)); 936 + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); 937 + } 957 938 } 958 939 959 940 if (prev->pid) { ··· 985 968 986 969 psi_flags_change(prev, clear, set); 987 970 988 - group = task_psi_group(prev); 989 - do { 971 + for_each_group(group, task_psi_group(prev)) { 990 972 if (group == common) 991 973 break; 992 - psi_group_change(group, cpu, clear, set, wake_clock); 993 - } while ((group = group->parent)); 974 + psi_group_change(group, cpu, clear, set, now, wake_clock); 975 + } 994 976 995 977 /* 996 978 * TSK_ONCPU is handled up to the common ancestor. 
If there are ··· 999 983 */ 1000 984 if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { 1001 985 clear &= ~TSK_ONCPU; 1002 - for (; group; group = group->parent) 1003 - psi_group_change(group, cpu, clear, set, wake_clock); 986 + for_each_group(group, common) 987 + psi_group_change(group, cpu, clear, set, now, wake_clock); 1004 988 } 1005 989 } 990 + psi_write_end(cpu); 1006 991 } 1007 992 1008 993 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1009 994 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) 1010 995 { 1011 996 int cpu = task_cpu(curr); 1012 - struct psi_group *group; 1013 997 struct psi_group_cpu *groupc; 1014 998 s64 delta; 1015 999 u64 irq; 1000 + u64 now; 1016 1001 1017 1002 if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) 1018 1003 return; ··· 1022 1005 return; 1023 1006 1024 1007 lockdep_assert_rq_held(rq); 1025 - group = task_psi_group(curr); 1026 - if (prev && task_psi_group(prev) == group) 1008 + if (prev && task_psi_group(prev) == task_psi_group(curr)) 1027 1009 return; 1028 1010 1029 1011 irq = irq_time_read(cpu); ··· 1031 1015 return; 1032 1016 rq->psi_irq_time = irq; 1033 1017 1034 - do { 1035 - u64 now; 1018 + psi_write_begin(cpu); 1019 + now = cpu_clock(cpu); 1036 1020 1021 + for_each_group(group, task_psi_group(curr)) { 1037 1022 if (!group->enabled) 1038 1023 continue; 1039 1024 1040 1025 groupc = per_cpu_ptr(group->pcpu, cpu); 1041 1026 1042 - write_seqcount_begin(&groupc->seq); 1043 - now = cpu_clock(cpu); 1044 - 1045 1027 record_times(groupc, now); 1046 1028 groupc->times[PSI_IRQ_FULL] += delta; 1047 1029 1048 - write_seqcount_end(&groupc->seq); 1049 - 1050 1030 if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) 1051 1031 psi_schedule_rtpoll_work(group, 1, false); 1052 - } while ((group = group->parent)); 1032 + } 1033 + psi_write_end(cpu); 1053 1034 } 1054 - #endif 1035 + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1055 1036 1056 1037 /** 1057 1038 * psi_memstall_enter - mark the beginning of a 
memory stall section ··· 1234 1221 return; 1235 1222 1236 1223 for_each_possible_cpu(cpu) { 1237 - struct rq *rq = cpu_rq(cpu); 1238 - struct rq_flags rf; 1224 + u64 now; 1239 1225 1240 - rq_lock_irq(rq, &rf); 1241 - psi_group_change(group, cpu, 0, 0, true); 1242 - rq_unlock_irq(rq, &rf); 1226 + guard(rq_lock_irq)(cpu_rq(cpu)); 1227 + 1228 + psi_write_begin(cpu); 1229 + now = cpu_clock(cpu); 1230 + psi_group_change(group, cpu, 0, 0, now, true); 1231 + psi_write_end(cpu); 1243 1232 } 1244 1233 } 1245 1234 #endif /* CONFIG_CGROUPS */ ··· 1666 1651 .proc_poll = psi_fop_poll, 1667 1652 .proc_release = psi_fop_release, 1668 1653 }; 1669 - #endif 1654 + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1670 1655 1671 1656 static int __init psi_proc_init(void) 1672 1657 {

+29 -83

kernel/sched/rt.c

··· 4 4 * policies) 5 5 */ 6 6 7 + #include "sched.h" 8 + #include "pelt.h" 9 + 7 10 int sched_rr_timeslice = RR_TIMESLICE; 8 11 /* More than 4 hours if BW_SHIFT equals 20. */ 9 12 static const u64 max_rt_runtime = MAX_BW; ··· 63 60 return 0; 64 61 } 65 62 late_initcall(sched_rt_sysctl_init); 66 - #endif 63 + #endif /* CONFIG_SYSCTL */ 67 64 68 65 void init_rt_rq(struct rt_rq *rt_rq) 69 66 { ··· 78 75 /* delimiter for bitsearch: */ 79 76 __set_bit(MAX_RT_PRIO, array->bitmap); 80 77 81 - #if defined CONFIG_SMP 82 78 rt_rq->highest_prio.curr = MAX_RT_PRIO-1; 83 79 rt_rq->highest_prio.next = MAX_RT_PRIO-1; 84 80 rt_rq->overloaded = 0; 85 81 plist_head_init(&rt_rq->pushable_tasks); 86 - #endif /* CONFIG_SMP */ 87 82 /* We start is dequeued state, because no RT tasks are queued */ 88 83 rt_rq->rt_queued = 0; 89 84 ··· 292 291 return 0; 293 292 } 294 293 295 - #else /* CONFIG_RT_GROUP_SCHED */ 294 + #else /* !CONFIG_RT_GROUP_SCHED: */ 296 295 297 296 #define rt_entity_is_task(rt_se) (1) 298 297 ··· 328 327 { 329 328 return 1; 330 329 } 331 - #endif /* CONFIG_RT_GROUP_SCHED */ 332 - 333 - #ifdef CONFIG_SMP 330 + #endif /* !CONFIG_RT_GROUP_SCHED */ 334 331 335 332 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) 336 333 { ··· 429 430 } 430 431 } 431 432 432 - #else 433 - 434 - static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 435 - { 436 - } 437 - 438 - static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 439 - { 440 - } 441 - 442 - static inline void rt_queue_push_tasks(struct rq *rq) 443 - { 444 - } 445 - #endif /* CONFIG_SMP */ 446 - 447 433 static void enqueue_top_rt_rq(struct rt_rq *rt_rq); 448 434 static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); 449 435 ··· 469 485 470 486 return cpu_cap >= min(min_cap, max_cap); 471 487 } 472 - #else 488 + #else /* !CONFIG_UCLAMP_TASK: */ 473 489 static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) 474 490 { 
475 491 return true; 476 492 } 477 - #endif 493 + #endif /* !CONFIG_UCLAMP_TASK */ 478 494 479 495 #ifdef CONFIG_RT_GROUP_SCHED 480 496 ··· 578 594 return p->prio != p->normal_prio; 579 595 } 580 596 581 - #ifdef CONFIG_SMP 582 597 static inline const struct cpumask *sched_rt_period_mask(void) 583 598 { 584 599 return this_rq()->rd->span; 585 600 } 586 - #else 587 - static inline const struct cpumask *sched_rt_period_mask(void) 588 - { 589 - return cpu_online_mask; 590 - } 591 - #endif 592 601 593 602 static inline 594 603 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) ··· 602 625 rt_rq->rt_time < rt_b->rt_runtime); 603 626 } 604 627 605 - #ifdef CONFIG_SMP 606 628 /* 607 629 * We ran out of runtime, see if we can borrow some from our neighbours. 608 630 */ ··· 774 798 raw_spin_lock(&rt_rq->rt_runtime_lock); 775 799 } 776 800 } 777 - #else /* !CONFIG_SMP */ 778 - static inline void balance_runtime(struct rt_rq *rt_rq) {} 779 - #endif /* CONFIG_SMP */ 780 801 781 802 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 782 803 { ··· 903 930 return 0; 904 931 } 905 932 906 - #else /* !CONFIG_RT_GROUP_SCHED */ 933 + #else /* !CONFIG_RT_GROUP_SCHED: */ 907 934 908 935 typedef struct rt_rq *rt_rq_iter_t; 909 936 ··· 950 977 return &cpu_rq(cpu)->rt; 951 978 } 952 979 953 - #ifdef CONFIG_SMP 954 980 static void __enable_runtime(struct rq *rq) { } 955 981 static void __disable_runtime(struct rq *rq) { } 956 - #endif 957 982 958 - #endif /* CONFIG_RT_GROUP_SCHED */ 983 + #endif /* !CONFIG_RT_GROUP_SCHED */ 959 984 960 985 static inline int rt_se_prio(struct sched_rt_entity *rt_se) 961 986 { ··· 1004 1033 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); 1005 1034 } 1006 1035 } 1007 - #endif 1036 + #endif /* CONFIG_RT_GROUP_SCHED */ 1008 1037 } 1009 1038 1010 1039 static void ··· 1046 1075 cpufreq_update_util(rq, 0); 1047 1076 } 1048 1077 1049 - #if defined CONFIG_SMP 1050 - 1051 1078 static void 1052 1079 
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 1053 1080 { ··· 1076 1107 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 1077 1108 } 1078 1109 1079 - #else /* CONFIG_SMP */ 1080 - 1081 - static inline 1082 - void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1083 - static inline 1084 - void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} 1085 - 1086 - #endif /* CONFIG_SMP */ 1087 - 1088 - #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 1089 1110 static void 1090 1111 inc_rt_prio(struct rt_rq *rt_rq, int prio) 1091 1112 { ··· 1114 1155 dec_rt_prio_smp(rt_rq, prio, prev_prio); 1115 1156 } 1116 1157 1117 - #else 1118 - 1119 - static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} 1120 - static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} 1121 - 1122 - #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ 1123 - 1124 1158 #ifdef CONFIG_RT_GROUP_SCHED 1125 1159 1126 1160 static void ··· 1134 1182 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 1135 1183 } 1136 1184 1137 - #else /* CONFIG_RT_GROUP_SCHED */ 1185 + #else /* !CONFIG_RT_GROUP_SCHED: */ 1138 1186 1139 1187 static void 1140 1188 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ··· 1144 1192 static inline 1145 1193 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} 1146 1194 1147 - #endif /* CONFIG_RT_GROUP_SCHED */ 1195 + #endif /* !CONFIG_RT_GROUP_SCHED */ 1148 1196 1149 1197 static inline 1150 1198 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) ··· 1440 1488 1441 1489 enqueue_rt_entity(rt_se, flags); 1442 1490 1491 + if (task_is_blocked(p)) 1492 + return; 1493 + 1443 1494 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1444 1495 enqueue_pushable_task(rq, p); 1445 1496 } ··· 1493 1538 requeue_task_rt(rq, rq->curr, 0); 1494 1539 } 1495 1540 1496 - #ifdef CONFIG_SMP 1497 1541 static int find_lowest_rq(struct task_struct *task); 1498 1542 1499 1543 static 
int ··· 1607 1653 1608 1654 return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); 1609 1655 } 1610 - #endif /* CONFIG_SMP */ 1611 1656 1612 1657 /* 1613 1658 * Preempt the current task with a newly woken task if needed: ··· 1620 1667 return; 1621 1668 } 1622 1669 1623 - #ifdef CONFIG_SMP 1624 1670 /* 1625 1671 * If: 1626 1672 * ··· 1634 1682 */ 1635 1683 if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) 1636 1684 check_preempt_equal_prio(rq, p); 1637 - #endif 1638 1685 } 1639 1686 1640 1687 static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) ··· 1719 1768 1720 1769 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 1721 1770 1771 + if (task_is_blocked(p)) 1772 + return; 1722 1773 /* 1723 1774 * The previous task needs to be made eligible for pushing 1724 1775 * if it is still active ··· 1728 1775 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) 1729 1776 enqueue_pushable_task(rq, p); 1730 1777 } 1731 - 1732 - #ifdef CONFIG_SMP 1733 1778 1734 1779 /* Only try algorithms three times */ 1735 1780 #define RT_MAX_TRIES 3 ··· 2402 2451 GFP_KERNEL, cpu_to_node(i)); 2403 2452 } 2404 2453 } 2405 - #endif /* CONFIG_SMP */ 2406 2454 2407 2455 /* 2408 2456 * When switching a task to RT, we may overload the runqueue ··· 2425 2475 * then see if we can move to another run queue. 2426 2476 */ 2427 2477 if (task_on_rq_queued(p)) { 2428 - #ifdef CONFIG_SMP 2429 2478 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2430 2479 rt_queue_push_tasks(rq); 2431 - #endif /* CONFIG_SMP */ 2432 2480 if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) 2433 2481 resched_curr(rq); 2434 2482 } ··· 2443 2495 return; 2444 2496 2445 2497 if (task_current_donor(rq, p)) { 2446 - #ifdef CONFIG_SMP 2447 2498 /* 2448 2499 * If our priority decreases while running, we 2449 2500 * may need to pull tasks to this runqueue. 
··· 2456 2509 */ 2457 2510 if (p->prio > rq->rt.highest_prio.curr) 2458 2511 resched_curr(rq); 2459 - #else 2460 - /* For UP simply resched on drop of prio */ 2461 - if (oldprio < p->prio) 2462 - resched_curr(rq); 2463 - #endif /* CONFIG_SMP */ 2464 2512 } else { 2465 2513 /* 2466 2514 * This task is not running, but if it is ··· 2491 2549 } 2492 2550 } 2493 2551 } 2494 - #else 2552 + #else /* !CONFIG_POSIX_TIMERS: */ 2495 2553 static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2496 - #endif 2554 + #endif /* !CONFIG_POSIX_TIMERS */ 2497 2555 2498 2556 /* 2499 2557 * scheduler tick hitting a task of our scheduling class. ··· 2562 2620 2563 2621 return rt_rq_throttled(rt_rq); 2564 2622 } 2565 - #endif 2623 + #endif /* CONFIG_SCHED_CORE */ 2566 2624 2567 2625 DEFINE_SCHED_CLASS(rt) = { 2568 2626 ··· 2576 2634 .put_prev_task = put_prev_task_rt, 2577 2635 .set_next_task = set_next_task_rt, 2578 2636 2579 - #ifdef CONFIG_SMP 2580 2637 .balance = balance_rt, 2581 2638 .select_task_rq = select_task_rq_rt, 2582 2639 .set_cpus_allowed = set_cpus_allowed_common, ··· 2584 2643 .task_woken = task_woken_rt, 2585 2644 .switched_from = switched_from_rt, 2586 2645 .find_lock_rq = find_lock_lowest_rq, 2587 - #endif 2588 2646 2589 2647 .task_tick = task_tick_rt, 2590 2648 ··· 2827 2887 return 1; 2828 2888 } 2829 2889 2830 - #else /* !CONFIG_RT_GROUP_SCHED */ 2890 + #else /* !CONFIG_RT_GROUP_SCHED: */ 2831 2891 2832 2892 #ifdef CONFIG_SYSCTL 2833 2893 static int sched_rt_global_constraints(void) ··· 2835 2895 return 0; 2836 2896 } 2837 2897 #endif /* CONFIG_SYSCTL */ 2838 - #endif /* CONFIG_RT_GROUP_SCHED */ 2898 + #endif /* !CONFIG_RT_GROUP_SCHED */ 2839 2899 2840 2900 #ifdef CONFIG_SYSCTL 2841 2901 static int sched_rt_global_validate(void) ··· 2890 2950 } 2891 2951 sched_domains_mutex_unlock(); 2892 2952 mutex_unlock(&mutex); 2953 + 2954 + /* 2955 + * After changing maximum available bandwidth for DEADLINE, we need to 2956 + * recompute per root domain and per 
cpus variables accordingly. 2957 + */ 2958 + rebuild_sched_domains(); 2893 2959 2894 2960 return ret; 2895 2961 }

+1

kernel/sched/sched-pelt.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 3 + #include <linux/types.h> 3 4 4 5 static const u32 runnable_avg_yN_inv[] __maybe_unused = { 5 6 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,

+68 -175

kernel/sched/sched.h

··· 69 69 #include <linux/wait_bit.h> 70 70 #include <linux/workqueue_api.h> 71 71 #include <linux/delayacct.h> 72 + #include <linux/mmu_context.h> 72 73 73 74 #include <trace/events/power.h> 74 75 #include <trace/events/sched.h> ··· 385 384 extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, 386 385 dl_server_has_tasks_f has_tasks, 387 386 dl_server_pick_f pick_task); 387 + extern void sched_init_dl_servers(void); 388 388 389 389 extern void dl_server_update_idle_time(struct rq *rq, 390 390 struct task_struct *p); ··· 402 400 #ifdef CONFIG_CGROUP_SCHED 403 401 404 402 extern struct list_head task_groups; 403 + 404 + #ifdef CONFIG_CFS_BANDWIDTH 405 + extern const u64 max_bw_quota_period_us; 406 + 407 + /* 408 + * default period for group bandwidth. 409 + * default: 0.1s, units: microseconds 410 + */ 411 + static inline u64 default_bw_period_us(void) 412 + { 413 + return 100000ULL; 414 + } 415 + #endif /* CONFIG_CFS_BANDWIDTH */ 405 416 406 417 struct cfs_bandwidth { 407 418 #ifdef CONFIG_CFS_BANDWIDTH ··· 439 424 int nr_burst; 440 425 u64 throttled_time; 441 426 u64 burst_time; 442 - #endif 427 + #endif /* CONFIG_CFS_BANDWIDTH */ 443 428 }; 444 429 445 430 /* Task group related information */ ··· 457 442 /* runqueue "owned" by this group on each CPU */ 458 443 struct cfs_rq **cfs_rq; 459 444 unsigned long shares; 460 - #ifdef CONFIG_SMP 461 445 /* 462 446 * load_avg can be heavily contended at clock tick time, so put 463 447 * it in its own cache-line separated from the fields above which 464 448 * will also be accessed at each tick. 
465 449 */ 466 450 atomic_long_t load_avg ____cacheline_aligned; 467 - #endif 468 - #endif 451 + #endif /* CONFIG_FAIR_GROUP_SCHED */ 469 452 470 453 #ifdef CONFIG_RT_GROUP_SCHED 471 454 struct sched_rt_entity **rt_se; ··· 544 531 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 545 532 extern void online_fair_sched_group(struct task_group *tg); 546 533 extern void unregister_fair_sched_group(struct task_group *tg); 547 - #else 534 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 548 535 static inline void free_fair_sched_group(struct task_group *tg) { } 549 536 static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 550 537 { ··· 552 539 } 553 540 static inline void online_fair_sched_group(struct task_group *tg) { } 554 541 static inline void unregister_fair_sched_group(struct task_group *tg) { } 555 - #endif 542 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 556 543 557 544 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 558 545 struct sched_entity *se, int cpu, ··· 586 573 587 574 extern int sched_group_set_idle(struct task_group *tg, long idle); 588 575 589 - #ifdef CONFIG_SMP 590 576 extern void set_task_rq_fair(struct sched_entity *se, 591 577 struct cfs_rq *prev, struct cfs_rq *next); 592 - #else /* !CONFIG_SMP */ 593 - static inline void set_task_rq_fair(struct sched_entity *se, 594 - struct cfs_rq *prev, struct cfs_rq *next) { } 595 - #endif /* CONFIG_SMP */ 596 - #else /* !CONFIG_FAIR_GROUP_SCHED */ 578 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 597 579 static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } 598 580 static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } 599 - #endif /* CONFIG_FAIR_GROUP_SCHED */ 581 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 600 582 601 - #else /* CONFIG_CGROUP_SCHED */ 583 + #else /* !CONFIG_CGROUP_SCHED: */ 602 584 603 585 struct cfs_bandwidth { }; 604 586 605 587 
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } 606 588 607 - #endif /* CONFIG_CGROUP_SCHED */ 589 + #endif /* !CONFIG_CGROUP_SCHED */ 608 590 609 591 extern void unregister_rt_sched_group(struct task_group *tg); 610 592 extern void free_rt_sched_group(struct task_group *tg); ··· 675 667 struct sched_entity *curr; 676 668 struct sched_entity *next; 677 669 678 - #ifdef CONFIG_SMP 679 670 /* 680 671 * CFS load tracking 681 672 */ ··· 706 699 u64 last_h_load_update; 707 700 struct sched_entity *h_load_next; 708 701 #endif /* CONFIG_FAIR_GROUP_SCHED */ 709 - #endif /* CONFIG_SMP */ 710 702 711 703 #ifdef CONFIG_FAIR_GROUP_SCHED 712 704 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ ··· 802 796 struct rt_prio_array active; 803 797 unsigned int rt_nr_running; 804 798 unsigned int rr_nr_running; 805 - #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 806 799 struct { 807 800 int curr; /* highest queued rt task prio */ 808 - #ifdef CONFIG_SMP 809 801 int next; /* next highest */ 810 - #endif 811 802 } highest_prio; 812 - #endif 813 - #ifdef CONFIG_SMP 814 803 bool overloaded; 815 804 struct plist_head pushable_tasks; 816 805 817 - #endif /* CONFIG_SMP */ 818 806 int rt_queued; 819 807 820 808 #ifdef CONFIG_RT_GROUP_SCHED ··· 839 839 840 840 unsigned int dl_nr_running; 841 841 842 - #ifdef CONFIG_SMP 843 842 /* 844 843 * Deadline values of the currently executing and the 845 844 * earliest ready task on this rq. Caching these facilitates ··· 858 859 * of the leftmost (earliest deadline) element. 
859 860 */ 860 861 struct rb_root_cached pushable_dl_tasks_root; 861 - #else 862 - struct dl_bw dl_bw; 863 - #endif 862 + 864 863 /* 865 864 * "Active utilization" for this runqueue: increased when a 866 865 * task wakes up (becomes TASK_RUNNING) and decreased when a ··· 929 932 930 933 #endif /* !CONFIG_FAIR_GROUP_SCHED */ 931 934 932 - #ifdef CONFIG_SMP 933 935 /* 934 936 * XXX we want to get rid of these helpers and use the full load resolution. 935 937 */ ··· 1004 1008 /* These atomics are updated outside of a lock */ 1005 1009 atomic_t rto_loop_next; 1006 1010 atomic_t rto_loop_start; 1007 - #endif 1011 + #endif /* HAVE_RT_PUSH_IPI */ 1008 1012 /* 1009 1013 * The "RT overload" flag: it gets set if a CPU has more than 1010 1014 * one runnable RT task. ··· 1039 1043 #ifdef HAVE_RT_PUSH_IPI 1040 1044 extern void rto_push_irq_work_func(struct irq_work *work); 1041 1045 #endif 1042 - #endif /* CONFIG_SMP */ 1043 1046 1044 1047 #ifdef CONFIG_UCLAMP_TASK 1045 1048 /* ··· 1102 1107 unsigned int numa_migrate_on; 1103 1108 #endif 1104 1109 #ifdef CONFIG_NO_HZ_COMMON 1105 - #ifdef CONFIG_SMP 1106 1110 unsigned long last_blocked_load_update_tick; 1107 1111 unsigned int has_blocked_load; 1108 1112 call_single_data_t nohz_csd; 1109 - #endif /* CONFIG_SMP */ 1110 1113 unsigned int nohz_tick_stopped; 1111 1114 atomic_t nohz_flags; 1112 1115 #endif /* CONFIG_NO_HZ_COMMON */ 1113 1116 1114 - #ifdef CONFIG_SMP 1115 1117 unsigned int ttwu_pending; 1116 - #endif 1117 1118 u64 nr_switches; 1118 1119 1119 1120 #ifdef CONFIG_UCLAMP_TASK ··· 1142 1151 */ 1143 1152 unsigned long nr_uninterruptible; 1144 1153 1154 + #ifdef CONFIG_SCHED_PROXY_EXEC 1155 + struct task_struct __rcu *donor; /* Scheduling context */ 1156 + struct task_struct __rcu *curr; /* Execution context */ 1157 + #else 1145 1158 union { 1146 1159 struct task_struct __rcu *donor; /* Scheduler context */ 1147 1160 struct task_struct __rcu *curr; /* Execution context */ 1148 1161 }; 1162 + #endif 1149 1163 struct 
sched_dl_entity *dl_server; 1150 1164 struct task_struct *idle; 1151 1165 struct task_struct *stop; ··· 1179 1183 int membarrier_state; 1180 1184 #endif 1181 1185 1182 - #ifdef CONFIG_SMP 1183 1186 struct root_domain *rd; 1184 1187 struct sched_domain __rcu *sd; 1185 1188 ··· 1219 1224 #ifdef CONFIG_HOTPLUG_CPU 1220 1225 struct rcuwait hotplug_wait; 1221 1226 #endif 1222 - #endif /* CONFIG_SMP */ 1223 1227 1224 1228 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1225 1229 u64 prev_irq_time; ··· 1236 1242 long calc_load_active; 1237 1243 1238 1244 #ifdef CONFIG_SCHED_HRTICK 1239 - #ifdef CONFIG_SMP 1240 1245 call_single_data_t hrtick_csd; 1241 - #endif 1242 1246 struct hrtimer hrtick_timer; 1243 1247 ktime_t hrtick_time; 1244 1248 #endif ··· 1263 1271 struct cpuidle_state *idle_state; 1264 1272 #endif 1265 1273 1266 - #ifdef CONFIG_SMP 1267 1274 unsigned int nr_pinned; 1268 - #endif 1269 1275 unsigned int push_busy; 1270 1276 struct cpu_stop_work push_work; 1271 1277 ··· 1284 1294 unsigned int core_forceidle_seq; 1285 1295 unsigned int core_forceidle_occupation; 1286 1296 u64 core_forceidle_start; 1287 - #endif 1297 + #endif /* CONFIG_SCHED_CORE */ 1288 1298 1289 1299 /* Scratch cpumask to be temporarily used under rq_lock */ 1290 1300 cpumask_var_t scratch_mask; 1291 1301 1292 - #if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) 1302 + #ifdef CONFIG_CFS_BANDWIDTH 1293 1303 call_single_data_t cfsb_csd; 1294 1304 struct list_head cfsb_csd_list; 1295 1305 #endif ··· 1303 1313 return cfs_rq->rq; 1304 1314 } 1305 1315 1306 - #else 1316 + #else /* !CONFIG_FAIR_GROUP_SCHED: */ 1307 1317 1308 1318 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 1309 1319 { 1310 1320 return container_of(cfs_rq, struct rq, cfs); 1311 1321 } 1312 - #endif 1322 + #endif /* !CONFIG_FAIR_GROUP_SCHED */ 1313 1323 1314 1324 static inline int cpu_of(struct rq *rq) 1315 1325 { 1316 - #ifdef CONFIG_SMP 1317 1326 return rq->cpu; 1318 - #else 1319 - return 0; 1320 - #endif 1321 1327 } 1322 1328 1323 
1329 #define MDF_PUSH 0x01 1324 1330 1325 1331 static inline bool is_migration_disabled(struct task_struct *p) 1326 1332 { 1327 - #ifdef CONFIG_SMP 1328 1333 return p->migration_disabled; 1329 - #else 1330 - return false; 1331 - #endif 1332 1334 } 1333 1335 1334 1336 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ··· 1331 1349 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 1332 1350 #define raw_rq() raw_cpu_ptr(&runqueues) 1333 1351 1352 + #ifdef CONFIG_SCHED_PROXY_EXEC 1353 + static inline void rq_set_donor(struct rq *rq, struct task_struct *t) 1354 + { 1355 + rcu_assign_pointer(rq->donor, t); 1356 + } 1357 + #else 1334 1358 static inline void rq_set_donor(struct rq *rq, struct task_struct *t) 1335 1359 { 1336 1360 /* Do nothing */ 1337 1361 } 1362 + #endif 1338 1363 1339 1364 #ifdef CONFIG_SCHED_CORE 1340 1365 static inline struct cpumask *sched_group_span(struct sched_group *sg); ··· 1489 1500 } 1490 1501 1491 1502 #endif /* !CONFIG_SCHED_CORE */ 1503 + 1492 1504 #ifdef CONFIG_RT_GROUP_SCHED 1493 1505 # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED 1494 1506 DECLARE_STATIC_KEY_FALSE(rt_group_sched); ··· 1497 1507 { 1498 1508 return static_branch_unlikely(&rt_group_sched); 1499 1509 } 1500 - # else 1510 + # else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ 1501 1511 DECLARE_STATIC_KEY_TRUE(rt_group_sched); 1502 1512 static inline bool rt_group_sched_enabled(void) 1503 1513 { 1504 1514 return static_branch_likely(&rt_group_sched); 1505 1515 } 1506 - # endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ 1507 - #else 1516 + # endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ 1517 + #else /* !CONFIG_RT_GROUP_SCHED: */ 1508 1518 # define rt_group_sched_enabled() false 1509 - #endif /* CONFIG_RT_GROUP_SCHED */ 1519 + #endif /* !CONFIG_RT_GROUP_SCHED */ 1510 1520 1511 1521 static inline void lockdep_assert_rq_held(struct rq *rq) 1512 1522 { ··· 1564 1574 __update_idle_core(rq); 1565 1575 } 1566 1576 1567 - #else 1577 + #else /* !CONFIG_SCHED_SMT: */ 1568 1578 static 
inline void update_idle_core(struct rq *rq) { } 1569 - #endif 1579 + #endif /* !CONFIG_SCHED_SMT */ 1570 1580 1571 1581 #ifdef CONFIG_FAIR_GROUP_SCHED 1572 1582 ··· 1747 1757 WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); 1748 1758 } 1749 1759 1750 - #else /* !CONFIG_SCHED_CLASS_EXT */ 1760 + #else /* !CONFIG_SCHED_CLASS_EXT: */ 1751 1761 #define scx_enabled() false 1752 1762 #define scx_switched_all() false 1753 1763 ··· 1771 1781 1772 1782 rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 1773 1783 rf->clock_update_flags = 0; 1774 - #ifdef CONFIG_SMP 1775 1784 WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); 1776 - #endif 1777 1785 } 1778 1786 1779 1787 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) ··· 1949 1961 1950 1962 #endif /* !CONFIG_NUMA_BALANCING */ 1951 1963 1952 - #ifdef CONFIG_SMP 1953 - 1954 1964 static inline void 1955 1965 queue_balance_callback(struct rq *rq, 1956 1966 struct balance_callback *head, ··· 2114 2128 return p->user_cpus_ptr; 2115 2129 } 2116 2130 2117 - #endif /* CONFIG_SMP */ 2118 - 2119 2131 #ifdef CONFIG_CGROUP_SCHED 2120 2132 2121 2133 /* ··· 2158 2174 tg = &root_task_group; 2159 2175 p->rt.rt_rq = tg->rt_rq[cpu]; 2160 2176 p->rt.parent = tg->rt_se[cpu]; 2161 - #endif 2177 + #endif /* CONFIG_RT_GROUP_SCHED */ 2162 2178 } 2163 2179 2164 2180 #else /* !CONFIG_CGROUP_SCHED: */ ··· 2184 2200 smp_wmb(); 2185 2201 WRITE_ONCE(task_thread_info(p)->cpu, cpu); 2186 2202 p->wake_cpu = cpu; 2187 - #endif 2203 + #endif /* CONFIG_SMP */ 2188 2204 } 2189 2205 2190 2206 /* ··· 2262 2278 return rq->donor == p; 2263 2279 } 2264 2280 2281 + static inline bool task_is_blocked(struct task_struct *p) 2282 + { 2283 + if (!sched_proxy_exec()) 2284 + return false; 2285 + 2286 + return !!p->blocked_on; 2287 + } 2288 + 2265 2289 static inline int task_on_cpu(struct rq *rq, struct task_struct *p) 2266 2290 { 2267 - #ifdef CONFIG_SMP 2268 2291 return p->on_cpu; 2269 - #else 2270 
- return task_current(rq, p); 2271 - #endif 2272 2292 } 2273 2293 2274 2294 static inline int task_on_rq_queued(struct task_struct *p) ··· 2295 2307 #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ 2296 2308 #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ 2297 2309 2298 - #ifdef CONFIG_SMP 2299 2310 static_assert(WF_EXEC == SD_BALANCE_EXEC); 2300 2311 static_assert(WF_FORK == SD_BALANCE_FORK); 2301 2312 static_assert(WF_TTWU == SD_BALANCE_WAKE); 2302 - #endif 2303 2313 2304 2314 /* 2305 2315 * To aid in avoiding the subversion of "niceness" due to uneven distribution ··· 2353 2367 2354 2368 #define ENQUEUE_HEAD 0x10 2355 2369 #define ENQUEUE_REPLENISH 0x20 2356 - #ifdef CONFIG_SMP 2357 2370 #define ENQUEUE_MIGRATED 0x40 2358 - #else 2359 - #define ENQUEUE_MIGRATED 0x00 2360 - #endif 2361 2371 #define ENQUEUE_INITIAL 0x80 2362 2372 #define ENQUEUE_MIGRATING 0x100 2363 2373 #define ENQUEUE_DELAYED 0x200 ··· 2398 2416 void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); 2399 2417 void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); 2400 2418 2401 - #ifdef CONFIG_SMP 2402 2419 int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); 2403 2420 2404 2421 void (*migrate_task_rq)(struct task_struct *p, int new_cpu); ··· 2410 2429 void (*rq_offline)(struct rq *rq); 2411 2430 2412 2431 struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); 2413 - #endif 2414 2432 2415 2433 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); 2416 2434 void (*task_fork)(struct task_struct *p); ··· 2467 2487 struct task_struct *prev, 2468 2488 struct task_struct *next) 2469 2489 { 2470 - WARN_ON_ONCE(rq->curr != prev); 2490 + WARN_ON_ONCE(rq->donor != prev); 2471 2491 2472 2492 __put_prev_set_next_dl_server(rq, prev, next); 2473 2493 ··· 2561 2581 #define SCA_MIGRATE_ENABLE 0x04 2562 2582 #define SCA_USER 0x08 2563 2583 2564 - #ifdef CONFIG_SMP 2565 - 
2566 2584 extern void update_group_capacity(struct sched_domain *sd, int cpu); 2567 2585 2568 2586 extern void sched_balance_trigger(struct rq *rq); ··· 2611 2633 } 2612 2634 2613 2635 extern int push_cpu_stop(void *arg); 2614 - 2615 - #else /* !CONFIG_SMP: */ 2616 - 2617 - static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) 2618 - { 2619 - return true; 2620 - } 2621 - 2622 - static inline int __set_cpus_allowed_ptr(struct task_struct *p, 2623 - struct affinity_context *ctx) 2624 - { 2625 - return set_cpus_allowed_ptr(p, ctx->new_mask); 2626 - } 2627 - 2628 - static inline cpumask_t *alloc_user_cpus_ptr(int node) 2629 - { 2630 - return NULL; 2631 - } 2632 - 2633 - #endif /* !CONFIG_SMP */ 2634 2636 2635 2637 #ifdef CONFIG_CPU_IDLE 2636 2638 ··· 2707 2749 call_trace_sched_update_nr_running(rq, count); 2708 2750 } 2709 2751 2710 - #ifdef CONFIG_SMP 2711 2752 if (prev_nr < 2 && rq->nr_running >= 2) 2712 2753 set_rd_overloaded(rq->rd, 1); 2713 - #endif 2714 2754 2715 2755 sched_update_tick_dependency(rq); 2716 2756 } ··· 2874 2918 static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) 2875 2919 { 2876 2920 rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 2877 - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ 2878 - #ifdef CONFIG_SMP 2879 2921 rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 2880 - #endif 2881 2922 } 2882 2923 2883 2924 #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ ··· 2882 2929 static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ 2883 2930 { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ 2884 2931 _lock; return _t; } 2885 - 2886 - #ifdef CONFIG_SMP 2887 2932 2888 2933 static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) 2889 2934 { ··· 2905 2954 /* 2906 2955 * __sched_core_flip() relies on SMT having cpu-id lock order. 
2907 2956 */ 2908 - #endif 2957 + #endif /* CONFIG_SCHED_CORE */ 2909 2958 return rq1->cpu < rq2->cpu; 2910 2959 } 2911 2960 ··· 3042 3091 3043 3092 extern bool sched_smp_initialized; 3044 3093 3045 - #else /* !CONFIG_SMP: */ 3046 - 3047 - /* 3048 - * double_rq_lock - safely lock two runqueues 3049 - * 3050 - * Note this does not disable interrupts like task_rq_lock, 3051 - * you need to do so manually before calling. 3052 - */ 3053 - static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) 3054 - __acquires(rq1->lock) 3055 - __acquires(rq2->lock) 3056 - { 3057 - WARN_ON_ONCE(!irqs_disabled()); 3058 - WARN_ON_ONCE(rq1 != rq2); 3059 - raw_spin_rq_lock(rq1); 3060 - __acquire(rq2->lock); /* Fake it out ;) */ 3061 - double_rq_clock_clear_update(rq1, rq2); 3062 - } 3063 - 3064 - /* 3065 - * double_rq_unlock - safely unlock two runqueues 3066 - * 3067 - * Note this does not restore interrupts like task_rq_unlock, 3068 - * you need to do so manually after calling. 3069 - */ 3070 - static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) 3071 - __releases(rq1->lock) 3072 - __releases(rq2->lock) 3073 - { 3074 - WARN_ON_ONCE(rq1 != rq2); 3075 - raw_spin_rq_unlock(rq1); 3076 - __release(rq2->lock); 3077 - } 3078 - 3079 - #endif /* !CONFIG_SMP */ 3080 - 3081 3094 DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, 3082 3095 double_rq_lock(_T->lock, _T->lock2), 3083 3096 double_rq_unlock(_T->lock, _T->lock2)) ··· 3060 3145 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); 3061 3146 3062 3147 extern void resched_latency_warn(int cpu, u64 latency); 3148 + 3063 3149 #ifdef CONFIG_NUMA_BALANCING 3064 3150 extern void show_numa_stats(struct task_struct *p, struct seq_file *m); 3065 3151 extern void ··· 3100 3184 static inline void nohz_balance_exit_idle(struct rq *rq) { } 3101 3185 #endif /* !CONFIG_NO_HZ_COMMON */ 3102 3186 3103 - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 3187 + #ifdef CONFIG_NO_HZ_COMMON 3104 3188 extern void 
nohz_run_idle_balance(int cpu); 3105 3189 #else 3106 3190 static inline void nohz_run_idle_balance(int cpu) { } ··· 3170 3254 return total; 3171 3255 } 3172 3256 3173 - #else 3257 + #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 3174 3258 3175 3259 static inline int irqtime_enabled(void) 3176 3260 { 3177 3261 return 0; 3178 3262 } 3179 3263 3180 - #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 3264 + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 3181 3265 3182 3266 #ifdef CONFIG_CPU_FREQ 3183 3267 ··· 3226 3310 # define arch_scale_freq_invariant() false 3227 3311 #endif 3228 3312 3229 - #ifdef CONFIG_SMP 3230 - 3231 3313 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 3232 3314 unsigned long *min, 3233 3315 unsigned long *max); ··· 3268 3354 { 3269 3355 return READ_ONCE(rq->avg_rt.util_avg); 3270 3356 } 3271 - 3272 - #else /* !CONFIG_SMP */ 3273 - static inline bool update_other_load_avgs(struct rq *rq) { return false; } 3274 - #endif /* CONFIG_SMP */ 3275 3357 3276 3358 #ifdef CONFIG_UCLAMP_TASK 3277 3359 ··· 3445 3535 return static_branch_unlikely(&sched_energy_present); 3446 3536 } 3447 3537 3448 - #else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ 3538 + #else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ 3449 3539 3450 3540 #define perf_domain_span(pd) NULL 3451 3541 3452 3542 static inline bool sched_energy_enabled(void) { return false; } 3453 3543 3454 - #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 3544 + #endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ 3455 3545 3456 3546 #ifdef CONFIG_MEMBARRIER 3457 3547 ··· 3477 3567 WRITE_ONCE(rq->membarrier_state, membarrier_state); 3478 3568 } 3479 3569 3480 - #else /* !CONFIG_MEMBARRIER :*/ 3570 + #else /* !CONFIG_MEMBARRIER: */ 3481 3571 3482 3572 static inline void membarrier_switch_mm(struct rq *rq, 3483 3573 struct mm_struct *prev_mm, ··· 3487 3577 3488 3578 #endif /* !CONFIG_MEMBARRIER */ 3489 3579 3490 - #ifdef CONFIG_SMP 3491 3580 static inline bool is_per_cpu_kthread(struct task_struct *p) 3492 3581 { 3493 3582 if (!(p->flags & PF_KTHREAD)) ··· 3497 3588 3498 3589 return true; 3499 3590 } 3500 - #endif 3501 3591 3502 3592 extern void swake_up_all_locked(struct swait_queue_head *q); 3503 3593 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ··· 3795 3887 3796 3888 extern u64 avg_vruntime(struct cfs_rq *cfs_rq); 3797 3889 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); 3798 - #ifdef CONFIG_SMP 3799 3890 static inline 3800 3891 void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) 3801 3892 { ··· 3815 3908 3816 3909 return false; 3817 3910 } 3818 - #endif 3819 3911 3820 3912 #ifdef CONFIG_RT_MUTEXES 3821 3913 ··· 3855 3949 const struct sched_class *prev_class, 3856 3950 int oldprio); 3857 3951 3858 - #ifdef CONFIG_SMP 3859 3952 extern struct balance_callback *splice_balance_callbacks(struct rq *rq); 3860 3953 extern void balance_callbacks(struct rq *rq, struct balance_callback *head); 3861 - #else 3862 - 3863 - static inline struct 
balance_callback *splice_balance_callbacks(struct rq *rq) 3864 - { 3865 - return NULL; 3866 - } 3867 - 3868 - static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) 3869 - { 3870 - } 3871 - 3872 - #endif 3873 3954 3874 3955 #ifdef CONFIG_SCHED_CLASS_EXT 3875 3956 /*

+7

kernel/sched/smp.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef _KERNEL_SCHED_SMP_H 4 + #define _KERNEL_SCHED_SMP_H 5 + 2 6 /* 3 7 * Scheduler internal SMP callback types and methods between the scheduler 4 8 * and other internal parts of the core kernel: 5 9 */ 10 + #include <linux/types.h> 6 11 7 12 extern void sched_ttwu_pending(void *arg); 8 13 ··· 18 13 #else 19 14 static inline void flush_smp_call_function_queue(void) { } 20 15 #endif 16 + 17 + #endif /* _KERNEL_SCHED_SMP_H */

+1 -4

kernel/sched/stats.c

··· 2 2 /* 3 3 * /proc/schedstat implementation 4 4 */ 5 + #include "sched.h" 5 6 6 7 void __update_stats_wait_start(struct rq *rq, struct task_struct *p, 7 8 struct sched_statistics *stats) ··· 115 114 seq_printf(seq, "timestamp %lu\n", jiffies); 116 115 } else { 117 116 struct rq *rq; 118 - #ifdef CONFIG_SMP 119 117 struct sched_domain *sd; 120 118 int dcount = 0; 121 - #endif 122 119 cpu = (unsigned long)(v - 2); 123 120 rq = cpu_rq(cpu); 124 121 ··· 131 132 132 133 seq_printf(seq, "\n"); 133 134 134 - #ifdef CONFIG_SMP 135 135 /* domain-specific stats */ 136 136 rcu_read_lock(); 137 137 for_each_domain(cpu, sd) { ··· 161 163 sd->ttwu_move_balance); 162 164 } 163 165 rcu_read_unlock(); 164 - #endif 165 166 } 166 167 return 0; 167 168 }

+5 -5

kernel/sched/stats.h

··· 112 112 bool sleep); 113 113 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 114 114 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); 115 - #else 115 + #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ 116 116 static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, 117 117 struct task_struct *prev) {} 118 - #endif /*CONFIG_IRQ_TIME_ACCOUNTING */ 118 + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ 119 119 /* 120 120 * PSI tracks state that persists across sleeps, such as iowaits and 121 121 * memory stalls. As a result, it has to distinguish between sleeps, ··· 220 220 psi_task_switch(prev, next, sleep); 221 221 } 222 222 223 - #else /* CONFIG_PSI */ 223 + #else /* !CONFIG_PSI: */ 224 224 static inline void psi_enqueue(struct task_struct *p, bool migrate) {} 225 225 static inline void psi_dequeue(struct task_struct *p, bool migrate) {} 226 226 static inline void psi_ttwu_dequeue(struct task_struct *p) {} ··· 229 229 bool sleep) {} 230 230 static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, 231 231 struct task_struct *prev) {} 232 - #endif /* CONFIG_PSI */ 232 + #endif /* !CONFIG_PSI */ 233 233 234 234 #ifdef CONFIG_SCHED_INFO 235 235 /* ··· 334 334 # define sched_info_enqueue(rq, t) do { } while (0) 335 335 # define sched_info_dequeue(rq, t) do { } while (0) 336 336 # define sched_info_switch(rq, t, next) do { } while (0) 337 - #endif /* CONFIG_SCHED_INFO */ 337 + #endif /* !CONFIG_SCHED_INFO */ 338 338 339 339 #endif /* _KERNEL_STATS_H */

+1 -4

kernel/sched/stop_task.c

··· 7 7 * 8 8 * See kernel/stop_machine.c 9 9 */ 10 + #include "sched.h" 10 11 11 - #ifdef CONFIG_SMP 12 12 static int 13 13 select_task_rq_stop(struct task_struct *p, int cpu, int flags) 14 14 { ··· 20 20 { 21 21 return sched_stop_runnable(rq); 22 22 } 23 - #endif /* CONFIG_SMP */ 24 23 25 24 static void 26 25 wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) ··· 105 106 .put_prev_task = put_prev_task_stop, 106 107 .set_next_task = set_next_task_stop, 107 108 108 - #ifdef CONFIG_SMP 109 109 .balance = balance_stop, 110 110 .select_task_rq = select_task_rq_stop, 111 111 .set_cpus_allowed = set_cpus_allowed_common, 112 - #endif 113 112 114 113 .task_tick = task_tick_stop, 115 114

+1

kernel/sched/swait.c

··· 2 2 /* 3 3 * <linux/swait.h> (simple wait queues ) implementation: 4 4 */ 5 + #include "sched.h" 5 6 6 7 void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7 8 struct lock_class_key *key)

+4 -11

kernel/sched/syscalls.c

··· 174 174 return 0; 175 175 } 176 176 177 - #endif 177 + #endif /* __ARCH_WANT_SYS_NICE */ 178 178 179 179 /** 180 180 * task_prio - return the priority value of a given task. ··· 209 209 if (rq->nr_running) 210 210 return 0; 211 211 212 - #ifdef CONFIG_SMP 213 212 if (rq->ttwu_pending) 214 213 return 0; 215 - #endif 216 214 217 215 return 1; 218 216 } ··· 253 255 254 256 return idle_cpu(cpu); 255 257 } 256 - 257 - #endif 258 + #endif /* CONFIG_SCHED_CORE */ 258 259 259 260 /** 260 261 * find_process_by_pid - find a process with a matching PID value. ··· 445 448 } 446 449 static void __setscheduler_uclamp(struct task_struct *p, 447 450 const struct sched_attr *attr) { } 448 - #endif 451 + #endif /* !CONFIG_UCLAMP_TASK */ 449 452 450 453 /* 451 454 * Allow unprivileged RT tasks to decrease priority. ··· 639 642 goto unlock; 640 643 } 641 644 #endif /* CONFIG_RT_GROUP_SCHED */ 642 - #ifdef CONFIG_SMP 643 645 if (dl_bandwidth_enabled() && dl_policy(policy) && 644 646 !(attr->sched_flags & SCHED_FLAG_SUGOV)) { 645 647 cpumask_t *span = rq->rd->span; ··· 654 658 goto unlock; 655 659 } 656 660 } 657 - #endif 658 661 } 659 662 660 663 /* Re-check policy now with rq lock held: */ ··· 1115 1120 return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); 1116 1121 } 1117 1122 1118 - #ifdef CONFIG_SMP 1119 1123 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) 1120 1124 { 1121 1125 /* ··· 1143 1149 1144 1150 return 0; 1145 1151 } 1146 - #endif /* CONFIG_SMP */ 1147 1152 1148 1153 int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) 1149 1154 { ··· 1235 1242 user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); 1236 1243 if (user_mask) { 1237 1244 cpumask_copy(user_mask, in_mask); 1238 - } else if (IS_ENABLED(CONFIG_SMP)) { 1245 + } else { 1239 1246 return -ENOMEM; 1240 1247 } 1241 1248

+24 -33

kernel/sched/topology.c

··· 3 3 * Scheduler topology setup/handling methods 4 4 */ 5 5 6 + #include <linux/sched/isolation.h> 6 7 #include <linux/bsearch.h> 8 + #include "sched.h" 7 9 8 10 DEFINE_MUTEX(sched_domains_mutex); 9 11 void sched_domains_mutex_lock(void) ··· 89 87 break; 90 88 } 91 89 92 - if (!(sd->flags & SD_OVERLAP) && 90 + if (!(sd->flags & SD_NUMA) && 93 91 cpumask_intersects(groupmask, sched_group_span(group))) { 94 92 printk(KERN_CONT "\n"); 95 93 printk(KERN_ERR "ERROR: repeated CPUs\n"); ··· 102 100 group->sgc->id, 103 101 cpumask_pr_args(sched_group_span(group))); 104 102 105 - if ((sd->flags & SD_OVERLAP) && 103 + if ((sd->flags & SD_NUMA) && 106 104 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { 107 105 printk(KERN_CONT " mask=%*pbl", 108 106 cpumask_pr_args(group_balance_mask(group))); ··· 315 313 } 316 314 317 315 late_initcall(sched_energy_aware_sysctl_init); 318 - #endif 316 + #endif /* CONFIG_PROC_SYSCTL */ 319 317 320 318 static void free_pd(struct perf_domain *pd) 321 319 { ··· 451 449 452 450 return false; 453 451 } 454 - #else 452 + #else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ 455 453 static void free_pd(struct perf_domain *pd) { } 456 - #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ 454 + #endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ 457 455 458 456 static void free_rootdomain(struct rcu_head *rcu) 459 457 { ··· 1320 1318 update_group_capacity(sd, cpu); 1321 1319 } 1322 1320 1323 - #ifdef CONFIG_SMP 1324 - 1325 1321 /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ 1326 1322 void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) 1327 1323 { ··· 1344 1344 * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" 1345 1345 * which is shared by all the overlapping groups. 
1346 1346 */ 1347 - WARN_ON_ONCE(sd->flags & SD_OVERLAP); 1347 + WARN_ON_ONCE(sd->flags & SD_NUMA); 1348 1348 1349 1349 sg = sd->groups; 1350 1350 if (cpu != sg->asym_prefer_cpu) { ··· 1373 1373 WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); 1374 1374 } 1375 1375 } 1376 - 1377 - #endif /* CONFIG_SMP */ 1378 1376 1379 1377 /* 1380 1378 * Set of available CPUs grouped by their corresponding capacities ··· 1596 1598 int sched_max_numa_distance; 1597 1599 static int *sched_domains_numa_distance; 1598 1600 static struct cpumask ***sched_domains_numa_masks; 1599 - #endif 1601 + #endif /* CONFIG_NUMA */ 1600 1602 1601 1603 /* 1602 1604 * SD_flags allowed in topology descriptions. ··· 1712 1714 SD_WAKE_AFFINE); 1713 1715 } 1714 1716 1715 - #endif 1717 + #endif /* CONFIG_NUMA */ 1716 1718 } else { 1717 1719 sd->cache_nice_tries = 1; 1718 1720 } ··· 1737 1739 */ 1738 1740 static struct sched_domain_topology_level default_topology[] = { 1739 1741 #ifdef CONFIG_SCHED_SMT 1740 - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 1742 + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), 1741 1743 #endif 1742 1744 1743 1745 #ifdef CONFIG_SCHED_CLUSTER 1744 - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, 1746 + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), 1745 1747 #endif 1746 1748 1747 1749 #ifdef CONFIG_SCHED_MC 1748 - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 1750 + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), 1749 1751 #endif 1750 - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, 1752 + SDTL_INIT(cpu_cpu_mask, NULL, PKG), 1751 1753 { NULL, }, 1752 1754 }; 1753 1755 ··· 2008 2010 /* 2009 2011 * Add the NUMA identity distance, aka single NODE. 2010 2012 */ 2011 - tl[i++] = (struct sched_domain_topology_level){ 2012 - .mask = sd_numa_mask, 2013 - .numa_level = 0, 2014 - SD_INIT_NAME(NODE) 2015 - }; 2013 + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); 2016 2014 2017 2015 /* 2018 2016 * .. and append 'j' levels of NUMA goodness. 
2019 2017 */ 2020 2018 for (j = 1; j < nr_levels; i++, j++) { 2021 - tl[i] = (struct sched_domain_topology_level){ 2022 - .mask = sd_numa_mask, 2023 - .sd_flags = cpu_numa_flags, 2024 - .flags = SDTL_OVERLAP, 2025 - .numa_level = j, 2026 - SD_INIT_NAME(NUMA) 2027 - }; 2019 + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); 2020 + tl[i].numa_level = j; 2028 2021 } 2029 2022 2030 2023 sched_domain_topology_saved = sched_domain_topology; ··· 2326 2337 2327 2338 if (sdd->sd) { 2328 2339 sd = *per_cpu_ptr(sdd->sd, j); 2329 - if (sd && (sd->flags & SD_OVERLAP)) 2340 + if (sd && (sd->flags & SD_NUMA)) 2330 2341 free_sched_groups(sd->groups, 0); 2331 2342 kfree(*per_cpu_ptr(sdd->sd, j)); 2332 2343 } ··· 2392 2403 id_seen = sched_domains_tmpmask2; 2393 2404 2394 2405 for_each_sd_topology(tl) { 2406 + int tl_common_flags = 0; 2407 + 2408 + if (tl->sd_flags) 2409 + tl_common_flags = (*tl->sd_flags)(); 2395 2410 2396 2411 /* NUMA levels are allowed to overlap */ 2397 - if (tl->flags & SDTL_OVERLAP) 2412 + if (tl_common_flags & SD_NUMA) 2398 2413 continue; 2399 2414 2400 2415 cpumask_clear(covered); ··· 2469 2476 2470 2477 if (tl == sched_domain_topology) 2471 2478 *per_cpu_ptr(d.sd, i) = sd; 2472 - if (tl->flags & SDTL_OVERLAP) 2473 - sd->flags |= SD_OVERLAP; 2474 2479 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 2475 2480 break; 2476 2481 } ··· 2481 2490 for_each_cpu(i, cpu_map) { 2482 2491 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 2483 2492 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 2484 - if (sd->flags & SD_OVERLAP) { 2493 + if (sd->flags & SD_NUMA) { 2485 2494 if (build_overlap_sched_groups(sd, i)) 2486 2495 goto error; 2487 2496 } else {

+1

kernel/sched/wait.c

··· 4 4 * 5 5 * (C) 2004 Nadia Yvette Chambers, Oracle 6 6 */ 7 + #include "sched.h" 7 8 8 9 void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 9 10 {

+3

kernel/sched/wait_bit.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 3 + #include <linux/sched/debug.h> 4 + #include "sched.h" 5 + 3 6 /* 4 7 * The implementation of the wait_bit*() and related waiting APIs: 5 8 */

-4

kernel/smpboot.c

··· 18 18 19 19 #include "smpboot.h" 20 20 21 - #ifdef CONFIG_SMP 22 - 23 21 #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 24 22 /* 25 23 * For the hotplug case we keep the task structs around and reuse ··· 73 75 } 74 76 } 75 77 #endif 76 - 77 - #endif /* #ifdef CONFIG_SMP */ 78 78 79 79 static LIST_HEAD(hotplug_threads); 80 80 static DEFINE_MUTEX(smpboot_threads_lock);

-2

lib/smp_processor_id.c

··· 22 22 if (is_percpu_thread()) 23 23 goto out; 24 24 25 - #ifdef CONFIG_SMP 26 25 if (current->migration_disabled) 27 26 goto out; 28 - #endif 29 27 30 28 /* 31 29 * It is valid to assume CPU-locality during early bootup:

+6

rust/helpers/task.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/kernel.h> 3 4 #include <linux/sched/task.h> 5 + 6 + void rust_helper_might_resched(void) 7 + { 8 + might_resched(); 9 + } 4 10 5 11 struct task_struct *rust_helper_get_current(void) 6 12 {

+48

rust/kernel/lib.rs

··· 43 43 #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(coerce_unsized))] 44 44 #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(dispatch_from_dyn))] 45 45 #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(unsize))] 46 + // 47 + // `feature(file_with_nul)` is expected to become stable. Before Rust 1.89.0, it did not exist, so 48 + // enable it conditionally. 49 + #![cfg_attr(CONFIG_RUSTC_HAS_FILE_WITH_NUL, feature(file_with_nul))] 46 50 47 51 // Ensure conditional compilation based on the kernel configuration works; 48 52 // otherwise we may silently break things like initcall handling. ··· 282 278 ($($asm:expr),* ; $($rest:tt)*) => { 283 279 ::core::arch::asm!( $($asm)*, $($rest)* ) 284 280 }; 281 + } 282 + 283 + /// Gets the C string file name of a [`Location`]. 284 + /// 285 + /// If `file_with_nul()` is not available, returns a string that warns about it. 286 + /// 287 + /// [`Location`]: core::panic::Location 288 + /// 289 + /// # Examples 290 + /// 291 + /// ``` 292 + /// # use kernel::file_from_location; 293 + /// 294 + /// #[track_caller] 295 + /// fn foo() { 296 + /// let caller = core::panic::Location::caller(); 297 + /// 298 + /// // Output: 299 + /// // - A path like "rust/kernel/example.rs" if file_with_nul() is available. 300 + /// // - "<Location::file_with_nul() not supported>" otherwise. 301 + /// let caller_file = file_from_location(caller); 302 + /// 303 + /// // Prints out the message with caller's file name. 
304 + /// pr_info!("foo() called in file {caller_file:?}\n"); 305 + /// 306 + /// # if cfg!(CONFIG_RUSTC_HAS_FILE_WITH_NUL) { 307 + /// # assert_eq!(Ok(caller.file()), caller_file.to_str()); 308 + /// # } 309 + /// } 310 + /// 311 + /// # foo(); 312 + /// ``` 313 + #[inline] 314 + pub fn file_from_location<'a>(loc: &'a core::panic::Location<'a>) -> &'a core::ffi::CStr { 315 + #[cfg(CONFIG_RUSTC_HAS_FILE_WITH_NUL)] 316 + { 317 + loc.file_with_nul() 318 + } 319 + 320 + #[cfg(not(CONFIG_RUSTC_HAS_FILE_WITH_NUL))] 321 + { 322 + let _ = loc; 323 + c"<Location::file_with_nul() not supported>" 324 + } 285 325 }

+3

rust/kernel/sync/condvar.rs

··· 216 216 /// This method behaves like `notify_one`, except that it hints to the scheduler that the 217 217 /// current thread is about to go to sleep, so it should schedule the target thread on the same 218 218 /// CPU. 219 + #[inline] 219 220 pub fn notify_sync(&self) { 220 221 // SAFETY: `wait_queue_head` points to valid memory. 221 222 unsafe { bindings::__wake_up_sync(self.wait_queue_head.get(), TASK_NORMAL) }; ··· 226 225 /// 227 226 /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost 228 227 /// completely (as opposed to automatically waking up the next waiter). 228 + #[inline] 229 229 pub fn notify_one(&self) { 230 230 self.notify(1); 231 231 } ··· 235 233 /// 236 234 /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost 237 235 /// completely (as opposed to automatically waking up the next waiter). 236 + #[inline] 238 237 pub fn notify_all(&self) { 239 238 self.notify(0); 240 239 }

+1

rust/kernel/sync/poll.rs

··· 91 91 92 92 #[pinned_drop] 93 93 impl PinnedDrop for PollCondVar { 94 + #[inline] 94 95 fn drop(self: Pin<&mut Self>) { 95 96 // Clear anything registered using `register_wait`. 96 97 //

+33

rust/kernel/task.rs

··· 173 173 /// Callers must ensure that the returned object is only used to access a [`CurrentTask`] 174 174 /// within the task context that was active when this function was called. For more details, 175 175 /// see the invariants section for [`CurrentTask`]. 176 + #[inline] 176 177 pub unsafe fn current() -> impl Deref<Target = CurrentTask> { 177 178 struct TaskRef { 178 179 task: *const CurrentTask, ··· 223 222 } 224 223 225 224 /// Returns the UID of the given task. 225 + #[inline] 226 226 pub fn uid(&self) -> Kuid { 227 227 // SAFETY: It's always safe to call `task_uid` on a valid task. 228 228 Kuid::from_raw(unsafe { bindings::task_uid(self.as_ptr()) }) 229 229 } 230 230 231 231 /// Returns the effective UID of the given task. 232 + #[inline] 232 233 pub fn euid(&self) -> Kuid { 233 234 // SAFETY: It's always safe to call `task_euid` on a valid task. 234 235 Kuid::from_raw(unsafe { bindings::task_euid(self.as_ptr()) }) 235 236 } 236 237 237 238 /// Determines whether the given task has pending signals. 239 + #[inline] 238 240 pub fn signal_pending(&self) -> bool { 239 241 // SAFETY: It's always safe to call `signal_pending` on a valid task. 240 242 unsafe { bindings::signal_pending(self.as_ptr()) != 0 } 241 243 } 242 244 243 245 /// Returns task's pid namespace with elevated reference count 246 + #[inline] 244 247 pub fn get_pid_ns(&self) -> Option<ARef<PidNamespace>> { 245 248 // SAFETY: By the type invariant, we know that `self.0` is valid. 246 249 let ptr = unsafe { bindings::task_get_pid_ns(self.as_ptr()) }; ··· 260 255 261 256 /// Returns the given task's pid in the provided pid namespace. 262 257 #[doc(alias = "task_tgid_nr_ns")] 258 + #[inline] 263 259 pub fn tgid_nr_ns(&self, pidns: Option<&PidNamespace>) -> Pid { 264 260 let pidns = match pidns { 265 261 Some(pidns) => pidns.as_ptr(), ··· 274 268 } 275 269 276 270 /// Wakes up the task. 
271 + #[inline] 277 272 pub fn wake_up(&self) { 278 273 // SAFETY: It's always safe to call `wake_up_process` on a valid task, even if the task 279 274 // running. ··· 348 341 349 342 // SAFETY: The type invariants guarantee that `Task` is always refcounted. 350 343 unsafe impl crate::types::AlwaysRefCounted for Task { 344 + #[inline] 351 345 fn inc_ref(&self) { 352 346 // SAFETY: The existence of a shared reference means that the refcount is nonzero. 353 347 unsafe { bindings::get_task_struct(self.as_ptr()) }; 354 348 } 355 349 350 + #[inline] 356 351 unsafe fn dec_ref(obj: ptr::NonNull<Self>) { 357 352 // SAFETY: The safety requirements guarantee that the refcount is nonzero. 358 353 unsafe { bindings::put_task_struct(obj.cast().as_ptr()) } ··· 400 391 } 401 392 402 393 impl Eq for Kuid {} 394 + 395 + /// Annotation for functions that can sleep. 396 + /// 397 + /// Equivalent to the C side [`might_sleep()`], this function serves as 398 + /// a debugging aid and a potential scheduling point. 399 + /// 400 + /// This function can only be used in a nonatomic context. 401 + /// 402 + /// [`might_sleep()`]: https://docs.kernel.org/driver-api/basics.html#c.might_sleep 403 + #[track_caller] 404 + #[inline] 405 + pub fn might_sleep() { 406 + #[cfg(CONFIG_DEBUG_ATOMIC_SLEEP)] 407 + { 408 + let loc = core::panic::Location::caller(); 409 + let file = kernel::file_from_location(loc); 410 + 411 + // SAFETY: `file.as_ptr()` is valid for reading and guaranteed to be nul-terminated. 412 + unsafe { crate::bindings::__might_sleep(file.as_ptr().cast(), loc.line() as i32) } 413 + } 414 + 415 + // SAFETY: Always safe to call. 416 + unsafe { crate::bindings::might_resched() } 417 + }

+57

tools/sched/dl_bw_dump.py

#!/usr/bin/env drgn
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2025 Juri Lelli <juri.lelli@redhat.com>
# Copyright (C) 2025 Red Hat, Inc.

desc = """
This is a drgn script to show dl_rq bandwidth accounting information. For more
info on drgn, visit https://github.com/osandov/drgn.

Only online CPUs are reported.
"""

import os
import argparse

import drgn
from drgn import FaultError
from drgn.helpers.common import *
from drgn.helpers.linux import *

def print_dl_bws_info():
    """Print the deadline-runqueue bandwidth counters of every online CPU.

    Walks all possible CPUs, skips those whose runqueue reports
    ``online == 0``, and prints the ``running_bw``, ``this_bw``,
    ``extra_bw``, ``max_bw`` and ``bw_ratio`` fields of ``rq.dl``.

    Errors while reading a CPU's data are reported on stdout and do not
    stop the walk over the remaining CPUs.
    """
    # NOTE(review): `prog` is not defined in this file; presumably it is the
    # program object injected by the drgn launcher named in the shebang.

    print("Retrieving dl_rq bandwidth accounting information:")

    runqueues = prog['runqueues']

    for cpu_id in for_each_possible_cpu(prog):
        try:
            rq = per_cpu(runqueues, cpu_id)

            # Honor the "only online CPUs are reported" contract from `desc`.
            if rq.online == 0:
                continue

            dl_rq = rq.dl

            print(f" From CPU: {cpu_id}")

            # Access and print relevant fields from struct dl_rq
            print(f" running_bw : {dl_rq.running_bw}")
            print(f" this_bw : {dl_rq.this_bw}")
            print(f" extra_bw : {dl_rq.extra_bw}")
            print(f" max_bw : {dl_rq.max_bw}")
            print(f" bw_ratio : {dl_rq.bw_ratio}")

        except drgn.FaultError as fe:
            print(f" (CPU {cpu_id}: Fault accessing kernel memory: {fe})")
        except AttributeError as ae:
            # This script only touches rq and dl_rq, so name those structures
            # (the message previously blamed root_domain, which is never read).
            print(f" (CPU {cpu_id}: Missing attribute for rq/dl_rq (kernel struct change?): {ae})")
        except Exception as e:
            print(f" (CPU {cpu_id}: An unexpected error occurred: {e})")

if __name__ == "__main__":
    # No options are defined; the parser exists so that -h/--help prints `desc`.
    parser = argparse.ArgumentParser(description=desc,
                                     formatter_class=argparse.RawTextHelpFormatter)
    args = parser.parse_args()

    print_dl_bws_info()

+68

tools/sched/root_domains_dump.py

#!/usr/bin/env drgn
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2025 Juri Lelli <juri.lelli@redhat.com>
# Copyright (C) 2025 Red Hat, Inc.

desc = """
This is a drgn script to show the current root domains configuration. For more
info on drgn, visit https://github.com/osandov/drgn.

Root domains are only printed once, as multiple CPUs might be attached to the
same root domain.
"""

import os
import argparse

import drgn
from drgn import FaultError
from drgn.helpers.common import *
from drgn.helpers.linux import *

def print_root_domains_info():
    """Print each distinct root domain reachable from the per-CPU runqueues.

    Walks all possible CPUs, reads ``rq.rd`` for each, and prints the span
    and online CPU lists of every root domain the first time it is seen.
    The default root domain is labelled by name rather than by address.

    Errors while reading a CPU's data are reported on stdout and do not
    stop the walk over the remaining CPUs.
    """
    # NOTE(review): `prog` is not defined in this file; presumably it is the
    # program object injected by the drgn launcher named in the shebang.

    # To store unique root domains found
    seen_root_domains = set()

    print("Retrieving (unique) Root Domain Information:")

    runqueues = prog['runqueues']
    def_root_domain = prog['def_root_domain']

    for cpu_id in for_each_possible_cpu(prog):
        try:
            rq = per_cpu(runqueues, cpu_id)

            root_domain = rq.rd

            # Check if we've already processed this root domain to avoid duplicates
            # Use the memory address of the root_domain as a unique identifier
            root_domain_cast = int(root_domain)
            if root_domain_cast in seen_root_domains:
                continue
            seen_root_domains.add(root_domain_cast)

            if root_domain_cast == int(def_root_domain.address_):
                print(f"\n--- Root Domain @ def_root_domain ---")
            else:
                print(f"\n--- Root Domain @ 0x{root_domain_cast:x} ---")

            print(f" From CPU: {cpu_id}") # This CPU belongs to this root domain

            # Access and print relevant fields from struct root_domain
            print(f" Span : {cpumask_to_cpulist(root_domain.span[0])}")
            # Print the online mask here; the span mask was previously printed
            # a second time under the "Online" label.
            print(f" Online : {cpumask_to_cpulist(root_domain.online[0])}")

        except drgn.FaultError as fe:
            print(f" (CPU {cpu_id}: Fault accessing kernel memory: {fe})")
        except AttributeError as ae:
            print(f" (CPU {cpu_id}: Missing attribute for root_domain (kernel struct change?): {ae})")
        except Exception as e:
            print(f" (CPU {cpu_id}: An unexpected error occurred: {e})")

if __name__ == "__main__":
    # No options are defined; the parser exists so that -h/--help prints `desc`.
    parser = argparse.ArgumentParser(description=desc,
                                     formatter_class=argparse.RawTextHelpFormatter)
    args = parser.parse_args()

    print_root_domains_info()

Configure Feed

Configure Feed