Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+16

Documentation/admin-guide/kernel-parameters.txt

··· 4428 4428 incurs a small amount of overhead in the scheduler 4429 4429 but is useful for debugging and performance tuning. 4430 4430 4431 + sched_thermal_decay_shift= 4432 + [KNL, SMP] Set a decay shift for scheduler thermal 4433 + pressure signal. Thermal pressure signal follows the 4434 + default decay period of other scheduler pelt 4435 + signals (usually 32 ms but configurable). Setting 4436 + sched_thermal_decay_shift will left shift the decay 4437 + period for the thermal pressure signal by the shift 4438 + value. 4439 + i.e. with the default pelt decay period of 32 ms 4440 + sched_thermal_decay_shift thermal pressure decay period 4441 + 1 64 ms 4442 + 2 128 ms 4443 + and so on. 4444 + Format: integer between 0 and 10 4445 + Default is 0. 4446 + 4431 4447 skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate 4432 4448 xtime_lock contention on larger systems, and/or RCU lock 4433 4449 contention on all systems with CONFIG_MAXSMP set.

+6 -8

Documentation/robust-futex-ABI.txt

··· 61 61 address of the associated 'lock entry', plus or minus, of what will 62 62 be called the 'lock word', from that 'lock entry'. The 'lock word' 63 63 is always a 32 bit word, unlike the other words above. The 'lock 64 - word' holds 3 flag bits in the upper 3 bits, and the thread id (TID) 65 - of the thread holding the lock in the bottom 29 bits. See further 64 + word' holds 2 flag bits in the upper 2 bits, and the thread id (TID) 65 + of the thread holding the lock in the bottom 30 bits. See further 66 66 below for a description of the flag bits. 67 67 68 68 The third word, called 'list_op_pending', contains a transient copy of ··· 128 128 A given futex lock structure in a user shared memory region may be held 129 129 at different times by any of the threads with access to that region. The 130 130 thread currently holding such a lock, if any, is marked with the thread's 131 - TID in the lower 29 bits of the 'lock word'. 131 + TID in the lower 30 bits of the 'lock word'. 132 132 133 133 When adding or removing a lock from its list of held locks, in order for 134 134 the kernel to correctly handle lock cleanup regardless of when the task ··· 141 141 1) set the 'list_op_pending' word to the address of the 'lock entry' 142 142 to be inserted, 143 143 2) acquire the futex lock, 144 - 3) add the lock entry, with its thread id (TID) in the bottom 29 bits 144 + 3) add the lock entry, with its thread id (TID) in the bottom 30 bits 145 145 of the 'lock word', to the linked list starting at 'head', and 146 146 4) clear the 'list_op_pending' word. 147 147 ··· 155 155 156 156 On exit, the kernel will consider the address stored in 157 157 'list_op_pending' and the address of each 'lock word' found by walking 158 - the list starting at 'head'. For each such address, if the bottom 29 158 + the list starting at 'head'. 
For each such address, if the bottom 30 159 159 bits of the 'lock word' at offset 'offset' from that address equal the 160 160 exiting thread's TID, then the kernel will do two things: 161 161 ··· 180 180 future kernel configuration changes) elements. 181 181 182 182 When the kernel sees a list entry whose 'lock word' doesn't have the 183 - current threads TID in the lower 29 bits, it does nothing with that 183 + current thread's TID in the lower 30 bits, it does nothing with that 184 184 entry, and goes on to the next entry. 185 - 186 - Bit 29 (0x20000000) of the 'lock word' is reserved for future use.

+6

MAINTAINERS

··· 13552 13552 F: include/net/psample.h 13553 13553 F: include/uapi/linux/psample.h 13554 13554 13555 + PRESSURE STALL INFORMATION (PSI) 13556 + M: Johannes Weiner <hannes@cmpxchg.org> 13557 + S: Maintained 13558 + F: kernel/sched/psi.c 13559 + F: include/linux/psi* 13560 + 13555 13561 PSTORE FILESYSTEM 13556 13562 M: Kees Cook <keescook@chromium.org> 13557 13563 M: Anton Vorontsov <anton@enomsg.org>

+3

arch/arm/include/asm/topology.h

··· 16 16 /* Enable topology flag updates */ 17 17 #define arch_update_cpu_topology topology_update_cpu_topology 18 18 19 + /* Replace task scheduler's default thermal pressure retrieve API */ 20 + #define arch_scale_thermal_pressure topology_get_thermal_pressure 21 + 19 22 #else 20 23 21 24 static inline void init_cpu_topology(void) { }

+1

arch/arm64/configs/defconfig

··· 62 62 CONFIG_ARCH_ZYNQMP=y 63 63 CONFIG_ARM64_VA_BITS_48=y 64 64 CONFIG_SCHED_MC=y 65 + CONFIG_SCHED_SMT=y 65 66 CONFIG_NUMA=y 66 67 CONFIG_SECCOMP=y 67 68 CONFIG_KEXEC=y

+3

arch/arm64/include/asm/topology.h

··· 25 25 /* Enable topology flag updates */ 26 26 #define arch_update_cpu_topology topology_update_cpu_topology 27 27 28 + /* Replace task scheduler's default thermal pressure retrieve API */ 29 + #define arch_scale_thermal_pressure topology_get_thermal_pressure 30 + 28 31 #include <asm-generic/topology.h> 29 32 30 33 #endif /* _ASM_ARM_TOPOLOGY_H */

+25

arch/x86/include/asm/topology.h

··· 193 193 } 194 194 #endif /* CONFIG_SCHED_MC_PRIO */ 195 195 196 + #ifdef CONFIG_SMP 197 + #include <asm/cpufeature.h> 198 + 199 + DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); 200 + 201 + #define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key) 202 + 203 + DECLARE_PER_CPU(unsigned long, arch_freq_scale); 204 + 205 + static inline long arch_scale_freq_capacity(int cpu) 206 + { 207 + return per_cpu(arch_freq_scale, cpu); 208 + } 209 + #define arch_scale_freq_capacity arch_scale_freq_capacity 210 + 211 + extern void arch_scale_freq_tick(void); 212 + #define arch_scale_freq_tick arch_scale_freq_tick 213 + 214 + extern void arch_set_max_freq_ratio(bool turbo_disabled); 215 + #else 216 + static inline void arch_set_max_freq_ratio(bool turbo_disabled) 217 + { 218 + } 219 + #endif 220 + 196 221 #endif /* _ASM_X86_TOPOLOGY_H */

+289 -1

arch/x86/kernel/smpboot.c

··· 147 147 *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; 148 148 } 149 149 150 + static void init_freq_invariance(void); 151 + 150 152 /* 151 153 * Report back to the Boot Processor during boot time or to the caller processor 152 154 * during CPU online. ··· 184 182 * calibrate_delay() and notify_cpu_starting(). 185 183 */ 186 184 set_cpu_sibling_map(raw_smp_processor_id()); 185 + 186 + init_freq_invariance(); 187 187 188 188 /* 189 189 * Get our bogomips. ··· 1341 1337 set_sched_topology(x86_topology); 1342 1338 1343 1339 set_cpu_sibling_map(0); 1344 - 1340 + init_freq_invariance(); 1345 1341 smp_sanity_check(); 1346 1342 1347 1343 switch (apic_intr_mode) { ··· 1768 1764 } 1769 1765 1770 1766 #endif 1767 + 1768 + /* 1769 + * APERF/MPERF frequency ratio computation. 1770 + * 1771 + * The scheduler wants to do frequency invariant accounting and needs a <1 1772 + * ratio to account for the 'current' frequency, corresponding to 1773 + * freq_curr / freq_max. 1774 + * 1775 + * Since the frequency freq_curr on x86 is controlled by micro-controller and 1776 + * our P-state setting is little more than a request/hint, we need to observe 1777 + * the effective frequency 'BusyMHz', i.e. the average frequency over a time 1778 + * interval after discarding idle time. This is given by: 1779 + * 1780 + * BusyMHz = delta_APERF / delta_MPERF * freq_base 1781 + * 1782 + * where freq_base is the max non-turbo P-state. 1783 + * 1784 + * The freq_max term has to be set to a somewhat arbitrary value, because we 1785 + * can't know which turbo states will be available at a given point in time: 1786 + * it all depends on the thermal headroom of the entire package. We set it to 1787 + * the turbo level with 4 cores active. 
1788 + * 1789 + * Benchmarks show that's a good compromise between the 1C turbo ratio 1790 + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, 1791 + * which would ignore the entire turbo range (a conspicuous part, making 1792 + * freq_curr/freq_max always maxed out). 1793 + * 1794 + * An exception to the heuristic above is the Atom uarch, where we choose the 1795 + * highest turbo level for freq_max since Atom's are generally oriented towards 1796 + * power efficiency. 1797 + * 1798 + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio 1799 + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. 1800 + */ 1801 + 1802 + DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); 1803 + 1804 + static DEFINE_PER_CPU(u64, arch_prev_aperf); 1805 + static DEFINE_PER_CPU(u64, arch_prev_mperf); 1806 + static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; 1807 + static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; 1808 + 1809 + void arch_set_max_freq_ratio(bool turbo_disabled) 1810 + { 1811 + arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : 1812 + arch_turbo_freq_ratio; 1813 + } 1814 + 1815 + static bool turbo_disabled(void) 1816 + { 1817 + u64 misc_en; 1818 + int err; 1819 + 1820 + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); 1821 + if (err) 1822 + return false; 1823 + 1824 + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); 1825 + } 1826 + 1827 + static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) 1828 + { 1829 + int err; 1830 + 1831 + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); 1832 + if (err) 1833 + return false; 1834 + 1835 + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); 1836 + if (err) 1837 + return false; 1838 + 1839 + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ 1840 + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ 1841 + 1842 + return true; 1843 + } 1844 + 1845 + #include <asm/cpu_device_id.h> 1846 + #include <asm/intel-family.h> 1847 + 1848 + #define ICPU(model) \ 1849 + {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} 1850 + 1851 + static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { 1852 + ICPU(INTEL_FAM6_XEON_PHI_KNL), 1853 + ICPU(INTEL_FAM6_XEON_PHI_KNM), 1854 + {} 1855 + }; 1856 + 1857 + static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { 1858 + ICPU(INTEL_FAM6_SKYLAKE_X), 1859 + {} 1860 + }; 1861 + 1862 + static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { 1863 + ICPU(INTEL_FAM6_ATOM_GOLDMONT), 1864 + ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), 1865 + ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), 1866 + {} 1867 + }; 1868 + 1869 + static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, 1870 + int num_delta_fratio) 1871 + { 1872 + int fratio, delta_fratio, found; 1873 + int err, i; 1874 + u64 msr; 1875 + 1876 + if (!x86_match_cpu(has_knl_turbo_ratio_limits)) 1877 + return false; 1878 + 1879 + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); 1880 + if (err) 1881 + return false; 1882 + 1883 + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ 1884 + 1885 + err = 
rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); 1886 + if (err) 1887 + return false; 1888 + 1889 + fratio = (msr >> 8) & 0xFF; 1890 + i = 16; 1891 + found = 0; 1892 + do { 1893 + if (found >= num_delta_fratio) { 1894 + *turbo_freq = fratio; 1895 + return true; 1896 + } 1897 + 1898 + delta_fratio = (msr >> (i + 5)) & 0x7; 1899 + 1900 + if (delta_fratio) { 1901 + found += 1; 1902 + fratio -= delta_fratio; 1903 + } 1904 + 1905 + i += 8; 1906 + } while (i < 64); 1907 + 1908 + return true; 1909 + } 1910 + 1911 + static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) 1912 + { 1913 + u64 ratios, counts; 1914 + u32 group_size; 1915 + int err, i; 1916 + 1917 + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); 1918 + if (err) 1919 + return false; 1920 + 1921 + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ 1922 + 1923 + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); 1924 + if (err) 1925 + return false; 1926 + 1927 + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); 1928 + if (err) 1929 + return false; 1930 + 1931 + for (i = 0; i < 64; i += 8) { 1932 + group_size = (counts >> i) & 0xFF; 1933 + if (group_size >= size) { 1934 + *turbo_freq = (ratios >> i) & 0xFF; 1935 + return true; 1936 + } 1937 + } 1938 + 1939 + return false; 1940 + } 1941 + 1942 + static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) 1943 + { 1944 + int err; 1945 + 1946 + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); 1947 + if (err) 1948 + return false; 1949 + 1950 + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq); 1951 + if (err) 1952 + return false; 1953 + 1954 + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ 1955 + *turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */ 1956 + 1957 + return true; 1958 + } 1959 + 1960 + static bool intel_set_max_freq_ratio(void) 1961 + { 1962 + u64 base_freq, turbo_freq; 1963 + 1964 + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) 1965 + goto out; 1966 + 1967 + if 
(x86_match_cpu(has_glm_turbo_ratio_limits) && 1968 + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) 1969 + goto out; 1970 + 1971 + if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) 1972 + goto out; 1973 + 1974 + if (x86_match_cpu(has_skx_turbo_ratio_limits) && 1975 + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) 1976 + goto out; 1977 + 1978 + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) 1979 + goto out; 1980 + 1981 + return false; 1982 + 1983 + out: 1984 + arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, 1985 + base_freq); 1986 + arch_set_max_freq_ratio(turbo_disabled()); 1987 + return true; 1988 + } 1989 + 1990 + static void init_counter_refs(void *arg) 1991 + { 1992 + u64 aperf, mperf; 1993 + 1994 + rdmsrl(MSR_IA32_APERF, aperf); 1995 + rdmsrl(MSR_IA32_MPERF, mperf); 1996 + 1997 + this_cpu_write(arch_prev_aperf, aperf); 1998 + this_cpu_write(arch_prev_mperf, mperf); 1999 + } 2000 + 2001 + static void init_freq_invariance(void) 2002 + { 2003 + bool ret = false; 2004 + 2005 + if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) 2006 + return; 2007 + 2008 + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) 2009 + ret = intel_set_max_freq_ratio(); 2010 + 2011 + if (ret) { 2012 + on_each_cpu(init_counter_refs, NULL, 1); 2013 + static_branch_enable(&arch_scale_freq_key); 2014 + } else { 2015 + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); 2016 + } 2017 + } 2018 + 2019 + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; 2020 + 2021 + void arch_scale_freq_tick(void) 2022 + { 2023 + u64 freq_scale; 2024 + u64 aperf, mperf; 2025 + u64 acnt, mcnt; 2026 + 2027 + if (!arch_scale_freq_invariant()) 2028 + return; 2029 + 2030 + rdmsrl(MSR_IA32_APERF, aperf); 2031 + rdmsrl(MSR_IA32_MPERF, mperf); 2032 + 2033 + acnt = aperf - this_cpu_read(arch_prev_aperf); 2034 + mcnt = mperf - this_cpu_read(arch_prev_mperf); 2035 + if (!mcnt) 2036 + return; 2037 + 
2038 + this_cpu_write(arch_prev_aperf, aperf); 2039 + this_cpu_write(arch_prev_mperf, mperf); 2040 + 2041 + acnt <<= 2*SCHED_CAPACITY_SHIFT; 2042 + mcnt *= arch_max_freq_ratio; 2043 + 2044 + freq_scale = div64_u64(acnt, mcnt); 2045 + 2046 + if (freq_scale > SCHED_CAPACITY_SCALE) 2047 + freq_scale = SCHED_CAPACITY_SCALE; 2048 + 2049 + this_cpu_write(arch_freq_scale, freq_scale); 2050 + }

+1

drivers/cpufreq/intel_pstate.c

··· 922 922 */ 923 923 if (global.turbo_disabled_mf != global.turbo_disabled) { 924 924 global.turbo_disabled_mf = global.turbo_disabled; 925 + arch_set_max_freq_ratio(global.turbo_disabled); 925 926 for_each_possible_cpu(cpu) 926 927 intel_pstate_update_max_freq(cpu); 927 928 } else {

+17 -2

drivers/thermal/cpufreq_cooling.c

··· 431 431 unsigned long state) 432 432 { 433 433 struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; 434 + struct cpumask *cpus; 435 + unsigned int frequency; 436 + unsigned long max_capacity, capacity; 437 + int ret; 434 438 435 439 /* Request state should be less than max_level */ 436 440 if (WARN_ON(state > cpufreq_cdev->max_level)) ··· 446 442 447 443 cpufreq_cdev->cpufreq_state = state; 448 444 449 - return freq_qos_update_request(&cpufreq_cdev->qos_req, 450 - get_state_freq(cpufreq_cdev, state)); 445 + frequency = get_state_freq(cpufreq_cdev, state); 446 + 447 + ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency); 448 + 449 + if (ret > 0) { 450 + cpus = cpufreq_cdev->policy->cpus; 451 + max_capacity = arch_scale_cpu_capacity(cpumask_first(cpus)); 452 + capacity = frequency * max_capacity; 453 + capacity /= cpufreq_cdev->policy->cpuinfo.max_freq; 454 + arch_set_thermal_pressure(cpus, max_capacity - capacity); 455 + } 456 + 457 + return ret; 451 458 } 452 459 453 460 /* Bind cpufreq callbacks to thermal cooling device ops */

+10

include/linux/arch_topology.h

··· 30 30 return per_cpu(freq_scale, cpu); 31 31 } 32 32 33 + DECLARE_PER_CPU(unsigned long, thermal_pressure); 34 + 35 + static inline unsigned long topology_get_thermal_pressure(int cpu) 36 + { 37 + return per_cpu(thermal_pressure, cpu); 38 + } 39 + 40 + void arch_set_thermal_pressure(struct cpumask *cpus, 41 + unsigned long th_pressure); 42 + 33 43 struct cpu_topology { 34 44 int thread_id; 35 45 int core_id;

+7

include/linux/cpumask.h

··· 194 194 return 0; 195 195 } 196 196 197 + static inline int cpumask_any_and_distribute(const struct cpumask *src1p, 198 + const struct cpumask *src2p) { 199 + return cpumask_next_and(-1, src1p, src2p); 200 + } 201 + 197 202 #define for_each_cpu(cpu, mask) \ 198 203 for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) 199 204 #define for_each_cpu_not(cpu, mask) \ ··· 250 245 int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); 251 246 int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); 252 247 unsigned int cpumask_local_spread(unsigned int i, int node); 248 + int cpumask_any_and_distribute(const struct cpumask *src1p, 249 + const struct cpumask *src2p); 253 250 254 251 /** 255 252 * for_each_cpu - iterate over every cpu in a mask

+7

include/linux/kernel.h

··· 257 257 258 258 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) 259 259 260 + #ifndef CONFIG_PREEMPT_RT 261 + # define cant_migrate() cant_sleep() 262 + #else 263 + /* Placeholder for now */ 264 + # define cant_migrate() do { } while (0) 265 + #endif 266 + 260 267 /** 261 268 * abs - return absolute value of an argument 262 269 * @x: the value. If it is unsigned type, it is converted to signed type first.

+30

include/linux/preempt.h

··· 322 322 323 323 #endif 324 324 325 + /** 326 + * migrate_disable - Prevent migration of the current task 327 + * 328 + * Maps to preempt_disable() which also disables preemption. Use 329 + * migrate_disable() to annotate that the intent is to prevent migration, 330 + * but not necessarily preemption. 331 + * 332 + * Can be invoked nested like preempt_disable() and needs the corresponding 333 + * number of migrate_enable() invocations. 334 + */ 335 + static __always_inline void migrate_disable(void) 336 + { 337 + preempt_disable(); 338 + } 339 + 340 + /** 341 + * migrate_enable - Allow migration of the current task 342 + * 343 + * Counterpart to migrate_disable(). 344 + * 345 + * As migrate_disable() can be invoked nested, only the outermost invocation 346 + * reenables migration. 347 + * 348 + * Currently mapped to preempt_enable(). 349 + */ 350 + static __always_inline void migrate_enable(void) 351 + { 352 + preempt_enable(); 353 + } 354 + 325 355 #endif /* __LINUX_PREEMPT_H */

+2

include/linux/psi.h

··· 17 17 void psi_init(void); 18 18 19 19 void psi_task_change(struct task_struct *task, int clear, int set); 20 + void psi_task_switch(struct task_struct *prev, struct task_struct *next, 21 + bool sleep); 20 22 21 23 void psi_memstall_tick(struct task_struct *task, int cpu); 22 24 void psi_memstall_enter(unsigned long *flags);

+9 -1

include/linux/psi_types.h

··· 14 14 NR_IOWAIT, 15 15 NR_MEMSTALL, 16 16 NR_RUNNING, 17 - NR_PSI_TASK_COUNTS = 3, 17 + /* 18 + * This can't have values other than 0 or 1 and could be 19 + * implemented as a bit flag. But for now we still have room 20 + * in the first cacheline of psi_group_cpu, and this way we 21 + * don't have to special case any state tracking for it. 22 + */ 23 + NR_ONCPU, 24 + NR_PSI_TASK_COUNTS = 4, 18 25 }; 19 26 20 27 /* Task state bitmasks */ 21 28 #define TSK_IOWAIT (1 << NR_IOWAIT) 22 29 #define TSK_MEMSTALL (1 << NR_MEMSTALL) 23 30 #define TSK_RUNNING (1 << NR_RUNNING) 31 + #define TSK_ONCPU (1 << NR_ONCPU) 24 32 25 33 /* Resources that workloads could be stalled on */ 26 34 enum psi_res {

+21 -16

include/linux/sched.h

··· 356 356 } __attribute__((__aligned__(sizeof(u64)))); 357 357 358 358 /* 359 - * The load_avg/util_avg accumulates an infinite geometric series 360 - * (see __update_load_avg() in kernel/sched/fair.c). 359 + * The load/runnable/util_avg accumulates an infinite geometric series 360 + * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). 361 361 * 362 362 * [load_avg definition] 363 363 * 364 364 * load_avg = runnable% * scale_load_down(load) 365 365 * 366 - * where runnable% is the time ratio that a sched_entity is runnable. 367 - * For cfs_rq, it is the aggregated load_avg of all runnable and 368 - * blocked sched_entities. 366 + * [runnable_avg definition] 367 + * 368 + * runnable_avg = runnable% * SCHED_CAPACITY_SCALE 369 369 * 370 370 * [util_avg definition] 371 371 * 372 372 * util_avg = running% * SCHED_CAPACITY_SCALE 373 373 * 374 - * where running% is the time ratio that a sched_entity is running on 375 - * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable 376 - * and blocked sched_entities. 374 + * where runnable% is the time ratio that a sched_entity is runnable and 375 + * running% the time ratio that a sched_entity is running. 377 376 * 378 - * load_avg and util_avg don't direcly factor frequency scaling and CPU 379 - * capacity scaling. The scaling is done through the rq_clock_pelt that 380 - * is used for computing those signals (see update_rq_clock_pelt()) 377 + * For cfs_rq, they are the aggregated values of all runnable and blocked 378 + * sched_entities. 379 + * 380 + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU 381 + * capacity scaling. The scaling is done through the rq_clock_pelt that is used 382 + * for computing those signals (see update_rq_clock_pelt()) 381 383 * 382 384 * N.B., the above ratios (runnable% and running%) themselves are in the 383 385 * range of [0, 1]. 
To do fixed point arithmetics, we therefore scale them ··· 403 401 struct sched_avg { 404 402 u64 last_update_time; 405 403 u64 load_sum; 406 - u64 runnable_load_sum; 404 + u64 runnable_sum; 407 405 u32 util_sum; 408 406 u32 period_contrib; 409 407 unsigned long load_avg; 410 - unsigned long runnable_load_avg; 408 + unsigned long runnable_avg; 411 409 unsigned long util_avg; 412 410 struct util_est util_est; 413 411 } ____cacheline_aligned; ··· 451 449 struct sched_entity { 452 450 /* For load-balancing: */ 453 451 struct load_weight load; 454 - unsigned long runnable_weight; 455 452 struct rb_node run_node; 456 453 struct list_head group_node; 457 454 unsigned int on_rq; ··· 471 470 struct cfs_rq *cfs_rq; 472 471 /* rq "owned" by this entity/group: */ 473 472 struct cfs_rq *my_q; 473 + /* cached value of my_q->h_nr_running */ 474 + unsigned long runnable_weight; 474 475 #endif 475 476 476 477 #ifdef CONFIG_SMP ··· 785 782 unsigned frozen:1; 786 783 #endif 787 784 #ifdef CONFIG_BLK_CGROUP 788 - /* to be used once the psi infrastructure lands upstream. */ 789 785 unsigned use_memdelay:1; 786 + #endif 787 + #ifdef CONFIG_PSI 788 + /* Stalled due to lack of memory */ 789 + unsigned in_memstall:1; 790 790 #endif 791 791 792 792 unsigned long atomic_flags; /* Flags requiring atomic access. */ ··· 1485 1479 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1486 1480 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1487 1481 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1488 - #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ 1489 1482 #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1490 1483 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1491 1484 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */

+8

include/linux/sched/topology.h

··· 225 225 } 226 226 #endif 227 227 228 + #ifndef arch_scale_thermal_pressure 229 + static __always_inline 230 + unsigned long arch_scale_thermal_pressure(int cpu) 231 + { 232 + return 0; 233 + } 234 + #endif 235 + 228 236 static inline int task_node(const struct task_struct *p) 229 237 { 230 238 return cpu_to_node(task_cpu(p));

+1 -1

include/linux/threads.h

··· 29 29 30 30 /* 31 31 * A maximum of 4 million PIDs should be enough for a while. 32 - * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.] 32 + * [NOTE: PID/TIDs are limited to 2^30 ~= 1 billion, see FUTEX_TID_MASK.] 33 33 */ 34 34 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \ 35 35 (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))

+31 -22

include/trace/events/sched.h

··· 487 487 ); 488 488 #endif /* CONFIG_DETECT_HUNG_TASK */ 489 489 490 - DECLARE_EVENT_CLASS(sched_move_task_template, 490 + /* 491 + * Tracks migration of tasks from one runqueue to another. Can be used to 492 + * detect if automatic NUMA balancing is bouncing between nodes. 493 + */ 494 + TRACE_EVENT(sched_move_numa, 491 495 492 496 TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 493 497 ··· 523 519 __entry->dst_cpu, __entry->dst_nid) 524 520 ); 525 521 526 - /* 527 - * Tracks migration of tasks from one runqueue to another. Can be used to 528 - * detect if automatic NUMA balancing is bouncing between nodes 529 - */ 530 - DEFINE_EVENT(sched_move_task_template, sched_move_numa, 531 - TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 532 - 533 - TP_ARGS(tsk, src_cpu, dst_cpu) 534 - ); 535 - 536 - DEFINE_EVENT(sched_move_task_template, sched_stick_numa, 537 - TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), 538 - 539 - TP_ARGS(tsk, src_cpu, dst_cpu) 540 - ); 541 - 542 - TRACE_EVENT(sched_swap_numa, 522 + DECLARE_EVENT_CLASS(sched_numa_pair_template, 543 523 544 524 TP_PROTO(struct task_struct *src_tsk, int src_cpu, 545 525 struct task_struct *dst_tsk, int dst_cpu), ··· 549 561 __entry->src_ngid = task_numa_group_id(src_tsk); 550 562 __entry->src_cpu = src_cpu; 551 563 __entry->src_nid = cpu_to_node(src_cpu); 552 - __entry->dst_pid = task_pid_nr(dst_tsk); 553 - __entry->dst_tgid = task_tgid_nr(dst_tsk); 554 - __entry->dst_ngid = task_numa_group_id(dst_tsk); 564 + __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; 565 + __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; 566 + __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; 555 567 __entry->dst_cpu = dst_cpu; 556 - __entry->dst_nid = cpu_to_node(dst_cpu); 568 + __entry->dst_nid = dst_cpu >= 0 ? 
cpu_to_node(dst_cpu) : -1; 557 569 ), 558 570 559 571 TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", ··· 562 574 __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, 563 575 __entry->dst_cpu, __entry->dst_nid) 564 576 ); 577 + 578 + DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, 579 + 580 + TP_PROTO(struct task_struct *src_tsk, int src_cpu, 581 + struct task_struct *dst_tsk, int dst_cpu), 582 + 583 + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) 584 + ); 585 + 586 + DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, 587 + 588 + TP_PROTO(struct task_struct *src_tsk, int src_cpu, 589 + struct task_struct *dst_tsk, int dst_cpu), 590 + 591 + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) 592 + ); 593 + 565 594 566 595 /* 567 596 * Tracepoint for waking a polling cpu without an IPI. ··· 615 610 TP_ARGS(rq)); 616 611 617 612 DECLARE_TRACE(pelt_dl_tp, 613 + TP_PROTO(struct rq *rq), 614 + TP_ARGS(rq)); 615 + 616 + DECLARE_TRACE(pelt_thermal_tp, 618 617 TP_PROTO(struct rq *rq), 619 618 TP_ARGS(rq)); 620 619

+4

init/Kconfig

··· 451 451 depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING 452 452 depends on SMP 453 453 454 + config SCHED_THERMAL_PRESSURE 455 + bool "Enable periodic averaging of thermal pressure" 456 + depends on SMP 457 + 454 458 config BSD_PROCESS_ACCT 455 459 bool "BSD Process Accounting" 456 460 depends on MULTIUSER

+15 -2

kernel/kthread.c

··· 199 199 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) 200 200 break; 201 201 202 + /* 203 + * Thread is going to call schedule(), do not preempt it, 204 + * or the caller of kthread_park() may spend more time in 205 + * wait_task_inactive(). 206 + */ 207 + preempt_disable(); 202 208 complete(&self->parked); 203 - schedule(); 209 + schedule_preempt_disabled(); 210 + preempt_enable(); 204 211 } 205 212 __set_current_state(TASK_RUNNING); 206 213 } ··· 252 245 /* OK, tell user we're spawned, wait for stop or wakeup */ 253 246 __set_current_state(TASK_UNINTERRUPTIBLE); 254 247 create->result = current; 248 + /* 249 + * Thread is going to call schedule(), do not preempt it, 250 + * or the creator may spend more time in wait_task_inactive(). 251 + */ 252 + preempt_disable(); 255 253 complete(done); 256 - schedule(); 254 + schedule_preempt_disabled(); 255 + preempt_enable(); 257 256 258 257 ret = -EINTR; 259 258 if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {

+23 -4

kernel/sched/core.c

··· 761 761 if (task_has_idle_policy(p)) { 762 762 load->weight = scale_load(WEIGHT_IDLEPRIO); 763 763 load->inv_weight = WMULT_IDLEPRIO; 764 - p->se.runnable_weight = load->weight; 765 764 return; 766 765 } 767 766 ··· 773 774 } else { 774 775 load->weight = scale_load(sched_prio_to_weight[prio]); 775 776 load->inv_weight = sched_prio_to_wmult[prio]; 776 - p->se.runnable_weight = load->weight; 777 777 } 778 778 } 779 779 ··· 1650 1652 if (cpumask_equal(p->cpus_ptr, new_mask)) 1651 1653 goto out; 1652 1654 1653 - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); 1655 + /* 1656 + * Picking a ~random cpu helps in cases where we are changing affinity 1657 + * for groups of tasks (ie. cpuset), so that load balancing is not 1658 + * immediately required to distribute the tasks within their new mask. 1659 + */ 1660 + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); 1654 1661 if (dest_cpu >= nr_cpu_ids) { 1655 1662 ret = -EINVAL; 1656 1663 goto out; ··· 3581 3578 return ns; 3582 3579 } 3583 3580 3581 + DEFINE_PER_CPU(unsigned long, thermal_pressure); 3582 + 3583 + void arch_set_thermal_pressure(struct cpumask *cpus, 3584 + unsigned long th_pressure) 3585 + { 3586 + int cpu; 3587 + 3588 + for_each_cpu(cpu, cpus) 3589 + WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); 3590 + } 3591 + 3584 3592 /* 3585 3593 * This function gets called by the timer code, with HZ frequency. 3586 3594 * We call it with interrupts disabled. 
··· 3602 3588 struct rq *rq = cpu_rq(cpu); 3603 3589 struct task_struct *curr = rq->curr; 3604 3590 struct rq_flags rf; 3591 + unsigned long thermal_pressure; 3605 3592 3593 + arch_scale_freq_tick(); 3606 3594 sched_clock_tick(); 3607 3595 3608 3596 rq_lock(rq, &rf); 3609 3597 3610 3598 update_rq_clock(rq); 3599 + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 3600 + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); 3611 3601 curr->sched_class->task_tick(rq, curr, 0); 3612 3602 calc_global_load_tick(rq); 3613 3603 psi_task_tick(rq); ··· 3689 3671 if (cpu_is_offline(cpu)) 3690 3672 goto out_unlock; 3691 3673 3692 - curr = rq->curr; 3693 3674 update_rq_clock(rq); 3694 3675 3695 3676 if (!is_idle_task(curr)) { ··· 4090 4073 * is a RELEASE barrier), 4091 4074 */ 4092 4075 ++*switch_count; 4076 + 4077 + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 4093 4078 4094 4079 trace_sched_switch(preempt, prev, next); 4095 4080

+96 -62

kernel/sched/cpupri.c

··· 41 41 return cpupri; 42 42 } 43 43 44 + static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, 45 + struct cpumask *lowest_mask, int idx) 46 + { 47 + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 48 + int skip = 0; 49 + 50 + if (!atomic_read(&(vec)->count)) 51 + skip = 1; 52 + /* 53 + * When looking at the vector, we need to read the counter, 54 + * do a memory barrier, then read the mask. 55 + * 56 + * Note: This is still all racey, but we can deal with it. 57 + * Ideally, we only want to look at masks that are set. 58 + * 59 + * If a mask is not set, then the only thing wrong is that we 60 + * did a little more work than necessary. 61 + * 62 + * If we read a zero count but the mask is set, because of the 63 + * memory barriers, that can only happen when the highest prio 64 + * task for a run queue has left the run queue, in which case, 65 + * it will be followed by a pull. If the task we are processing 66 + * fails to find a proper place to go, that pull request will 67 + * pull this task if the run queue is running at a lower 68 + * priority. 69 + */ 70 + smp_rmb(); 71 + 72 + /* Need to do the rmb for every iteration */ 73 + if (skip) 74 + return 0; 75 + 76 + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) 77 + return 0; 78 + 79 + if (lowest_mask) { 80 + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); 81 + 82 + /* 83 + * We have to ensure that we have at least one bit 84 + * still set in the array, since the map could have 85 + * been concurrently emptied between the first and 86 + * second reads of vec->mask. If we hit this 87 + * condition, simply act as though we never hit this 88 + * priority level and continue on. 
89 + */ 90 + if (cpumask_empty(lowest_mask)) 91 + return 0; 92 + } 93 + 94 + return 1; 95 + } 96 + 97 + int cpupri_find(struct cpupri *cp, struct task_struct *p, 98 + struct cpumask *lowest_mask) 99 + { 100 + return cpupri_find_fitness(cp, p, lowest_mask, NULL); 101 + } 102 + 44 103 /** 45 - * cpupri_find - find the best (lowest-pri) CPU in the system 104 + * cpupri_find_fitness - find the best (lowest-pri) CPU in the system 46 105 * @cp: The cpupri context 47 106 * @p: The task 48 107 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) ··· 117 58 * 118 59 * Return: (int)bool - CPUs were found 119 60 */ 120 - int cpupri_find(struct cpupri *cp, struct task_struct *p, 61 + int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, 121 62 struct cpumask *lowest_mask, 122 63 bool (*fitness_fn)(struct task_struct *p, int cpu)) 123 64 { 124 - int idx = 0; 125 65 int task_pri = convert_prio(p->prio); 66 + int idx, cpu; 126 67 127 68 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); 128 69 129 70 for (idx = 0; idx < task_pri; idx++) { 130 - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 131 - int skip = 0; 132 71 133 - if (!atomic_read(&(vec)->count)) 134 - skip = 1; 135 - /* 136 - * When looking at the vector, we need to read the counter, 137 - * do a memory barrier, then read the mask. 138 - * 139 - * Note: This is still all racey, but we can deal with it. 140 - * Ideally, we only want to look at masks that are set. 141 - * 142 - * If a mask is not set, then the only thing wrong is that we 143 - * did a little more work than necessary. 144 - * 145 - * If we read a zero count but the mask is set, because of the 146 - * memory barriers, that can only happen when the highest prio 147 - * task for a run queue has left the run queue, in which case, 148 - * it will be followed by a pull. If the task we are processing 149 - * fails to find a proper place to go, that pull request will 150 - * pull this task if the run queue is running at a lower 151 - * priority. 
152 - */ 153 - smp_rmb(); 154 - 155 - /* Need to do the rmb for every iteration */ 156 - if (skip) 72 + if (!__cpupri_find(cp, p, lowest_mask, idx)) 157 73 continue; 158 74 159 - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) 160 - continue; 75 + if (!lowest_mask || !fitness_fn) 76 + return 1; 161 77 162 - if (lowest_mask) { 163 - int cpu; 164 - 165 - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); 166 - 167 - /* 168 - * We have to ensure that we have at least one bit 169 - * still set in the array, since the map could have 170 - * been concurrently emptied between the first and 171 - * second reads of vec->mask. If we hit this 172 - * condition, simply act as though we never hit this 173 - * priority level and continue on. 174 - */ 175 - if (cpumask_empty(lowest_mask)) 176 - continue; 177 - 178 - if (!fitness_fn) 179 - return 1; 180 - 181 - /* Ensure the capacity of the CPUs fit the task */ 182 - for_each_cpu(cpu, lowest_mask) { 183 - if (!fitness_fn(p, cpu)) 184 - cpumask_clear_cpu(cpu, lowest_mask); 185 - } 186 - 187 - /* 188 - * If no CPU at the current priority can fit the task 189 - * continue looking 190 - */ 191 - if (cpumask_empty(lowest_mask)) 192 - continue; 78 + /* Ensure the capacity of the CPUs fit the task */ 79 + for_each_cpu(cpu, lowest_mask) { 80 + if (!fitness_fn(p, cpu)) 81 + cpumask_clear_cpu(cpu, lowest_mask); 193 82 } 83 + 84 + /* 85 + * If no CPU at the current priority can fit the task 86 + * continue looking 87 + */ 88 + if (cpumask_empty(lowest_mask)) 89 + continue; 194 90 195 91 return 1; 196 92 } 93 + 94 + /* 95 + * If we failed to find a fitting lowest_mask, kick off a new search 96 + * but without taking into account any fitness criteria this time. 97 + * 98 + * This rule favours honouring priority over fitting the task in the 99 + * correct CPU (Capacity Awareness being the only user now). 100 + * The idea is that if a higher priority task can run, then it should 101 + * run even if this ends up being on unfitting CPU. 
102 + * 103 + * The cost of this trade-off is not entirely clear and will probably 104 + * be good for some workloads and bad for others. 105 + * 106 + * The main idea here is that if some CPUs were overcommitted, we try 107 + * to spread which is what the scheduler traditionally did. Sys admins 108 + * must do proper RT planning to avoid overloading the system if they 109 + * really care. 110 + */ 111 + if (fitness_fn) 112 + return cpupri_find(cp, p, lowest_mask); 197 113 198 114 return 0; 199 115 }

+4 -2

kernel/sched/cpupri.h

··· 19 19 20 20 #ifdef CONFIG_SMP 21 21 int cpupri_find(struct cpupri *cp, struct task_struct *p, 22 - struct cpumask *lowest_mask, 23 - bool (*fitness_fn)(struct task_struct *p, int cpu)); 22 + struct cpumask *lowest_mask); 23 + int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, 24 + struct cpumask *lowest_mask, 25 + bool (*fitness_fn)(struct task_struct *p, int cpu)); 24 26 void cpupri_set(struct cpupri *cp, int cpu, int pri); 25 27 int cpupri_init(struct cpupri *cp); 26 28 void cpupri_cleanup(struct cpupri *cp);

+22 -19

kernel/sched/cputime.c

··· 909 909 } while (read_seqcount_retry(&vtime->seqcount, seq)); 910 910 } 911 911 912 - static int vtime_state_check(struct vtime *vtime, int cpu) 912 + static int vtime_state_fetch(struct vtime *vtime, int cpu) 913 913 { 914 + int state = READ_ONCE(vtime->state); 915 + 914 916 /* 915 917 * We raced against a context switch, fetch the 916 918 * kcpustat task again. ··· 929 927 * 930 928 * Case 1) is ok but 2) is not. So wait for a safe VTIME state. 931 929 */ 932 - if (vtime->state == VTIME_INACTIVE) 930 + if (state == VTIME_INACTIVE) 933 931 return -EAGAIN; 934 932 935 - return 0; 933 + return state; 936 934 } 937 935 938 936 static u64 kcpustat_user_vtime(struct vtime *vtime) ··· 951 949 { 952 950 struct vtime *vtime = &tsk->vtime; 953 951 unsigned int seq; 954 - int err; 955 952 956 953 do { 954 + int state; 955 + 957 956 seq = read_seqcount_begin(&vtime->seqcount); 958 957 959 - err = vtime_state_check(vtime, cpu); 960 - if (err < 0) 961 - return err; 958 + state = vtime_state_fetch(vtime, cpu); 959 + if (state < 0) 960 + return state; 962 961 963 962 *val = cpustat[usage]; 964 963 ··· 972 969 */ 973 970 switch (usage) { 974 971 case CPUTIME_SYSTEM: 975 - if (vtime->state == VTIME_SYS) 972 + if (state == VTIME_SYS) 976 973 *val += vtime->stime + vtime_delta(vtime); 977 974 break; 978 975 case CPUTIME_USER: ··· 984 981 *val += kcpustat_user_vtime(vtime); 985 982 break; 986 983 case CPUTIME_GUEST: 987 - if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) 984 + if (state == VTIME_GUEST && task_nice(tsk) <= 0) 988 985 *val += vtime->gtime + vtime_delta(vtime); 989 986 break; 990 987 case CPUTIME_GUEST_NICE: 991 - if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) 988 + if (state == VTIME_GUEST && task_nice(tsk) > 0) 992 989 *val += vtime->gtime + vtime_delta(vtime); 993 990 break; 994 991 default: ··· 1039 1036 { 1040 1037 struct vtime *vtime = &tsk->vtime; 1041 1038 unsigned int seq; 1042 - int err; 1043 1039 1044 1040 do { 1045 1041 u64 *cpustat; 1046 
1042 u64 delta; 1043 + int state; 1047 1044 1048 1045 seq = read_seqcount_begin(&vtime->seqcount); 1049 1046 1050 - err = vtime_state_check(vtime, cpu); 1051 - if (err < 0) 1052 - return err; 1047 + state = vtime_state_fetch(vtime, cpu); 1048 + if (state < 0) 1049 + return state; 1053 1050 1054 1051 *dst = *src; 1055 1052 cpustat = dst->cpustat; 1056 1053 1057 1054 /* Task is sleeping, dead or idle, nothing to add */ 1058 - if (vtime->state < VTIME_SYS) 1055 + if (state < VTIME_SYS) 1059 1056 continue; 1060 1057 1061 1058 delta = vtime_delta(vtime); ··· 1064 1061 * Task runs either in user (including guest) or kernel space, 1065 1062 * add pending nohz time to the right place. 1066 1063 */ 1067 - if (vtime->state == VTIME_SYS) { 1064 + if (state == VTIME_SYS) { 1068 1065 cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; 1069 - } else if (vtime->state == VTIME_USER) { 1066 + } else if (state == VTIME_USER) { 1070 1067 if (task_nice(tsk) > 0) 1071 1068 cpustat[CPUTIME_NICE] += vtime->utime + delta; 1072 1069 else 1073 1070 cpustat[CPUTIME_USER] += vtime->utime + delta; 1074 1071 } else { 1075 - WARN_ON_ONCE(vtime->state != VTIME_GUEST); 1072 + WARN_ON_ONCE(state != VTIME_GUEST); 1076 1073 if (task_nice(tsk) > 0) { 1077 1074 cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; 1078 1075 cpustat[CPUTIME_NICE] += vtime->gtime + delta; ··· 1083 1080 } 1084 1081 } while (read_seqcount_retry(&vtime->seqcount, seq)); 1085 1082 1086 - return err; 1083 + return 0; 1087 1084 } 1088 1085 1089 1086 void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)

+4 -2

kernel/sched/deadline.c

··· 153 153 __sub_running_bw(dl_se->dl_bw, dl_rq); 154 154 } 155 155 156 - void dl_change_utilization(struct task_struct *p, u64 new_bw) 156 + static void dl_change_utilization(struct task_struct *p, u64 new_bw) 157 157 { 158 158 struct rq *rq; 159 159 ··· 333 333 334 334 return dl_rq->root.rb_leftmost == &dl_se->rb_node; 335 335 } 336 + 337 + static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 336 338 337 339 void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) 338 340 { ··· 2498 2496 return ret; 2499 2497 } 2500 2498 2501 - void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) 2499 + static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) 2502 2500 { 2503 2501 if (global_rt_runtime() == RUNTIME_INF) { 2504 2502 dl_rq->bw_ratio = 1 << RATIO_SHIFT;

+7 -10

kernel/sched/debug.c

··· 402 402 } 403 403 404 404 P(se->load.weight); 405 - P(se->runnable_weight); 406 405 #ifdef CONFIG_SMP 407 406 P(se->avg.load_avg); 408 407 P(se->avg.util_avg); 409 - P(se->avg.runnable_load_avg); 408 + P(se->avg.runnable_avg); 410 409 #endif 411 410 412 411 #undef PN_SCHEDSTAT ··· 523 524 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 524 525 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 525 526 #ifdef CONFIG_SMP 526 - SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); 527 527 SEQ_printf(m, " .%-30s: %lu\n", "load_avg", 528 528 cfs_rq->avg.load_avg); 529 - SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", 530 - cfs_rq->avg.runnable_load_avg); 529 + SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", 530 + cfs_rq->avg.runnable_avg); 531 531 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 532 532 cfs_rq->avg.util_avg); 533 533 SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", ··· 535 537 cfs_rq->removed.load_avg); 536 538 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", 537 539 cfs_rq->removed.util_avg); 538 - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", 539 - cfs_rq->removed.runnable_sum); 540 + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg", 541 + cfs_rq->removed.runnable_avg); 540 542 #ifdef CONFIG_FAIR_GROUP_SCHED 541 543 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", 542 544 cfs_rq->tg_load_avg_contrib); ··· 945 947 "nr_involuntary_switches", (long long)p->nivcsw); 946 948 947 949 P(se.load.weight); 948 - P(se.runnable_weight); 949 950 #ifdef CONFIG_SMP 950 951 P(se.avg.load_sum); 951 - P(se.avg.runnable_load_sum); 952 + P(se.avg.runnable_sum); 952 953 P(se.avg.util_sum); 953 954 P(se.avg.load_avg); 954 - P(se.avg.runnable_load_avg); 955 + P(se.avg.runnable_avg); 955 956 P(se.avg.util_avg); 956 957 P(se.avg.last_update_time); 957 958 P(se.avg.util_est.ewma);

+544 -249

kernel/sched/fair.c

··· 86 86 87 87 const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 88 88 89 + int sched_thermal_decay_shift; 90 + static int __init setup_sched_thermal_decay_shift(char *str) 91 + { 92 + int _shift = 0; 93 + 94 + if (kstrtoint(str, 0, &_shift)) 95 + pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); 96 + 97 + sched_thermal_decay_shift = clamp(_shift, 0, 10); 98 + return 1; 99 + } 100 + __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); 101 + 89 102 #ifdef CONFIG_SMP 90 103 /* 91 104 * For asym packing, by default the lower numbered CPU has higher priority. ··· 754 741 * nothing has been attached to the task group yet. 755 742 */ 756 743 if (entity_is_task(se)) 757 - sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); 758 - 759 - se->runnable_weight = se->load.weight; 744 + sa->load_avg = scale_load_down(se->load.weight); 760 745 761 746 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 762 747 } ··· 806 795 sa->util_avg = cap; 807 796 } 808 797 } 798 + 799 + sa->runnable_avg = cpu_scale; 809 800 810 801 if (p->sched_class != &fair_sched_class) { 811 802 /* ··· 1486 1473 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1487 1474 } 1488 1475 1489 - static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); 1490 - 1491 - static unsigned long cpu_runnable_load(struct rq *rq) 1492 - { 1493 - return cfs_rq_runnable_load_avg(&rq->cfs); 1494 - } 1476 + /* 1477 + * 'numa_type' describes the node at the moment of load balancing. 1478 + */ 1479 + enum numa_type { 1480 + /* The node has spare capacity that can be used to run more tasks. */ 1481 + node_has_spare = 0, 1482 + /* 1483 + * The node is fully used and the tasks don't compete for more CPU 1484 + * cycles. Nevertheless, some tasks might wait before running. 
1485 + */ 1486 + node_fully_busy, 1487 + /* 1488 + * The node is overloaded and can't provide expected CPU cycles to all 1489 + * tasks. 1490 + */ 1491 + node_overloaded 1492 + }; 1495 1493 1496 1494 /* Cached statistics for all CPUs within a node */ 1497 1495 struct numa_stats { 1498 1496 unsigned long load; 1499 - 1497 + unsigned long util; 1500 1498 /* Total compute capacity of CPUs on a node */ 1501 1499 unsigned long compute_capacity; 1500 + unsigned int nr_running; 1501 + unsigned int weight; 1502 + enum numa_type node_type; 1503 + int idle_cpu; 1502 1504 }; 1503 1505 1504 - /* 1505 - * XXX borrowed from update_sg_lb_stats 1506 - */ 1507 - static void update_numa_stats(struct numa_stats *ns, int nid) 1506 + static inline bool is_core_idle(int cpu) 1508 1507 { 1509 - int cpu; 1508 + #ifdef CONFIG_SCHED_SMT 1509 + int sibling; 1510 1510 1511 - memset(ns, 0, sizeof(*ns)); 1512 - for_each_cpu(cpu, cpumask_of_node(nid)) { 1513 - struct rq *rq = cpu_rq(cpu); 1511 + for_each_cpu(sibling, cpu_smt_mask(cpu)) { 1512 + if (cpu == sibling) 1513 + continue; 1514 1514 1515 - ns->load += cpu_runnable_load(rq); 1516 - ns->compute_capacity += capacity_of(cpu); 1515 + if (!idle_cpu(sibling)) 1516 + return false; 1517 1517 } 1518 + #endif 1518 1519 1520 + return true; 1519 1521 } 1520 1522 1521 1523 struct task_numa_env { ··· 1549 1521 int best_cpu; 1550 1522 }; 1551 1523 1524 + static unsigned long cpu_load(struct rq *rq); 1525 + static unsigned long cpu_util(int cpu); 1526 + static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); 1527 + 1528 + static inline enum 1529 + numa_type numa_classify(unsigned int imbalance_pct, 1530 + struct numa_stats *ns) 1531 + { 1532 + if ((ns->nr_running > ns->weight) && 1533 + ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) 1534 + return node_overloaded; 1535 + 1536 + if ((ns->nr_running < ns->weight) || 1537 + ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) 1538 + return node_has_spare; 1539 + 1540 +
return node_fully_busy; 1541 + } 1542 + 1543 + #ifdef CONFIG_SCHED_SMT 1544 + /* Forward declarations of select_idle_sibling helpers */ 1545 + static inline bool test_idle_cores(int cpu, bool def); 1546 + static inline int numa_idle_core(int idle_core, int cpu) 1547 + { 1548 + if (!static_branch_likely(&sched_smt_present) || 1549 + idle_core >= 0 || !test_idle_cores(cpu, false)) 1550 + return idle_core; 1551 + 1552 + /* 1553 + * Prefer cores instead of packing HT siblings 1554 + * and triggering future load balancing. 1555 + */ 1556 + if (is_core_idle(cpu)) 1557 + idle_core = cpu; 1558 + 1559 + return idle_core; 1560 + } 1561 + #else 1562 + static inline int numa_idle_core(int idle_core, int cpu) 1563 + { 1564 + return idle_core; 1565 + } 1566 + #endif 1567 + 1568 + /* 1569 + * Gather all necessary information to make NUMA balancing placement 1570 + * decisions that are compatible with standard load balancer. This 1571 + * borrows code and logic from update_sg_lb_stats but sharing a 1572 + * common implementation is impractical. 
1573 + */ 1574 + static void update_numa_stats(struct task_numa_env *env, 1575 + struct numa_stats *ns, int nid, 1576 + bool find_idle) 1577 + { 1578 + int cpu, idle_core = -1; 1579 + 1580 + memset(ns, 0, sizeof(*ns)); 1581 + ns->idle_cpu = -1; 1582 + 1583 + rcu_read_lock(); 1584 + for_each_cpu(cpu, cpumask_of_node(nid)) { 1585 + struct rq *rq = cpu_rq(cpu); 1586 + 1587 + ns->load += cpu_load(rq); 1588 + ns->util += cpu_util(cpu); 1589 + ns->nr_running += rq->cfs.h_nr_running; 1590 + ns->compute_capacity += capacity_of(cpu); 1591 + 1592 + if (find_idle && !rq->nr_running && idle_cpu(cpu)) { 1593 + if (READ_ONCE(rq->numa_migrate_on) || 1594 + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) 1595 + continue; 1596 + 1597 + if (ns->idle_cpu == -1) 1598 + ns->idle_cpu = cpu; 1599 + 1600 + idle_core = numa_idle_core(idle_core, cpu); 1601 + } 1602 + } 1603 + rcu_read_unlock(); 1604 + 1605 + ns->weight = cpumask_weight(cpumask_of_node(nid)); 1606 + 1607 + ns->node_type = numa_classify(env->imbalance_pct, ns); 1608 + 1609 + if (idle_core >= 0) 1610 + ns->idle_cpu = idle_core; 1611 + } 1612 + 1552 1613 static void task_numa_assign(struct task_numa_env *env, 1553 1614 struct task_struct *p, long imp) 1554 1615 { 1555 1616 struct rq *rq = cpu_rq(env->dst_cpu); 1556 1617 1557 - /* Bail out if run-queue part of active NUMA balance. */ 1558 - if (xchg(&rq->numa_migrate_on, 1)) 1559 - return; 1618 + /* Check if run-queue part of active NUMA balance. */ 1619 + if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) { 1620 + int cpu; 1621 + int start = env->dst_cpu; 1560 1622 1623 + /* Find alternative idle CPU. 
*/ 1624 + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { 1625 + if (cpu == env->best_cpu || !idle_cpu(cpu) || 1626 + !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { 1627 + continue; 1628 + } 1629 + 1630 + env->dst_cpu = cpu; 1631 + rq = cpu_rq(env->dst_cpu); 1632 + if (!xchg(&rq->numa_migrate_on, 1)) 1633 + goto assign; 1634 + } 1635 + 1636 + /* Failed to find an alternative idle CPU */ 1637 + return; 1638 + } 1639 + 1640 + assign: 1561 1641 /* 1562 1642 * Clear previous best_cpu/rq numa-migrate flag, since task now 1563 1643 * found a better CPU to move/swap. 1564 1644 */ 1565 - if (env->best_cpu != -1) { 1645 + if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) { 1566 1646 rq = cpu_rq(env->best_cpu); 1567 1647 WRITE_ONCE(rq->numa_migrate_on, 0); 1568 1648 } ··· 1726 1590 * into account that it might be best if task running on the dst_cpu should 1727 1591 * be exchanged with the source task 1728 1592 */ 1729 - static void task_numa_compare(struct task_numa_env *env, 1593 + static bool task_numa_compare(struct task_numa_env *env, 1730 1594 long taskimp, long groupimp, bool maymove) 1731 1595 { 1732 1596 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p); ··· 1737 1601 int dist = env->dist; 1738 1602 long moveimp = imp; 1739 1603 long load; 1604 + bool stopsearch = false; 1740 1605 1741 1606 if (READ_ONCE(dst_rq->numa_migrate_on)) 1742 - return; 1607 + return false; 1743 1608 1744 1609 rcu_read_lock(); 1745 1610 cur = rcu_dereference(dst_rq->curr); ··· 1751 1614 * Because we have preemption enabled we can get migrated around and 1752 1615 * end try selecting ourselves (current == env->p) as a swap candidate. 1753 1616 */ 1754 - if (cur == env->p) 1617 + if (cur == env->p) { 1618 + stopsearch = true; 1755 1619 goto unlock; 1620 + } 1756 1621 1757 1622 if (!cur) { 1758 1623 if (maymove && moveimp >= env->best_imp) ··· 1763 1624 goto unlock; 1764 1625 } 1765 1626 1627 + /* Skip this swap candidate if cannot move to the source cpu. 
*/ 1628 + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 1629 + goto unlock; 1630 + 1631 + /* 1632 + * Skip this swap candidate if it is not moving to its preferred 1633 + * node and the best task is. 1634 + */ 1635 + if (env->best_task && 1636 + env->best_task->numa_preferred_nid == env->src_nid && 1637 + cur->numa_preferred_nid != env->src_nid) { 1638 + goto unlock; 1639 + } 1640 + 1766 1641 /* 1767 1642 * "imp" is the fault differential for the source task between the 1768 1643 * source and destination node. Calculate the total differential for 1769 1644 * the source task and potential destination task. The more negative 1770 1645 * the value is, the more remote accesses that would be expected to 1771 1646 * be incurred if the tasks were swapped. 1772 - */ 1773 - /* Skip this swap candidate if cannot move to the source cpu */ 1774 - if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 1775 - goto unlock; 1776 - 1777 - /* 1647 + * 1778 1648 * If dst and source tasks are in the same NUMA group, or not 1779 1649 * in any group then look only at task weights. 1780 1650 */ ··· 1810 1662 task_weight(cur, env->dst_nid, dist); 1811 1663 } 1812 1664 1665 + /* Discourage picking a task already on its preferred node */ 1666 + if (cur->numa_preferred_nid == env->dst_nid) 1667 + imp -= imp / 16; 1668 + 1669 + /* 1670 + * Encourage picking a task that moves to its preferred node. 1671 + * This potentially makes imp larger than it's maximum of 1672 + * 1998 (see SMALLIMP and task_weight for why) but in this 1673 + * case, it does not matter. 1674 + */ 1675 + if (cur->numa_preferred_nid == env->src_nid) 1676 + imp += imp / 8; 1677 + 1813 1678 if (maymove && moveimp > imp && moveimp > env->best_imp) { 1814 1679 imp = moveimp; 1815 1680 cur = NULL; 1681 + goto assign; 1682 + } 1683 + 1684 + /* 1685 + * Prefer swapping with a task moving to its preferred node over a 1686 + * task that is not. 
1687 + */ 1688 + if (env->best_task && cur->numa_preferred_nid == env->src_nid && 1689 + env->best_task->numa_preferred_nid != env->src_nid) { 1816 1690 goto assign; 1817 1691 } 1818 1692 ··· 1861 1691 goto unlock; 1862 1692 1863 1693 assign: 1864 - /* 1865 - * One idle CPU per node is evaluated for a task numa move. 1866 - * Call select_idle_sibling to maybe find a better one. 1867 - */ 1694 + /* Evaluate an idle CPU for a task numa move. */ 1868 1695 if (!cur) { 1696 + int cpu = env->dst_stats.idle_cpu; 1697 + 1698 + /* Nothing cached so current CPU went idle since the search. */ 1699 + if (cpu < 0) 1700 + cpu = env->dst_cpu; 1701 + 1869 1702 /* 1870 - * select_idle_siblings() uses an per-CPU cpumask that 1871 - * can be used from IRQ context. 1703 + * If the CPU is no longer truly idle and the previous best CPU 1704 + * is, keep using it. 1872 1705 */ 1873 - local_irq_disable(); 1874 - env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, 1875 - env->dst_cpu); 1876 - local_irq_enable(); 1706 + if (!idle_cpu(cpu) && env->best_cpu >= 0 && 1707 + idle_cpu(env->best_cpu)) { 1708 + cpu = env->best_cpu; 1709 + } 1710 + 1711 + env->dst_cpu = cpu; 1877 1712 } 1878 1713 1879 1714 task_numa_assign(env, cur, imp); 1715 + 1716 + /* 1717 + * If a move to idle is allowed because there is capacity or load 1718 + * balance improves then stop the search. While a better swap 1719 + * candidate may exist, a search is not free. 1720 + */ 1721 + if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu)) 1722 + stopsearch = true; 1723 + 1724 + /* 1725 + * If a swap candidate must be identified and the current best task 1726 + * moves its preferred node then stop the search. 
1727 + */ 1728 + if (!maymove && env->best_task && 1729 + env->best_task->numa_preferred_nid == env->src_nid) { 1730 + stopsearch = true; 1731 + } 1880 1732 unlock: 1881 1733 rcu_read_unlock(); 1734 + 1735 + return stopsearch; 1882 1736 } 1883 1737 1884 1738 static void task_numa_find_cpu(struct task_numa_env *env, 1885 1739 long taskimp, long groupimp) 1886 1740 { 1887 - long src_load, dst_load, load; 1888 1741 bool maymove = false; 1889 1742 int cpu; 1890 1743 1891 - load = task_h_load(env->p); 1892 - dst_load = env->dst_stats.load + load; 1893 - src_load = env->src_stats.load - load; 1894 - 1895 1744 /* 1896 - * If the improvement from just moving env->p direction is better 1897 - * than swapping tasks around, check if a move is possible. 1745 + * If dst node has spare capacity, then check if there is an 1746 + * imbalance that would be overruled by the load balancer. 1898 1747 */ 1899 - maymove = !load_too_imbalanced(src_load, dst_load, env); 1748 + if (env->dst_stats.node_type == node_has_spare) { 1749 + unsigned int imbalance; 1750 + int src_running, dst_running; 1751 + 1752 + /* 1753 + * Would movement cause an imbalance? Note that if src has 1754 + * more running tasks that the imbalance is ignored as the 1755 + * move improves the imbalance from the perspective of the 1756 + * CPU load balancer. 
1757 + * */ 1758 + src_running = env->src_stats.nr_running - 1; 1759 + dst_running = env->dst_stats.nr_running + 1; 1760 + imbalance = max(0, dst_running - src_running); 1761 + imbalance = adjust_numa_imbalance(imbalance, src_running); 1762 + 1763 + /* Use idle CPU if there is no imbalance */ 1764 + if (!imbalance) { 1765 + maymove = true; 1766 + if (env->dst_stats.idle_cpu >= 0) { 1767 + env->dst_cpu = env->dst_stats.idle_cpu; 1768 + task_numa_assign(env, NULL, 0); 1769 + return; 1770 + } 1771 + } 1772 + } else { 1773 + long src_load, dst_load, load; 1774 + /* 1775 + * If the improvement from just moving env->p direction is better 1776 + * than swapping tasks around, check if a move is possible. 1777 + */ 1778 + load = task_h_load(env->p); 1779 + dst_load = env->dst_stats.load + load; 1780 + src_load = env->src_stats.load - load; 1781 + maymove = !load_too_imbalanced(src_load, dst_load, env); 1782 + } 1900 1783 1901 1784 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1902 1785 /* Skip this CPU if the source task cannot migrate */ ··· 1957 1734 continue; 1958 1735 1959 1736 env->dst_cpu = cpu; 1960 - task_numa_compare(env, taskimp, groupimp, maymove); 1737 + if (task_numa_compare(env, taskimp, groupimp, maymove)) 1738 + break; 1961 1739 } 1962 1740 } 1963 1741 ··· 2012 1788 dist = env.dist = node_distance(env.src_nid, env.dst_nid); 2013 1789 taskweight = task_weight(p, env.src_nid, dist); 2014 1790 groupweight = group_weight(p, env.src_nid, dist); 2015 - update_numa_stats(&env.src_stats, env.src_nid); 1791 + update_numa_stats(&env, &env.src_stats, env.src_nid, false); 2016 1792 taskimp = task_weight(p, env.dst_nid, dist) - taskweight; 2017 1793 groupimp = group_weight(p, env.dst_nid, dist) - groupweight; 2018 - update_numa_stats(&env.dst_stats, env.dst_nid); 1794 + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 2019 1795 2020 1796 /* Try to find a spot on the preferred nid. 
*/ 2021 1797 task_numa_find_cpu(&env, taskimp, groupimp); ··· 2048 1824 2049 1825 env.dist = dist; 2050 1826 env.dst_nid = nid; 2051 - update_numa_stats(&env.dst_stats, env.dst_nid); 1827 + update_numa_stats(&env, &env.dst_stats, env.dst_nid, true); 2052 1828 task_numa_find_cpu(&env, taskimp, groupimp); 2053 1829 } 2054 1830 } ··· 2072 1848 } 2073 1849 2074 1850 /* No better CPU than the current one was found. */ 2075 - if (env.best_cpu == -1) 1851 + if (env.best_cpu == -1) { 1852 + trace_sched_stick_numa(p, env.src_cpu, NULL, -1); 2076 1853 return -EAGAIN; 1854 + } 2077 1855 2078 1856 best_rq = cpu_rq(env.best_cpu); 2079 1857 if (env.best_task == NULL) { 2080 1858 ret = migrate_task_to(p, env.best_cpu); 2081 1859 WRITE_ONCE(best_rq->numa_migrate_on, 0); 2082 1860 if (ret != 0) 2083 - trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); 1861 + trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu); 2084 1862 return ret; 2085 1863 } 2086 1864 ··· 2090 1864 WRITE_ONCE(best_rq->numa_migrate_on, 0); 2091 1865 2092 1866 if (ret != 0) 2093 - trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); 1867 + trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu); 2094 1868 put_task_struct(env.best_task); 2095 1869 return ret; 2096 1870 } ··· 3061 2835 3062 2836 #ifdef CONFIG_SMP 3063 2837 static inline void 3064 - enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3065 - { 3066 - cfs_rq->runnable_weight += se->runnable_weight; 3067 - 3068 - cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; 3069 - cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; 3070 - } 3071 - 3072 - static inline void 3073 - dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3074 - { 3075 - cfs_rq->runnable_weight -= se->runnable_weight; 3076 - 3077 - sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); 3078 - sub_positive(&cfs_rq->avg.runnable_load_sum, 3079 - se_runnable(se) * 
se->avg.runnable_load_sum); 3080 - } 3081 - 3082 - static inline void 3083 2838 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3084 2839 { 3085 2840 cfs_rq->avg.load_avg += se->avg.load_avg; ··· 3075 2868 } 3076 2869 #else 3077 2870 static inline void 3078 - enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3079 - static inline void 3080 - dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3081 - static inline void 3082 2871 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3083 2872 static inline void 3084 2873 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3085 2874 #endif 3086 2875 3087 2876 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 3088 - unsigned long weight, unsigned long runnable) 2877 + unsigned long weight) 3089 2878 { 3090 2879 if (se->on_rq) { 3091 2880 /* commit outstanding execution time */ 3092 2881 if (cfs_rq->curr == se) 3093 2882 update_curr(cfs_rq); 3094 2883 account_entity_dequeue(cfs_rq, se); 3095 - dequeue_runnable_load_avg(cfs_rq, se); 3096 2884 } 3097 2885 dequeue_load_avg(cfs_rq, se); 3098 2886 3099 - se->runnable_weight = runnable; 3100 2887 update_load_set(&se->load, weight); 3101 2888 3102 2889 #ifdef CONFIG_SMP ··· 3098 2897 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; 3099 2898 3100 2899 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 3101 - se->avg.runnable_load_avg = 3102 - div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); 3103 2900 } while (0); 3104 2901 #endif 3105 2902 3106 2903 enqueue_load_avg(cfs_rq, se); 3107 - if (se->on_rq) { 2904 + if (se->on_rq) 3108 2905 account_entity_enqueue(cfs_rq, se); 3109 - enqueue_runnable_load_avg(cfs_rq, se); 3110 - } 2906 + 3111 2907 } 3112 2908 3113 2909 void reweight_task(struct task_struct *p, int prio) ··· 3114 2916 struct load_weight *load = &se->load; 3115 2917 unsigned long weight = 
scale_load(sched_prio_to_weight[prio]); 3116 2918 3117 - reweight_entity(cfs_rq, se, weight, weight); 2919 + reweight_entity(cfs_rq, se, weight); 3118 2920 load->inv_weight = sched_prio_to_wmult[prio]; 3119 2921 } 3120 2922 ··· 3226 3028 */ 3227 3029 return clamp_t(long, shares, MIN_SHARES, tg_shares); 3228 3030 } 3229 - 3230 - /* 3231 - * This calculates the effective runnable weight for a group entity based on 3232 - * the group entity weight calculated above. 3233 - * 3234 - * Because of the above approximation (2), our group entity weight is 3235 - * an load_avg based ratio (3). This means that it includes blocked load and 3236 - * does not represent the runnable weight. 3237 - * 3238 - * Approximate the group entity's runnable weight per ratio from the group 3239 - * runqueue: 3240 - * 3241 - * grq->avg.runnable_load_avg 3242 - * ge->runnable_weight = ge->load.weight * -------------------------- (7) 3243 - * grq->avg.load_avg 3244 - * 3245 - * However, analogous to above, since the avg numbers are slow, this leads to 3246 - * transients in the from-idle case. Instead we use: 3247 - * 3248 - * ge->runnable_weight = ge->load.weight * 3249 - * 3250 - * max(grq->avg.runnable_load_avg, grq->runnable_weight) 3251 - * ----------------------------------------------------- (8) 3252 - * max(grq->avg.load_avg, grq->load.weight) 3253 - * 3254 - * Where these max() serve both to use the 'instant' values to fix the slow 3255 - * from-idle and avoid the /0 on to-idle, similar to (6). 
3256 - */ 3257 - static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) 3258 - { 3259 - long runnable, load_avg; 3260 - 3261 - load_avg = max(cfs_rq->avg.load_avg, 3262 - scale_load_down(cfs_rq->load.weight)); 3263 - 3264 - runnable = max(cfs_rq->avg.runnable_load_avg, 3265 - scale_load_down(cfs_rq->runnable_weight)); 3266 - 3267 - runnable *= shares; 3268 - if (load_avg) 3269 - runnable /= load_avg; 3270 - 3271 - return clamp_t(long, runnable, MIN_SHARES, shares); 3272 - } 3273 3031 #endif /* CONFIG_SMP */ 3274 3032 3275 3033 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); ··· 3237 3083 static void update_cfs_group(struct sched_entity *se) 3238 3084 { 3239 3085 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3240 - long shares, runnable; 3086 + long shares; 3241 3087 3242 3088 if (!gcfs_rq) 3243 3089 return; ··· 3246 3092 return; 3247 3093 3248 3094 #ifndef CONFIG_SMP 3249 - runnable = shares = READ_ONCE(gcfs_rq->tg->shares); 3095 + shares = READ_ONCE(gcfs_rq->tg->shares); 3250 3096 3251 3097 if (likely(se->load.weight == shares)) 3252 3098 return; 3253 3099 #else 3254 3100 shares = calc_group_shares(gcfs_rq); 3255 - runnable = calc_group_runnable(gcfs_rq, shares); 3256 3101 #endif 3257 3102 3258 - reweight_entity(cfs_rq_of(se), se, shares, runnable); 3103 + reweight_entity(cfs_rq_of(se), se, shares); 3259 3104 } 3260 3105 3261 3106 #else /* CONFIG_FAIR_GROUP_SCHED */ ··· 3379 3226 * _IFF_ we look at the pure running and runnable sums. Because they 3380 3227 * represent the very same entity, just at different points in the hierarchy. 3381 3228 * 3382 - * Per the above update_tg_cfs_util() is trivial and simply copies the running 3383 - * sum over (but still wrong, because the group entity and group rq do not have 3384 - * their PELT windows aligned). 
3229 + * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial 3230 + * and simply copies the running/runnable sum over (but still wrong, because 3231 + * the group entity and group rq do not have their PELT windows aligned). 3385 3232 * 3386 - * However, update_tg_cfs_runnable() is more complex. So we have: 3233 + * However, update_tg_cfs_load() is more complex. So we have: 3387 3234 * 3388 3235 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) 3389 3236 * ··· 3466 3313 static inline void 3467 3314 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3468 3315 { 3316 + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; 3317 + 3318 + /* Nothing to update */ 3319 + if (!delta) 3320 + return; 3321 + 3322 + /* 3323 + * The relation between sum and avg is: 3324 + * 3325 + * LOAD_AVG_MAX - 1024 + sa->period_contrib 3326 + * 3327 + * however, the PELT windows are not aligned between grq and gse. 3328 + */ 3329 + 3330 + /* Set new sched_entity's runnable */ 3331 + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; 3332 + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX; 3333 + 3334 + /* Update parent cfs_rq runnable */ 3335 + add_positive(&cfs_rq->avg.runnable_avg, delta); 3336 + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX; 3337 + } 3338 + 3339 + static inline void 3340 + update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3341 + { 3469 3342 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; 3470 - unsigned long runnable_load_avg, load_avg; 3471 - u64 runnable_load_sum, load_sum = 0; 3343 + unsigned long load_avg; 3344 + u64 load_sum = 0; 3472 3345 s64 delta_sum; 3473 3346 3474 3347 if (!runnable_sum) ··· 3542 3363 se->avg.load_avg = load_avg; 3543 3364 add_positive(&cfs_rq->avg.load_avg, delta_avg); 3544 3365 add_positive(&cfs_rq->avg.load_sum, delta_sum); 3545 - 3546 - runnable_load_sum = 
(s64)se_runnable(se) * runnable_sum; 3547 - runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); 3548 - 3549 - if (se->on_rq) { 3550 - delta_sum = runnable_load_sum - 3551 - se_weight(se) * se->avg.runnable_load_sum; 3552 - delta_avg = runnable_load_avg - se->avg.runnable_load_avg; 3553 - add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); 3554 - add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); 3555 - } 3556 - 3557 - se->avg.runnable_load_sum = runnable_sum; 3558 - se->avg.runnable_load_avg = runnable_load_avg; 3559 3366 } 3560 3367 3561 3368 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) ··· 3570 3405 3571 3406 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3572 3407 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3408 + update_tg_cfs_load(cfs_rq, se, gcfs_rq); 3573 3409 3574 3410 trace_pelt_cfs_tp(cfs_rq); 3575 3411 trace_pelt_se_tp(se); ··· 3640 3474 static inline int 3641 3475 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3642 3476 { 3643 - unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; 3477 + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; 3644 3478 struct sched_avg *sa = &cfs_rq->avg; 3645 3479 int decayed = 0; 3646 3480 ··· 3651 3485 raw_spin_lock(&cfs_rq->removed.lock); 3652 3486 swap(cfs_rq->removed.util_avg, removed_util); 3653 3487 swap(cfs_rq->removed.load_avg, removed_load); 3654 - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); 3488 + swap(cfs_rq->removed.runnable_avg, removed_runnable); 3655 3489 cfs_rq->removed.nr = 0; 3656 3490 raw_spin_unlock(&cfs_rq->removed.lock); 3657 3491 ··· 3663 3497 sub_positive(&sa->util_avg, r); 3664 3498 sub_positive(&sa->util_sum, r * divider); 3665 3499 3666 - add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); 3500 + r = removed_runnable; 3501 + sub_positive(&sa->runnable_avg, r); 3502 + sub_positive(&sa->runnable_sum, r * divider); 3503 + 3504 + /* 3505 + * removed_runnable is the unweighted 
version of removed_load so we 3506 + * can use it to estimate removed_load_sum. 3507 + */ 3508 + add_tg_cfs_propagate(cfs_rq, 3509 + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); 3667 3510 3668 3511 decayed = 1; 3669 3512 } ··· 3717 3542 */ 3718 3543 se->avg.util_sum = se->avg.util_avg * divider; 3719 3544 3545 + se->avg.runnable_sum = se->avg.runnable_avg * divider; 3546 + 3720 3547 se->avg.load_sum = divider; 3721 3548 if (se_weight(se)) { 3722 3549 se->avg.load_sum = 3723 3550 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); 3724 3551 } 3725 3552 3726 - se->avg.runnable_load_sum = se->avg.load_sum; 3727 - 3728 3553 enqueue_load_avg(cfs_rq, se); 3729 3554 cfs_rq->avg.util_avg += se->avg.util_avg; 3730 3555 cfs_rq->avg.util_sum += se->avg.util_sum; 3556 + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; 3557 + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; 3731 3558 3732 3559 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3733 3560 ··· 3751 3574 dequeue_load_avg(cfs_rq, se); 3752 3575 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3753 3576 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3577 + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); 3578 + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); 3754 3579 3755 3580 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3756 3581 ··· 3859 3680 ++cfs_rq->removed.nr; 3860 3681 cfs_rq->removed.util_avg += se->avg.util_avg; 3861 3682 cfs_rq->removed.load_avg += se->avg.load_avg; 3862 - cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ 3683 + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; 3863 3684 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 3864 3685 } 3865 3686 3866 - static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) 3687 + static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) 3867 3688 { 3868 - return cfs_rq->avg.runnable_load_avg; 3689 + return 
cfs_rq->avg.runnable_avg; 3869 3690 } 3870 3691 3871 3692 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) ··· 4136 3957 #endif 4137 3958 } 4138 3959 3960 + static inline bool cfs_bandwidth_used(void); 4139 3961 4140 3962 /* 4141 3963 * MIGRATION ··· 4201 4021 * - Add its new weight to cfs_rq->load.weight 4202 4022 */ 4203 4023 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); 4024 + se_update_runnable(se); 4204 4025 update_cfs_group(se); 4205 - enqueue_runnable_load_avg(cfs_rq, se); 4206 4026 account_entity_enqueue(cfs_rq, se); 4207 4027 4208 4028 if (flags & ENQUEUE_WAKEUP) ··· 4215 4035 __enqueue_entity(cfs_rq, se); 4216 4036 se->on_rq = 1; 4217 4037 4218 - if (cfs_rq->nr_running == 1) { 4038 + /* 4039 + * When bandwidth control is enabled, cfs might have been removed 4040 + * because of a parent been throttled but cfs->nr_running > 1. Try to 4041 + * add it unconditionnally. 4042 + */ 4043 + if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) 4219 4044 list_add_leaf_cfs_rq(cfs_rq); 4045 + 4046 + if (cfs_rq->nr_running == 1) 4220 4047 check_enqueue_throttle(cfs_rq); 4221 - } 4222 4048 } 4223 4049 4224 4050 static void __clear_buddies_last(struct sched_entity *se) ··· 4291 4105 * of its group cfs_rq. 
4292 4106 */ 4293 4107 update_load_avg(cfs_rq, se, UPDATE_TG); 4294 - dequeue_runnable_load_avg(cfs_rq, se); 4108 + se_update_runnable(se); 4295 4109 4296 4110 update_stats_dequeue(cfs_rq, se, flags); 4297 4111 ··· 4727 4541 if (!se->on_rq) 4728 4542 break; 4729 4543 4730 - if (dequeue) 4544 + if (dequeue) { 4731 4545 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 4546 + } else { 4547 + update_load_avg(qcfs_rq, se, 0); 4548 + se_update_runnable(se); 4549 + } 4550 + 4732 4551 qcfs_rq->h_nr_running -= task_delta; 4733 4552 qcfs_rq->idle_h_nr_running -= idle_task_delta; 4734 4553 ··· 4801 4610 enqueue = 0; 4802 4611 4803 4612 cfs_rq = cfs_rq_of(se); 4804 - if (enqueue) 4613 + if (enqueue) { 4805 4614 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); 4615 + } else { 4616 + update_load_avg(cfs_rq, se, 0); 4617 + se_update_runnable(se); 4618 + } 4619 + 4806 4620 cfs_rq->h_nr_running += task_delta; 4807 4621 cfs_rq->idle_h_nr_running += idle_task_delta; 4808 4622 ··· 4815 4619 break; 4816 4620 } 4817 4621 4818 - assert_list_leaf_cfs_rq(rq); 4819 - 4820 4622 if (!se) 4821 4623 add_nr_running(rq, task_delta); 4624 + 4625 + /* 4626 + * The cfs_rq_throttled() breaks in the above iteration can result in 4627 + * incomplete leaf list maintenance, resulting in triggering the 4628 + * assertion below. 4629 + */ 4630 + for_each_sched_entity(se) { 4631 + cfs_rq = cfs_rq_of(se); 4632 + 4633 + list_add_leaf_cfs_rq(cfs_rq); 4634 + } 4635 + 4636 + assert_list_leaf_cfs_rq(rq); 4822 4637 4823 4638 /* Determine whether we need to wake up potentially idle CPU: */ 4824 4639 if (rq->curr == rq->idle && rq->cfs.nr_running) ··· 5465 5258 cfs_rq = cfs_rq_of(se); 5466 5259 enqueue_entity(cfs_rq, se, flags); 5467 5260 5468 - /* 5469 - * end evaluation on encountering a throttled cfs_rq 5470 - * 5471 - * note: in the case of encountering a throttled cfs_rq we will 5472 - * post the final h_nr_running increment below. 
5473 - */ 5474 - if (cfs_rq_throttled(cfs_rq)) 5475 - break; 5476 5261 cfs_rq->h_nr_running++; 5477 5262 cfs_rq->idle_h_nr_running += idle_h_nr_running; 5263 + 5264 + /* end evaluation on encountering a throttled cfs_rq */ 5265 + if (cfs_rq_throttled(cfs_rq)) 5266 + goto enqueue_throttle; 5478 5267 5479 5268 flags = ENQUEUE_WAKEUP; 5480 5269 } 5481 5270 5482 5271 for_each_sched_entity(se) { 5483 5272 cfs_rq = cfs_rq_of(se); 5273 + 5274 + update_load_avg(cfs_rq, se, UPDATE_TG); 5275 + se_update_runnable(se); 5276 + update_cfs_group(se); 5277 + 5484 5278 cfs_rq->h_nr_running++; 5485 5279 cfs_rq->idle_h_nr_running += idle_h_nr_running; 5486 5280 5281 + /* end evaluation on encountering a throttled cfs_rq */ 5487 5282 if (cfs_rq_throttled(cfs_rq)) 5488 - break; 5489 - 5490 - update_load_avg(cfs_rq, se, UPDATE_TG); 5491 - update_cfs_group(se); 5283 + goto enqueue_throttle; 5492 5284 } 5493 5285 5286 + enqueue_throttle: 5494 5287 if (!se) { 5495 5288 add_nr_running(rq, 1); 5496 5289 /* ··· 5551 5344 cfs_rq = cfs_rq_of(se); 5552 5345 dequeue_entity(cfs_rq, se, flags); 5553 5346 5554 - /* 5555 - * end evaluation on encountering a throttled cfs_rq 5556 - * 5557 - * note: in the case of encountering a throttled cfs_rq we will 5558 - * post the final h_nr_running decrement below. 
5559 - */ 5560 - if (cfs_rq_throttled(cfs_rq)) 5561 - break; 5562 5347 cfs_rq->h_nr_running--; 5563 5348 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 5349 + 5350 + /* end evaluation on encountering a throttled cfs_rq */ 5351 + if (cfs_rq_throttled(cfs_rq)) 5352 + goto dequeue_throttle; 5564 5353 5565 5354 /* Don't dequeue parent if it has other entities besides us */ 5566 5355 if (cfs_rq->load.weight) { ··· 5575 5372 5576 5373 for_each_sched_entity(se) { 5577 5374 cfs_rq = cfs_rq_of(se); 5375 + 5376 + update_load_avg(cfs_rq, se, UPDATE_TG); 5377 + se_update_runnable(se); 5378 + update_cfs_group(se); 5379 + 5578 5380 cfs_rq->h_nr_running--; 5579 5381 cfs_rq->idle_h_nr_running -= idle_h_nr_running; 5580 5382 5383 + /* end evaluation on encountering a throttled cfs_rq */ 5581 5384 if (cfs_rq_throttled(cfs_rq)) 5582 - break; 5385 + goto dequeue_throttle; 5583 5386 5584 - update_load_avg(cfs_rq, se, UPDATE_TG); 5585 - update_cfs_group(se); 5586 5387 } 5587 5388 5389 + dequeue_throttle: 5588 5390 if (!se) 5589 5391 sub_nr_running(rq, 1); 5590 5392 ··· 5653 5445 lsub_positive(&load, task_h_load(p)); 5654 5446 5655 5447 return load; 5448 + } 5449 + 5450 + static unsigned long cpu_runnable(struct rq *rq) 5451 + { 5452 + return cfs_rq_runnable_avg(&rq->cfs); 5453 + } 5454 + 5455 + static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) 5456 + { 5457 + struct cfs_rq *cfs_rq; 5458 + unsigned int runnable; 5459 + 5460 + /* Task has no contribution or is new */ 5461 + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 5462 + return cpu_runnable(rq); 5463 + 5464 + cfs_rq = &rq->cfs; 5465 + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); 5466 + 5467 + /* Discount task's runnable from CPU's runnable */ 5468 + lsub_positive(&runnable, p->se.avg.runnable_avg); 5469 + 5470 + return runnable; 5656 5471 } 5657 5472 5658 5473 static unsigned long capacity_of(int cpu) ··· 6017 5786 bool idle = true; 6018 5787 6019 5788 for_each_cpu(cpu, 
cpu_smt_mask(core)) { 6020 - __cpumask_clear_cpu(cpu, cpus); 6021 - if (!available_idle_cpu(cpu)) 5789 + if (!available_idle_cpu(cpu)) { 6022 5790 idle = false; 5791 + break; 5792 + } 6023 5793 } 5794 + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); 6024 5795 6025 5796 if (idle) 6026 5797 return core; ··· 6127 5894 } 6128 5895 6129 5896 /* 5897 + * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which 5898 + * the task fits. If no CPU is big enough, but there are idle ones, try to 5899 + * maximize capacity. 5900 + */ 5901 + static int 5902 + select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) 5903 + { 5904 + unsigned long best_cap = 0; 5905 + int cpu, best_cpu = -1; 5906 + struct cpumask *cpus; 5907 + 5908 + sync_entity_load_avg(&p->se); 5909 + 5910 + cpus = this_cpu_cpumask_var_ptr(select_idle_mask); 5911 + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 5912 + 5913 + for_each_cpu_wrap(cpu, cpus, target) { 5914 + unsigned long cpu_cap = capacity_of(cpu); 5915 + 5916 + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) 5917 + continue; 5918 + if (task_fits_capacity(p, cpu_cap)) 5919 + return cpu; 5920 + 5921 + if (cpu_cap > best_cap) { 5922 + best_cap = cpu_cap; 5923 + best_cpu = cpu; 5924 + } 5925 + } 5926 + 5927 + return best_cpu; 5928 + } 5929 + 5930 + /* 6130 5931 * Try and locate an idle core/thread in the LLC cache domain. 6131 5932 */ 6132 5933 static int select_idle_sibling(struct task_struct *p, int prev, int target) ··· 6168 5901 struct sched_domain *sd; 6169 5902 int i, recent_used_cpu; 6170 5903 5904 + /* 5905 + * For asymmetric CPU capacity systems, our domain of interest is 5906 + * sd_asym_cpucapacity rather than sd_llc. 5907 + */ 5908 + if (static_branch_unlikely(&sched_asym_cpucapacity)) { 5909 + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); 5910 + /* 5911 + * On an asymmetric CPU capacity system where an exclusive 5912 + * cpuset defines a symmetric island (i.e. 
one unique 5913 + * capacity_orig value through the cpuset), the key will be set 5914 + * but the CPUs within that cpuset will not have a domain with 5915 + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric 5916 + * capacity path. 5917 + */ 5918 + if (!sd) 5919 + goto symmetric; 5920 + 5921 + i = select_idle_capacity(p, sd, target); 5922 + return ((unsigned)i < nr_cpumask_bits) ? i : target; 5923 + } 5924 + 5925 + symmetric: 6171 5926 if (available_idle_cpu(target) || sched_idle_cpu(target)) 6172 5927 return target; 6173 5928 ··· 6387 6098 * the cpu_util call. 6388 6099 */ 6389 6100 return min_t(unsigned long, util, capacity_orig_of(cpu)); 6390 - } 6391 - 6392 - /* 6393 - * Disable WAKE_AFFINE in the case where task @p doesn't fit in the 6394 - * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. 6395 - * 6396 - * In that case WAKE_AFFINE doesn't make sense and we'll let 6397 - * BALANCE_WAKE sort things out. 6398 - */ 6399 - static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) 6400 - { 6401 - long min_cap, max_cap; 6402 - 6403 - if (!static_branch_unlikely(&sched_asym_cpucapacity)) 6404 - return 0; 6405 - 6406 - min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); 6407 - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; 6408 - 6409 - /* Minimum capacity is close to max, no need to abort wake_affine */ 6410 - if (max_cap - min_cap < max_cap >> 3) 6411 - return 0; 6412 - 6413 - /* Bring task utilization in sync with prev_cpu */ 6414 - sync_entity_load_avg(&p->se); 6415 - 6416 - return !task_fits_capacity(p, min_cap); 6417 6101 } 6418 6102 6419 6103 /* ··· 6653 6391 new_cpu = prev_cpu; 6654 6392 } 6655 6393 6656 - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6657 - cpumask_test_cpu(cpu, p->cpus_ptr); 6394 + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); 6658 6395 } 6659 6396 6660 6397 rcu_read_lock(); ··· 7767 7506 if (READ_ONCE(rq->avg_dl.util_avg)) 7768 7507 return true; 7769 
7508 7509 + if (thermal_load_avg(rq)) 7510 + return true; 7511 + 7770 7512 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 7771 7513 if (READ_ONCE(rq->avg_irq.util_avg)) 7772 7514 return true; ··· 7795 7531 { 7796 7532 const struct sched_class *curr_class; 7797 7533 u64 now = rq_clock_pelt(rq); 7534 + unsigned long thermal_pressure; 7798 7535 bool decayed; 7799 7536 7800 7537 /* ··· 7804 7539 */ 7805 7540 curr_class = rq->curr->sched_class; 7806 7541 7542 + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 7543 + 7807 7544 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 7808 7545 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 7546 + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | 7809 7547 update_irq_load_avg(rq, 0); 7810 7548 7811 7549 if (others_have_blocked(rq)) ··· 7830 7562 if (cfs_rq->avg.util_sum) 7831 7563 return false; 7832 7564 7833 - if (cfs_rq->avg.runnable_load_sum) 7565 + if (cfs_rq->avg.runnable_sum) 7834 7566 return false; 7835 7567 7836 7568 return true; ··· 7968 7700 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7969 7701 unsigned long group_load; /* Total load over the CPUs of the group */ 7970 7702 unsigned long group_capacity; 7971 - unsigned long group_util; /* Total utilization of the group */ 7703 + unsigned long group_util; /* Total utilization over the CPUs of the group */ 7704 + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ 7972 7705 unsigned int sum_nr_running; /* Nr of tasks running in the group */ 7973 7706 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ 7974 7707 unsigned int idle_cpus; ··· 8032 7763 if (unlikely(irq >= max)) 8033 7764 return 1; 8034 7765 7766 + /* 7767 + * avg_rt.util_avg and avg_dl.util_avg track binary signals 7768 + * (running and not running) with weights 0 and 1024 respectively. 
7769 + * avg_thermal.load_avg tracks thermal pressure and the weighted 7770 + * average uses the actual delta max capacity(load). 7771 + */ 8035 7772 used = READ_ONCE(rq->avg_rt.util_avg); 8036 7773 used += READ_ONCE(rq->avg_dl.util_avg); 7774 + used += thermal_load_avg(rq); 8037 7775 8038 7776 if (unlikely(used >= max)) 8039 7777 return 1; ··· 8197 7921 if (sgs->sum_nr_running < sgs->group_weight) 8198 7922 return true; 8199 7923 7924 + if ((sgs->group_capacity * imbalance_pct) < 7925 + (sgs->group_runnable * 100)) 7926 + return false; 7927 + 8200 7928 if ((sgs->group_capacity * 100) > 8201 7929 (sgs->group_util * imbalance_pct)) 8202 7930 return true; ··· 8224 7944 8225 7945 if ((sgs->group_capacity * 100) < 8226 7946 (sgs->group_util * imbalance_pct)) 7947 + return true; 7948 + 7949 + if ((sgs->group_capacity * imbalance_pct) < 7950 + (sgs->group_runnable * 100)) 8227 7951 return true; 8228 7952 8229 7953 return false; ··· 8324 8040 8325 8041 sgs->group_load += cpu_load(rq); 8326 8042 sgs->group_util += cpu_util(i); 8043 + sgs->group_runnable += cpu_runnable(rq); 8327 8044 sgs->sum_h_nr_running += rq->cfs.h_nr_running; 8328 8045 8329 8046 nr_running = rq->nr_running; ··· 8600 8315 8601 8316 sgs->group_load += cpu_load_without(rq, p); 8602 8317 sgs->group_util += cpu_util_without(i, p); 8318 + sgs->group_runnable += cpu_runnable_without(rq, p); 8603 8319 local = task_running_on_cpu(i, p); 8604 8320 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; 8605 8321 ··· 8631 8345 * Computing avg_load makes sense only when group is fully busy or 8632 8346 * overloaded 8633 8347 */ 8634 - if (sgs->group_type < group_fully_busy) 8348 + if (sgs->group_type == group_fully_busy || 8349 + sgs->group_type == group_overloaded) 8635 8350 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / 8636 8351 sgs->group_capacity; 8637 8352 } ··· 8915 8628 } 8916 8629 } 8917 8630 8631 + static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) 8632 + { 8633 + 
unsigned int imbalance_min; 8634 + 8635 + /* 8636 + * Allow a small imbalance based on a simple pair of communicating 8637 + * tasks that remain local when the source domain is almost idle. 8638 + */ 8639 + imbalance_min = 2; 8640 + if (src_nr_running <= imbalance_min) 8641 + return 0; 8642 + 8643 + return imbalance; 8644 + } 8645 + 8918 8646 /** 8919 8647 * calculate_imbalance - Calculate the amount of imbalance present within the 8920 8648 * groups of a given sched_domain during load balance. ··· 9026 8724 } 9027 8725 9028 8726 /* Consider allowing a small imbalance between NUMA groups */ 9029 - if (env->sd->flags & SD_NUMA) { 9030 - unsigned int imbalance_min; 9031 - 9032 - /* 9033 - * Compute an allowed imbalance based on a simple 9034 - * pair of communicating tasks that should remain 9035 - * local and ignore them. 9036 - * 9037 - * NOTE: Generally this would have been based on 9038 - * the domain size and this was evaluated. However, 9039 - * the benefit is similar across a range of workloads 9040 - * and machines but scaling by the domain size adds 9041 - * the risk that lower domains have to be rebalanced. 9042 - */ 9043 - imbalance_min = 2; 9044 - if (busiest->sum_nr_running <= imbalance_min) 9045 - env->imbalance = 0; 9046 - } 8727 + if (env->sd->flags & SD_NUMA) 8728 + env->imbalance = adjust_numa_imbalance(env->imbalance, 8729 + busiest->sum_nr_running); 9047 8730 9048 8731 return; 9049 8732 } ··· 9313 9026 9314 9027 case migrate_util: 9315 9028 util = cpu_util(cpu_of(rq)); 9029 + 9030 + /* 9031 + * Don't try to pull utilization from a CPU with one 9032 + * running task. Whatever its utilization, we will fail 9033 + * detach the task. 9034 + */ 9035 + if (nr_running <= 1) 9036 + continue; 9316 9037 9317 9038 if (busiest_util < util) { 9318 9039 busiest_util = util;

+62 -28

kernel/sched/pelt.c

··· 121 121 */ 122 122 if (periods) { 123 123 sa->load_sum = decay_load(sa->load_sum, periods); 124 - sa->runnable_load_sum = 125 - decay_load(sa->runnable_load_sum, periods); 124 + sa->runnable_sum = 125 + decay_load(sa->runnable_sum, periods); 126 126 sa->util_sum = decay_load((u64)(sa->util_sum), periods); 127 127 128 128 /* ··· 149 149 if (load) 150 150 sa->load_sum += load * contrib; 151 151 if (runnable) 152 - sa->runnable_load_sum += runnable * contrib; 152 + sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT; 153 153 if (running) 154 154 sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; 155 155 ··· 238 238 } 239 239 240 240 static __always_inline void 241 - ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) 241 + ___update_load_avg(struct sched_avg *sa, unsigned long load) 242 242 { 243 243 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; 244 244 ··· 246 246 * Step 2: update *_avg. 247 247 */ 248 248 sa->load_avg = div_u64(load * sa->load_sum, divider); 249 - sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); 249 + sa->runnable_avg = div_u64(sa->runnable_sum, divider); 250 250 WRITE_ONCE(sa->util_avg, sa->util_sum / divider); 251 251 } 252 252 ··· 254 254 * sched_entity: 255 255 * 256 256 * task: 257 - * se_runnable() == se_weight() 257 + * se_weight() = se->load.weight 258 + * se_runnable() = !!on_rq 258 259 * 259 260 * group: [ see update_cfs_group() ] 260 261 * se_weight() = tg->weight * grq->load_avg / tg->load_avg 261 - * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg 262 + * se_runnable() = grq->h_nr_running 262 263 * 263 - * load_sum := runnable_sum 264 - * load_avg = se_weight(se) * runnable_avg 264 + * runnable_sum = se_runnable() * runnable = grq->runnable_sum 265 + * runnable_avg = runnable_sum 265 266 * 266 - * runnable_load_sum := runnable_sum 267 - * runnable_load_avg = se_runnable(se) * runnable_avg 268 - * 269 - * XXX collapse load_sum and 
runnable_load_sum 267 + * load_sum := runnable 268 + * load_avg = se_weight(se) * load_sum 270 269 * 271 270 * cfq_rq: 272 271 * 272 + * runnable_sum = \Sum se->avg.runnable_sum 273 + * runnable_avg = \Sum se->avg.runnable_avg 274 + * 273 275 * load_sum = \Sum se_weight(se) * se->avg.load_sum 274 276 * load_avg = \Sum se->avg.load_avg 275 - * 276 - * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum 277 - * runnable_load_avg = \Sum se->avg.runable_load_avg 278 277 */ 279 278 280 279 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) 281 280 { 282 281 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 283 - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 282 + ___update_load_avg(&se->avg, se_weight(se)); 284 283 trace_pelt_se_tp(se); 285 284 return 1; 286 285 } ··· 289 290 290 291 int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) 291 292 { 292 - if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, 293 + if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se), 293 294 cfs_rq->curr == se)) { 294 295 295 - ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 296 + ___update_load_avg(&se->avg, se_weight(se)); 296 297 cfs_se_util_change(&se->avg); 297 298 trace_pelt_se_tp(se); 298 299 return 1; ··· 305 306 { 306 307 if (___update_load_sum(now, &cfs_rq->avg, 307 308 scale_load_down(cfs_rq->load.weight), 308 - scale_load_down(cfs_rq->runnable_weight), 309 + cfs_rq->h_nr_running, 309 310 cfs_rq->curr != NULL)) { 310 311 311 - ___update_load_avg(&cfs_rq->avg, 1, 1); 312 + ___update_load_avg(&cfs_rq->avg, 1); 312 313 trace_pelt_cfs_tp(cfs_rq); 313 314 return 1; 314 315 } ··· 321 322 * 322 323 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked 323 324 * util_sum = cpu_scale * load_sum 324 - * runnable_load_sum = load_sum 325 + * runnable_sum = util_sum 325 326 * 326 - * load_avg and runnable_load_avg are not supported and meaningless. 
327 + * load_avg and runnable_avg are not supported and meaningless. 327 328 * 328 329 */ 329 330 ··· 334 335 running, 335 336 running)) { 336 337 337 - ___update_load_avg(&rq->avg_rt, 1, 1); 338 + ___update_load_avg(&rq->avg_rt, 1); 338 339 trace_pelt_rt_tp(rq); 339 340 return 1; 340 341 } ··· 347 348 * 348 349 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked 349 350 * util_sum = cpu_scale * load_sum 350 - * runnable_load_sum = load_sum 351 + * runnable_sum = util_sum 352 + * 353 + * load_avg and runnable_avg are not supported and meaningless. 351 354 * 352 355 */ 353 356 ··· 360 359 running, 361 360 running)) { 362 361 363 - ___update_load_avg(&rq->avg_dl, 1, 1); 362 + ___update_load_avg(&rq->avg_dl, 1); 364 363 trace_pelt_dl_tp(rq); 365 364 return 1; 366 365 } ··· 368 367 return 0; 369 368 } 370 369 370 + #ifdef CONFIG_SCHED_THERMAL_PRESSURE 371 + /* 372 + * thermal: 373 + * 374 + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked 375 + * 376 + * util_avg and runnable_load_avg are not supported and meaningless. 377 + * 378 + * Unlike rt/dl utilization tracking that track time spent by a cpu 379 + * running a rt/dl task through util_avg, the average thermal pressure is 380 + * tracked through load_avg. This is because thermal pressure signal is 381 + * time weighted "delta" capacity unlike util_avg which is binary. 382 + * "delta capacity" = actual capacity - 383 + * capped capacity a cpu due to a thermal event. 
384 + */ 385 + 386 + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 387 + { 388 + if (___update_load_sum(now, &rq->avg_thermal, 389 + capacity, 390 + capacity, 391 + capacity)) { 392 + ___update_load_avg(&rq->avg_thermal, 1); 393 + trace_pelt_thermal_tp(rq); 394 + return 1; 395 + } 396 + 397 + return 0; 398 + } 399 + #endif 400 + 371 401 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 372 402 /* 373 403 * irq: 374 404 * 375 405 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked 376 406 * util_sum = cpu_scale * load_sum 377 - * runnable_load_sum = load_sum 407 + * runnable_sum = util_sum 408 + * 409 + * load_avg and runnable_avg are not supported and meaningless. 378 410 * 379 411 */ 380 412 ··· 444 410 1); 445 411 446 412 if (ret) { 447 - ___update_load_avg(&rq->avg_irq, 1, 1); 413 + ___update_load_avg(&rq->avg_irq, 1); 448 414 trace_pelt_irq_tp(rq); 449 415 } 450 416

+31

kernel/sched/pelt.h

··· 7 7 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); 8 8 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); 9 9 10 + #ifdef CONFIG_SCHED_THERMAL_PRESSURE 11 + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); 12 + 13 + static inline u64 thermal_load_avg(struct rq *rq) 14 + { 15 + return READ_ONCE(rq->avg_thermal.load_avg); 16 + } 17 + #else 18 + static inline int 19 + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 20 + { 21 + return 0; 22 + } 23 + 24 + static inline u64 thermal_load_avg(struct rq *rq) 25 + { 26 + return 0; 27 + } 28 + #endif 29 + 10 30 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 11 31 int update_irq_load_avg(struct rq *rq, u64 running); 12 32 #else ··· 174 154 175 155 static inline int 176 156 update_dl_rq_load_avg(u64 now, struct rq *rq, int running) 157 + { 158 + return 0; 159 + } 160 + 161 + static inline int 162 + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 163 + { 164 + return 0; 165 + } 166 + 167 + static inline u64 thermal_load_avg(struct rq *rq) 177 168 { 178 169 return 0; 179 170 }

+81 -32

kernel/sched/psi.c

··· 225 225 case PSI_MEM_FULL: 226 226 return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; 227 227 case PSI_CPU_SOME: 228 - return tasks[NR_RUNNING] > 1; 228 + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; 229 229 case PSI_NONIDLE: 230 230 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || 231 231 tasks[NR_RUNNING]; ··· 669 669 groupc->times[PSI_NONIDLE] += delta; 670 670 } 671 671 672 - static u32 psi_group_change(struct psi_group *group, int cpu, 673 - unsigned int clear, unsigned int set) 672 + static void psi_group_change(struct psi_group *group, int cpu, 673 + unsigned int clear, unsigned int set, 674 + bool wake_clock) 674 675 { 675 676 struct psi_group_cpu *groupc; 677 + u32 state_mask = 0; 676 678 unsigned int t, m; 677 679 enum psi_states s; 678 - u32 state_mask = 0; 679 680 680 681 groupc = per_cpu_ptr(group->pcpu, cpu); 681 682 ··· 696 695 if (!(m & (1 << t))) 697 696 continue; 698 697 if (groupc->tasks[t] == 0 && !psi_bug) { 699 - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", 698 + printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", 700 699 cpu, t, groupc->tasks[0], 701 700 groupc->tasks[1], groupc->tasks[2], 702 - clear, set); 701 + groupc->tasks[3], clear, set); 703 702 psi_bug = 1; 704 703 } 705 704 groupc->tasks[t]--; ··· 718 717 719 718 write_seqcount_end(&groupc->seq); 720 719 721 - return state_mask; 720 + if (state_mask & group->poll_states) 721 + psi_schedule_poll_work(group, 1); 722 + 723 + if (wake_clock && !delayed_work_pending(&group->avgs_work)) 724 + schedule_delayed_work(&group->avgs_work, PSI_FREQ); 722 725 } 723 726 724 727 static struct psi_group *iterate_groups(struct task_struct *task, void **iter) ··· 749 744 return &psi_system; 750 745 } 751 746 747 + static void psi_flags_change(struct task_struct *task, int clear, int set) 748 + { 749 + if (((task->psi_flags & set) || 750 + (task->psi_flags & clear) != clear) && 751 + !psi_bug) { 752 + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", 753 + task->pid, task->comm, task_cpu(task), 754 + task->psi_flags, clear, set); 755 + psi_bug = 1; 756 + } 757 + 758 + task->psi_flags &= ~clear; 759 + task->psi_flags |= set; 760 + } 761 + 752 762 void psi_task_change(struct task_struct *task, int clear, int set) 753 763 { 754 764 int cpu = task_cpu(task); ··· 774 754 if (!task->pid) 775 755 return; 776 756 777 - if (((task->psi_flags & set) || 778 - (task->psi_flags & clear) != clear) && 779 - !psi_bug) { 780 - printk_deferred(KERN_ERR "psi: inconsistent task state! 
task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", 781 - task->pid, task->comm, cpu, 782 - task->psi_flags, clear, set); 783 - psi_bug = 1; 784 - } 785 - 786 - task->psi_flags &= ~clear; 787 - task->psi_flags |= set; 757 + psi_flags_change(task, clear, set); 788 758 789 759 /* 790 760 * Periodic aggregation shuts off if there is a period of no ··· 787 777 wq_worker_last_func(task) == psi_avgs_work)) 788 778 wake_clock = false; 789 779 790 - while ((group = iterate_groups(task, &iter))) { 791 - u32 state_mask = psi_group_change(group, cpu, clear, set); 780 + while ((group = iterate_groups(task, &iter))) 781 + psi_group_change(group, cpu, clear, set, wake_clock); 782 + } 792 783 793 - if (state_mask & group->poll_states) 794 - psi_schedule_poll_work(group, 1); 784 + void psi_task_switch(struct task_struct *prev, struct task_struct *next, 785 + bool sleep) 786 + { 787 + struct psi_group *group, *common = NULL; 788 + int cpu = task_cpu(prev); 789 + void *iter; 795 790 796 - if (wake_clock && !delayed_work_pending(&group->avgs_work)) 797 - schedule_delayed_work(&group->avgs_work, PSI_FREQ); 791 + if (next->pid) { 792 + psi_flags_change(next, 0, TSK_ONCPU); 793 + /* 794 + * When moving state between tasks, the group that 795 + * contains them both does not change: we can stop 796 + * updating the tree once we reach the first common 797 + * ancestor. Iterate @next's ancestors until we 798 + * encounter @prev's state. 799 + */ 800 + iter = NULL; 801 + while ((group = iterate_groups(next, &iter))) { 802 + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { 803 + common = group; 804 + break; 805 + } 806 + 807 + psi_group_change(group, cpu, 0, TSK_ONCPU, true); 808 + } 809 + } 810 + 811 + /* 812 + * If this is a voluntary sleep, dequeue will have taken care 813 + * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We 814 + * only need to deal with it during preemption. 
815 + */ 816 + if (sleep) 817 + return; 818 + 819 + if (prev->pid) { 820 + psi_flags_change(prev, TSK_ONCPU, 0); 821 + 822 + iter = NULL; 823 + while ((group = iterate_groups(prev, &iter)) && group != common) 824 + psi_group_change(group, cpu, TSK_ONCPU, 0, true); 798 825 } 799 826 } 800 827 ··· 865 818 if (static_branch_likely(&psi_disabled)) 866 819 return; 867 820 868 - *flags = current->flags & PF_MEMSTALL; 821 + *flags = current->in_memstall; 869 822 if (*flags) 870 823 return; 871 824 /* 872 - * PF_MEMSTALL setting & accounting needs to be atomic wrt 825 + * in_memstall setting & accounting needs to be atomic wrt 873 826 * changes to the task's scheduling state, otherwise we can 874 827 * race with CPU migration. 875 828 */ 876 829 rq = this_rq_lock_irq(&rf); 877 830 878 - current->flags |= PF_MEMSTALL; 831 + current->in_memstall = 1; 879 832 psi_task_change(current, 0, TSK_MEMSTALL); 880 833 881 834 rq_unlock_irq(rq, &rf); ··· 898 851 if (*flags) 899 852 return; 900 853 /* 901 - * PF_MEMSTALL clearing & accounting needs to be atomic wrt 854 + * in_memstall clearing & accounting needs to be atomic wrt 902 855 * changes to the task's scheduling state, otherwise we could 903 856 * race with CPU migration. 904 857 */ 905 858 rq = this_rq_lock_irq(&rf); 906 859 907 - current->flags &= ~PF_MEMSTALL; 860 + current->in_memstall = 0; 908 861 psi_task_change(current, TSK_MEMSTALL, 0); 909 862 910 863 rq_unlock_irq(rq, &rf); ··· 963 916 964 917 rq = task_rq_lock(task, &rf); 965 918 966 - if (task_on_rq_queued(task)) 919 + if (task_on_rq_queued(task)) { 967 920 task_flags = TSK_RUNNING; 968 - else if (task->in_iowait) 921 + if (task_current(rq, task)) 922 + task_flags |= TSK_ONCPU; 923 + } else if (task->in_iowait) 969 924 task_flags = TSK_IOWAIT; 970 925 971 - if (task->flags & PF_MEMSTALL) 926 + if (task->in_memstall) 972 927 task_flags |= TSK_MEMSTALL; 973 928 974 929 if (task_flags)

+42 -24

kernel/sched/rt.c

··· 1475 1475 int target = find_lowest_rq(p); 1476 1476 1477 1477 /* 1478 + * Bail out if we were forcing a migration to find a better 1479 + * fitting CPU but our search failed. 1480 + */ 1481 + if (!test && target != -1 && !rt_task_fits_capacity(p, target)) 1482 + goto out_unlock; 1483 + 1484 + /* 1478 1485 * Don't bother moving it if the destination CPU is 1479 1486 * not running a lower priority task. 1480 1487 */ ··· 1489 1482 p->prio < cpu_rq(target)->rt.highest_prio.curr) 1490 1483 cpu = target; 1491 1484 } 1485 + 1486 + out_unlock: 1492 1487 rcu_read_unlock(); 1493 1488 1494 1489 out: ··· 1504 1495 * let's hope p can move out. 1505 1496 */ 1506 1497 if (rq->curr->nr_cpus_allowed == 1 || 1507 - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) 1498 + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) 1508 1499 return; 1509 1500 1510 1501 /* ··· 1512 1503 * see if it is pushed or pulled somewhere else. 1513 1504 */ 1514 1505 if (p->nr_cpus_allowed != 1 && 1515 - cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) 1506 + cpupri_find(&rq->rd->cpupri, p, NULL)) 1516 1507 return; 1517 1508 1518 1509 /* ··· 1656 1647 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1657 1648 { 1658 1649 if (!task_running(rq, p) && 1659 - cpumask_test_cpu(cpu, p->cpus_ptr) && 1660 - rt_task_fits_capacity(p, cpu)) 1650 + cpumask_test_cpu(cpu, p->cpus_ptr)) 1661 1651 return 1; 1662 1652 1663 1653 return 0; ··· 1690 1682 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); 1691 1683 int this_cpu = smp_processor_id(); 1692 1684 int cpu = task_cpu(task); 1685 + int ret; 1693 1686 1694 1687 /* Make sure the mask is initialized first */ 1695 1688 if (unlikely(!lowest_mask)) ··· 1699 1690 if (task->nr_cpus_allowed == 1) 1700 1691 return -1; /* No other targets possible */ 1701 1692 1702 - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, 1703 - rt_task_fits_capacity)) 1693 + /* 1694 + * If we're on asym system ensure we consider the different 
capacities 1695 + * of the CPUs when searching for the lowest_mask. 1696 + */ 1697 + if (static_branch_unlikely(&sched_asym_cpucapacity)) { 1698 + 1699 + ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, 1700 + task, lowest_mask, 1701 + rt_task_fits_capacity); 1702 + } else { 1703 + 1704 + ret = cpupri_find(&task_rq(task)->rd->cpupri, 1705 + task, lowest_mask); 1706 + } 1707 + 1708 + if (!ret) 1704 1709 return -1; /* No targets found */ 1705 1710 1706 1711 /* ··· 2225 2202 (rq->curr->nr_cpus_allowed < 2 || 2226 2203 rq->curr->prio <= p->prio); 2227 2204 2228 - if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) 2205 + if (need_to_push) 2229 2206 push_rt_tasks(rq); 2230 2207 } 2231 2208 ··· 2297 2274 */ 2298 2275 if (task_on_rq_queued(p) && rq->curr != p) { 2299 2276 #ifdef CONFIG_SMP 2300 - bool need_to_push = rq->rt.overloaded || 2301 - !rt_task_fits_capacity(p, cpu_of(rq)); 2302 - 2303 - if (p->nr_cpus_allowed > 1 && need_to_push) 2277 + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2304 2278 rt_queue_push_tasks(rq); 2305 2279 #endif /* CONFIG_SMP */ 2306 2280 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) ··· 2469 2449 */ 2470 2450 static DEFINE_MUTEX(rt_constraints_mutex); 2471 2451 2472 - /* Must be called with tasklist_lock held */ 2473 2452 static inline int tg_has_rt_tasks(struct task_group *tg) 2474 2453 { 2475 - struct task_struct *g, *p; 2454 + struct task_struct *task; 2455 + struct css_task_iter it; 2456 + int ret = 0; 2476 2457 2477 2458 /* 2478 2459 * Autogroups do not have RT tasks; see autogroup_create(). 
··· 2481 2460 if (task_group_is_autogroup(tg)) 2482 2461 return 0; 2483 2462 2484 - for_each_process_thread(g, p) { 2485 - if (rt_task(p) && task_group(p) == tg) 2486 - return 1; 2487 - } 2463 + css_task_iter_start(&tg->css, 0, &it); 2464 + while (!ret && (task = css_task_iter_next(&it))) 2465 + ret |= rt_task(task); 2466 + css_task_iter_end(&it); 2488 2467 2489 - return 0; 2468 + return ret; 2490 2469 } 2491 2470 2492 2471 struct rt_schedulable_data { ··· 2517 2496 return -EINVAL; 2518 2497 2519 2498 /* 2520 - * Ensure we don't starve existing RT tasks. 2499 + * Ensure we don't starve existing RT tasks if runtime turns zero. 2521 2500 */ 2522 - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 2501 + if (rt_bandwidth_enabled() && !runtime && 2502 + tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) 2523 2503 return -EBUSY; 2524 2504 2525 2505 total = to_ratio(period, runtime); ··· 2586 2564 return -EINVAL; 2587 2565 2588 2566 mutex_lock(&rt_constraints_mutex); 2589 - read_lock(&tasklist_lock); 2590 2567 err = __rt_schedulable(tg, rt_period, rt_runtime); 2591 2568 if (err) 2592 2569 goto unlock; ··· 2603 2582 } 2604 2583 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 2605 2584 unlock: 2606 - read_unlock(&tasklist_lock); 2607 2585 mutex_unlock(&rt_constraints_mutex); 2608 2586 2609 2587 return err; ··· 2661 2641 int ret = 0; 2662 2642 2663 2643 mutex_lock(&rt_constraints_mutex); 2664 - read_lock(&tasklist_lock); 2665 2644 ret = __rt_schedulable(NULL, 0, 0); 2666 - read_unlock(&tasklist_lock); 2667 2645 mutex_unlock(&rt_constraints_mutex); 2668 2646 2669 2647 return ret;

+58 -11

kernel/sched/sched.h

··· 118 118 #ifdef CONFIG_64BIT 119 119 # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 120 120 # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) 121 - # define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) 121 + # define scale_load_down(w) \ 122 + ({ \ 123 + unsigned long __w = (w); \ 124 + if (__w) \ 125 + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ 126 + __w; \ 127 + }) 122 128 #else 123 129 # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) 124 130 # define scale_load(w) (w) ··· 311 305 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 312 306 } 313 307 314 - extern void dl_change_utilization(struct task_struct *p, u64 new_bw); 315 308 extern void init_dl_bw(struct dl_bw *dl_b); 316 309 extern int sched_dl_global_validate(void); 317 310 extern void sched_dl_do_global(void); ··· 494 489 /* CFS-related fields in a runqueue */ 495 490 struct cfs_rq { 496 491 struct load_weight load; 497 - unsigned long runnable_weight; 498 492 unsigned int nr_running; 499 493 unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ 500 494 unsigned int idle_h_nr_running; /* SCHED_IDLE */ ··· 532 528 int nr; 533 529 unsigned long load_avg; 534 530 unsigned long util_avg; 535 - unsigned long runnable_sum; 531 + unsigned long runnable_avg; 536 532 } removed; 537 533 538 534 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 692 688 #ifdef CONFIG_FAIR_GROUP_SCHED 693 689 /* An entity is a task if it doesn't "own" a runqueue */ 694 690 #define entity_is_task(se) (!se->my_q) 691 + 692 + static inline void se_update_runnable(struct sched_entity *se) 693 + { 694 + if (!entity_is_task(se)) 695 + se->runnable_weight = se->my_q->h_nr_running; 696 + } 697 + 698 + static inline long se_runnable(struct sched_entity *se) 699 + { 700 + if (entity_is_task(se)) 701 + return !!se->on_rq; 702 + else 703 + return se->runnable_weight; 704 + } 705 + 695 706 #else 696 707 #define entity_is_task(se) 1 708 + 709 + static inline void se_update_runnable(struct sched_entity 
*se) {} 710 + 711 + static inline long se_runnable(struct sched_entity *se) 712 + { 713 + return !!se->on_rq; 714 + } 697 715 #endif 698 716 699 717 #ifdef CONFIG_SMP ··· 727 701 return scale_load_down(se->load.weight); 728 702 } 729 703 730 - static inline long se_runnable(struct sched_entity *se) 731 - { 732 - return scale_load_down(se->runnable_weight); 733 - } 734 704 735 705 static inline bool sched_asym_prefer(int a, int b) 736 706 { ··· 966 944 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 967 945 struct sched_avg avg_irq; 968 946 #endif 947 + #ifdef CONFIG_SCHED_THERMAL_PRESSURE 948 + struct sched_avg avg_thermal; 949 + #endif 969 950 u64 idle_stamp; 970 951 u64 avg_idle; 971 952 ··· 1130 1105 assert_clock_updated(rq); 1131 1106 1132 1107 return rq->clock_task; 1108 + } 1109 + 1110 + /** 1111 + * By default the decay is the default pelt decay period. 1112 + * The decay shift can change the decay period in 1113 + * multiples of 32. 1114 + * Decay shift Decay period(ms) 1115 + * 0 32 1116 + * 1 64 1117 + * 2 128 1118 + * 3 256 1119 + * 4 512 1120 + */ 1121 + extern int sched_thermal_decay_shift; 1122 + 1123 + static inline u64 rq_clock_thermal(struct rq *rq) 1124 + { 1125 + return rq_clock_task(rq) >> sched_thermal_decay_shift; 1133 1126 } 1134 1127 1135 1128 static inline void rq_clock_skip_update(struct rq *rq) ··· 1379 1336 #define for_each_domain(cpu, __sd) \ 1380 1337 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ 1381 1338 __sd; __sd = __sd->parent) 1382 - 1383 - #define for_each_lower_domain(sd) for (; sd; sd = sd->child) 1384 1339 1385 1340 /** 1386 1341 * highest_flag_domain - Return highest sched_domain containing flag. 
··· 1910 1869 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 1911 1870 extern void init_dl_task_timer(struct sched_dl_entity *dl_se); 1912 1871 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1913 - extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1914 1872 1915 1873 #define BW_SHIFT 20 1916 1874 #define BW_UNIT (1 << BW_SHIFT) ··· 2007 1967 } 2008 1968 2009 1969 #endif /* CONFIG_SCHED_HRTICK */ 1970 + 1971 + #ifndef arch_scale_freq_tick 1972 + static __always_inline 1973 + void arch_scale_freq_tick(void) 1974 + { 1975 + } 1976 + #endif 2010 1977 2011 1978 #ifndef arch_scale_freq_capacity 2012 1979 static __always_inline

+26 -5

kernel/sched/stats.h

··· 70 70 return; 71 71 72 72 if (!wakeup || p->sched_psi_wake_requeue) { 73 - if (p->flags & PF_MEMSTALL) 73 + if (p->in_memstall) 74 74 set |= TSK_MEMSTALL; 75 75 if (p->sched_psi_wake_requeue) 76 76 p->sched_psi_wake_requeue = 0; ··· 90 90 return; 91 91 92 92 if (!sleep) { 93 - if (p->flags & PF_MEMSTALL) 93 + if (p->in_memstall) 94 94 clear |= TSK_MEMSTALL; 95 95 } else { 96 + /* 97 + * When a task sleeps, schedule() dequeues it before 98 + * switching to the next one. Merge the clearing of 99 + * TSK_RUNNING and TSK_ONCPU to save an unnecessary 100 + * psi_task_change() call in psi_sched_switch(). 101 + */ 102 + clear |= TSK_ONCPU; 103 + 96 104 if (p->in_iowait) 97 105 set |= TSK_IOWAIT; 98 106 } ··· 117 109 * deregister its sleep-persistent psi states from the old 118 110 * queue, and let psi_enqueue() know it has to requeue. 119 111 */ 120 - if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { 112 + if (unlikely(p->in_iowait || p->in_memstall)) { 121 113 struct rq_flags rf; 122 114 struct rq *rq; 123 115 int clear = 0; 124 116 125 117 if (p->in_iowait) 126 118 clear |= TSK_IOWAIT; 127 - if (p->flags & PF_MEMSTALL) 119 + if (p->in_memstall) 128 120 clear |= TSK_MEMSTALL; 129 121 130 122 rq = __task_rq_lock(p, &rf); ··· 134 126 } 135 127 } 136 128 129 + static inline void psi_sched_switch(struct task_struct *prev, 130 + struct task_struct *next, 131 + bool sleep) 132 + { 133 + if (static_branch_likely(&psi_disabled)) 134 + return; 135 + 136 + psi_task_switch(prev, next, sleep); 137 + } 138 + 137 139 static inline void psi_task_tick(struct rq *rq) 138 140 { 139 141 if (static_branch_likely(&psi_disabled)) 140 142 return; 141 143 142 - if (unlikely(rq->curr->flags & PF_MEMSTALL)) 144 + if (unlikely(rq->curr->in_memstall)) 143 145 psi_memstall_tick(rq->curr, cpu_of(rq)); 144 146 } 145 147 #else /* CONFIG_PSI */ 146 148 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} 147 149 static inline void psi_dequeue(struct task_struct *p, bool 
sleep) {} 148 150 static inline void psi_ttwu_dequeue(struct task_struct *p) {} 151 + static inline void psi_sched_switch(struct task_struct *prev, 152 + struct task_struct *next, 153 + bool sleep) {} 149 154 static inline void psi_task_tick(struct rq *rq) {} 150 155 #endif /* CONFIG_PSI */ 151 156

+13 -14

kernel/sched/topology.c

··· 317 317 * EAS can be used on a root domain if it meets all the following conditions: 318 318 * 1. an Energy Model (EM) is available; 319 319 * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. 320 - * 3. the EM complexity is low enough to keep scheduling overheads low; 321 - * 4. schedutil is driving the frequency of all CPUs of the rd; 320 + * 3. no SMT is detected. 321 + * 4. the EM complexity is low enough to keep scheduling overheads low; 322 + * 5. schedutil is driving the frequency of all CPUs of the rd; 322 323 * 323 324 * The complexity of the Energy Model is defined as: 324 325 * ··· 358 357 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", 359 358 cpumask_pr_args(cpu_map)); 360 359 } 360 + goto free; 361 + } 362 + 363 + /* EAS definitely does *not* handle SMT */ 364 + if (sched_smt_active()) { 365 + pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", 366 + cpumask_pr_args(cpu_map)); 361 367 goto free; 362 368 } 363 369 ··· 1382 1374 * Convert topological properties into behaviour. 1383 1375 */ 1384 1376 1385 - if (sd->flags & SD_ASYM_CPUCAPACITY) { 1386 - struct sched_domain *t = sd; 1387 - 1388 - /* 1389 - * Don't attempt to spread across CPUs of different capacities. 1390 - */ 1391 - if (sd->child) 1392 - sd->child->flags &= ~SD_PREFER_SIBLING; 1393 - 1394 - for_each_lower_domain(t) 1395 - t->flags |= SD_BALANCE_WAKE; 1396 - } 1377 + /* Don't attempt to spread across CPUs of different capacities. */ 1378 + if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) 1379 + sd->child->flags &= ~SD_PREFER_SIBLING; 1397 1380 1398 1381 if (sd->flags & SD_SHARE_CPUCAPACITY) { 1399 1382 sd->imbalance_pct = 110;

+29

lib/cpumask.c

··· 232 232 BUG(); 233 233 } 234 234 EXPORT_SYMBOL(cpumask_local_spread); 235 + 236 + static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); 237 + 238 + /** 239 + * Returns an arbitrary cpu within src1p & src2p. 240 + * 241 + * Iterated calls using the same src1p and src2p will be distributed within 242 + * their intersection. 243 + * 244 + * Returns >= nr_cpu_ids if the intersection is empty. 245 + */ 246 + int cpumask_any_and_distribute(const struct cpumask *src1p, 247 + const struct cpumask *src2p) 248 + { 249 + int next, prev; 250 + 251 + /* NOTE: our first selection will skip 0. */ 252 + prev = __this_cpu_read(distribute_cpu_mask_prev); 253 + 254 + next = cpumask_next_and(prev, src1p, src2p); 255 + if (next >= nr_cpu_ids) 256 + next = cpumask_first_and(src1p, src2p); 257 + 258 + if (next < nr_cpu_ids) 259 + __this_cpu_write(distribute_cpu_mask_prev, next); 260 + 261 + return next; 262 + } 263 + EXPORT_SYMBOL(cpumask_any_and_distribute);

Configure Feed

Configure Feed