Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+2 -1

Documentation/scheduler/sched-pelt.c

··· 20 20 int i; 21 21 unsigned int x; 22 22 23 - printf("static const u32 runnable_avg_yN_inv[] = {"); 23 + /* To silence -Wunused-but-set-variable warnings. */ 24 + printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {"); 24 25 for (i = 0; i < HALFLIFE; i++) { 25 26 x = ((1UL<<32)-1)*pow(y, i); 26 27

+1 -1

arch/arm/kernel/topology.c

··· 169 169 topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); 170 170 171 171 pr_info("CPU%u: update cpu_capacity %lu\n", 172 - cpu, topology_get_cpu_scale(NULL, cpu)); 172 + cpu, topology_get_cpu_scale(cpu)); 173 173 } 174 174 175 175 #else

+1 -1

arch/ia64/kernel/mca.c

··· 1831 1831 ti->cpu = cpu; 1832 1832 p->stack = ti; 1833 1833 p->state = TASK_UNINTERRUPTIBLE; 1834 - cpumask_set_cpu(cpu, &p->cpus_allowed); 1834 + cpumask_set_cpu(cpu, &p->cpus_mask); 1835 1835 INIT_LIST_HEAD(&p->tasks); 1836 1836 p->parent = p->real_parent = p->group_leader = p; 1837 1837 INIT_LIST_HEAD(&p->children);

+2 -2

arch/mips/include/asm/switch_to.h

··· 42 42 * inline to try to keep the overhead down. If we have been forced to run on 43 43 * a "CPU" with an FPU because of a previous high level of FP computation, 44 44 * but did not actually use the FPU during the most recent time-slice (CU1 45 - * isn't set), we undo the restriction on cpus_allowed. 45 + * isn't set), we undo the restriction on cpus_mask. 46 46 * 47 47 * We're not calling set_cpus_allowed() here, because we have no need to 48 48 * force prompt migration - we're already switching the current CPU to a ··· 57 57 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ 58 58 (!(KSTK_STATUS(prev) & ST0_CU1))) { \ 59 59 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ 60 - prev->cpus_allowed = prev->thread.user_cpus_allowed; \ 60 + prev->cpus_mask = prev->thread.user_cpus_allowed; \ 61 61 } \ 62 62 next->thread.emulated_fp = 0; \ 63 63 } while(0)

+1 -1

arch/mips/kernel/mips-mt-fpaff.c

··· 177 177 if (retval) 178 178 goto out_unlock; 179 179 180 - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); 180 + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr); 181 181 cpumask_and(&mask, &allowed, cpu_active_mask); 182 182 183 183 out_unlock:

+3 -3

arch/mips/kernel/traps.c

··· 891 891 * restricted the allowed set to exclude any CPUs with FPUs, 892 892 * we'll skip the procedure. 893 893 */ 894 - if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) { 894 + if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) { 895 895 cpumask_t tmask; 896 896 897 897 current->thread.user_cpus_allowed 898 - = current->cpus_allowed; 899 - cpumask_and(&tmask, &current->cpus_allowed, 898 + = current->cpus_mask; 899 + cpumask_and(&tmask, &current->cpus_mask, 900 900 &mt_fpu_cpumask); 901 901 set_cpus_allowed_ptr(current, &tmask); 902 902 set_thread_flag(TIF_FPUBOUND);

+1 -1

arch/powerpc/platforms/cell/spufs/sched.c

··· 128 128 * runqueue. The context will be rescheduled on the proper node 129 129 * if it is timesliced or preempted. 130 130 */ 131 - cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed); 131 + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr); 132 132 133 133 /* Save the current cpu id for spu interrupt routing. */ 134 134 ctx->last_ran = raw_smp_processor_id();

+1 -1

arch/x86/kernel/cpu/resctrl/pseudo_lock.c

··· 1503 1503 * may be scheduled elsewhere and invalidate entries in the 1504 1504 * pseudo-locked region. 1505 1505 */ 1506 - if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) { 1506 + if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { 1507 1507 mutex_unlock(&rdtgroup_mutex); 1508 1508 return -EINVAL; 1509 1509 }

+3 -3

drivers/base/arch_topology.c

··· 43 43 { 44 44 struct cpu *cpu = container_of(dev, struct cpu, dev); 45 45 46 - return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); 46 + return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id)); 47 47 } 48 48 49 49 static void update_topology_flags_workfn(struct work_struct *work); ··· 116 116 / capacity_scale; 117 117 topology_set_cpu_scale(cpu, capacity); 118 118 pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", 119 - cpu, topology_get_cpu_scale(NULL, cpu)); 119 + cpu, topology_get_cpu_scale(cpu)); 120 120 } 121 121 } 122 122 ··· 185 185 cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); 186 186 187 187 for_each_cpu(cpu, policy->related_cpus) { 188 - raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * 188 + raw_capacity[cpu] = topology_get_cpu_scale(cpu) * 189 189 policy->cpuinfo.max_freq / 1000UL; 190 190 capacity_scale = max(raw_capacity[cpu], capacity_scale); 191 191 }

+3 -3

drivers/infiniband/hw/hfi1/affinity.c

··· 1038 1038 struct hfi1_affinity_node *entry; 1039 1039 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; 1040 1040 const struct cpumask *node_mask, 1041 - *proc_mask = &current->cpus_allowed; 1041 + *proc_mask = current->cpus_ptr; 1042 1042 struct hfi1_affinity_node_list *affinity = &node_affinity; 1043 1043 struct cpu_mask_set *set = &affinity->proc; 1044 1044 ··· 1046 1046 * check whether process/context affinity has already 1047 1047 * been set 1048 1048 */ 1049 - if (cpumask_weight(proc_mask) == 1) { 1049 + if (current->nr_cpus_allowed == 1) { 1050 1050 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", 1051 1051 current->pid, current->comm, 1052 1052 cpumask_pr_args(proc_mask)); ··· 1057 1057 cpu = cpumask_first(proc_mask); 1058 1058 cpumask_set_cpu(cpu, &set->used); 1059 1059 goto done; 1060 - } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { 1060 + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { 1061 1061 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", 1062 1062 current->pid, current->comm, 1063 1063 cpumask_pr_args(proc_mask));

+1 -2

drivers/infiniband/hw/hfi1/sdma.c

··· 869 869 { 870 870 struct sdma_rht_node *rht_node; 871 871 struct sdma_engine *sde = NULL; 872 - const struct cpumask *current_mask = &current->cpus_allowed; 873 872 unsigned long cpu_id; 874 873 875 874 /* 876 875 * To ensure that always the same sdma engine(s) will be 877 876 * selected make sure the process is pinned to this CPU only. 878 877 */ 879 - if (cpumask_weight(current_mask) != 1) 878 + if (current->nr_cpus_allowed != 1) 880 879 goto out; 881 880 882 881 cpu_id = smp_processor_id();

+3 -4

drivers/infiniband/hw/qib/qib_file_ops.c

··· 1142 1142 static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) 1143 1143 { 1144 1144 struct qib_filedata *fd = fp->private_data; 1145 - const unsigned int weight = cpumask_weight(&current->cpus_allowed); 1145 + const unsigned int weight = current->nr_cpus_allowed; 1146 1146 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); 1147 1147 int local_cpu; 1148 1148 ··· 1623 1623 ret = find_free_ctxt(i_minor - 1, fp, uinfo); 1624 1624 else { 1625 1625 int unit; 1626 - const unsigned int cpu = cpumask_first(&current->cpus_allowed); 1627 - const unsigned int weight = 1628 - cpumask_weight(&current->cpus_allowed); 1626 + const unsigned int cpu = cpumask_first(current->cpus_ptr); 1627 + const unsigned int weight = current->nr_cpus_allowed; 1629 1628 1630 1629 if (weight == 1 && !test_bit(cpu, qib_cpulist)) 1631 1630 if (!find_hca(cpu, &unit) && unit >= 0)

+2 -2

fs/proc/array.c

··· 381 381 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 382 382 { 383 383 seq_printf(m, "Cpus_allowed:\t%*pb\n", 384 - cpumask_pr_args(&task->cpus_allowed)); 384 + cpumask_pr_args(task->cpus_ptr)); 385 385 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", 386 - cpumask_pr_args(&task->cpus_allowed)); 386 + cpumask_pr_args(task->cpus_ptr)); 387 387 } 388 388 389 389 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)

+1 -1

include/linux/arch_topology.h

··· 18 18 19 19 struct sched_domain; 20 20 static inline 21 - unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) 21 + unsigned long topology_get_cpu_scale(int cpu) 22 22 { 23 23 return per_cpu(cpu_scale, cpu); 24 24 }

+1 -1

include/linux/energy_model.h

··· 89 89 * like schedutil. 90 90 */ 91 91 cpu = cpumask_first(to_cpumask(pd->cpus)); 92 - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 92 + scale_cpu = arch_scale_cpu_capacity(cpu); 93 93 cs = &pd->table[pd->nr_cap_states - 1]; 94 94 freq = map_util_freq(max_util, cs->frequency, scale_cpu); 95 95

+34

include/linux/log2.h

··· 220 220 ilog2((n) - 1) + 1) : \ 221 221 __order_base_2(n) \ 222 222 ) 223 + 224 + static inline __attribute__((const)) 225 + int __bits_per(unsigned long n) 226 + { 227 + if (n < 2) 228 + return 1; 229 + if (is_power_of_2(n)) 230 + return order_base_2(n) + 1; 231 + return order_base_2(n); 232 + } 233 + 234 + /** 235 + * bits_per - calculate the number of bits required for the argument 236 + * @n: parameter 237 + * 238 + * This is constant-capable and can be used for compile time 239 + * initializations, e.g. bitfields. 240 + * 241 + * The first few values calculated by this routine: 242 + * bits_per(0) = 1 243 + * bits_per(1) = 1 244 + * bits_per(2) = 2 245 + * bits_per(3) = 2 246 + * bits_per(4) = 3 247 + * ... and so on. 248 + */ 249 + #define bits_per(n) \ 250 + ( \ 251 + __builtin_constant_p(n) ? ( \ 252 + ((n) == 0 || (n) == 1) \ 253 + ? 1 : ilog2(n) + 1 \ 254 + ) : \ 255 + __bits_per(n) \ 256 + ) 223 257 #endif /* _LINUX_LOG2_H */

+76 -3

include/linux/sched.h

··· 35 35 struct backing_dev_info; 36 36 struct bio_list; 37 37 struct blk_plug; 38 + struct capture_control; 38 39 struct cfs_rq; 39 40 struct fs_struct; 40 41 struct futex_pi_state; ··· 48 47 struct pipe_inode_info; 49 48 struct rcu_node; 50 49 struct reclaim_state; 51 - struct capture_control; 52 50 struct robust_list_head; 51 + struct root_domain; 52 + struct rq; 53 53 struct sched_attr; 54 54 struct sched_param; 55 55 struct seq_file; ··· 283 281 u64 gtime; 284 282 }; 285 283 284 + /* 285 + * Utilization clamp constraints. 286 + * @UCLAMP_MIN: Minimum utilization 287 + * @UCLAMP_MAX: Maximum utilization 288 + * @UCLAMP_CNT: Utilization clamp constraints count 289 + */ 290 + enum uclamp_id { 291 + UCLAMP_MIN = 0, 292 + UCLAMP_MAX, 293 + UCLAMP_CNT 294 + }; 295 + 286 296 struct sched_info { 287 297 #ifdef CONFIG_SCHED_INFO 288 298 /* Cumulative counters: */ ··· 325 311 */ 326 312 # define SCHED_FIXEDPOINT_SHIFT 10 327 313 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) 314 + 315 + /* Increase resolution of cpu_capacity calculations */ 316 + # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT 317 + # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) 328 318 329 319 struct load_weight { 330 320 unsigned long weight; ··· 578 560 struct hrtimer inactive_timer; 579 561 }; 580 562 563 + #ifdef CONFIG_UCLAMP_TASK 564 + /* Number of utilization clamp buckets (shorter alias) */ 565 + #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT 566 + 567 + /* 568 + * Utilization clamp for a scheduling entity 569 + * @value: clamp value "assigned" to a se 570 + * @bucket_id: bucket index corresponding to the "assigned" value 571 + * @active: the se is currently refcounted in a rq's bucket 572 + * @user_defined: the requested clamp value comes from user-space 573 + * 574 + * The bucket_id is the index of the clamp bucket matching the clamp value 575 + * which is pre-computed and stored to avoid expensive integer divisions from 576 + * the fast path. 
577 + * 578 + * The active bit is set whenever a task has got an "effective" value assigned, 579 + * which can be different from the clamp value "requested" from user-space. 580 + * This allows to know a task is refcounted in the rq's bucket corresponding 581 + * to the "effective" bucket_id. 582 + * 583 + * The user_defined bit is set whenever a task has got a task-specific clamp 584 + * value requested from userspace, i.e. the system defaults apply to this task 585 + * just as a restriction. This allows to relax default clamps when a less 586 + * restrictive task-specific value has been requested, thus allowing to 587 + * implement a "nice" semantic. For example, a task running with a 20% 588 + * default boost can still drop its own boosting to 0%. 589 + */ 590 + struct uclamp_se { 591 + unsigned int value : bits_per(SCHED_CAPACITY_SCALE); 592 + unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); 593 + unsigned int active : 1; 594 + unsigned int user_defined : 1; 595 + }; 596 + #endif /* CONFIG_UCLAMP_TASK */ 597 + 581 598 union rcu_special { 582 599 struct { 583 600 u8 blocked; ··· 693 640 #endif 694 641 struct sched_dl_entity dl; 695 642 643 + #ifdef CONFIG_UCLAMP_TASK 644 + /* Clamp values requested for a scheduling entity */ 645 + struct uclamp_se uclamp_req[UCLAMP_CNT]; 646 + /* Effective clamp values used for a scheduling entity */ 647 + struct uclamp_se uclamp[UCLAMP_CNT]; 648 + #endif 649 + 696 650 #ifdef CONFIG_PREEMPT_NOTIFIERS 697 651 /* List of struct preempt_notifier: */ 698 652 struct hlist_head preempt_notifiers; ··· 711 651 712 652 unsigned int policy; 713 653 int nr_cpus_allowed; 714 - cpumask_t cpus_allowed; 654 + const cpumask_t *cpus_ptr; 655 + cpumask_t cpus_mask; 715 656 716 657 #ifdef CONFIG_PREEMPT_RCU 717 658 int rcu_read_lock_nesting; ··· 1460 1399 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1461 1400 #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ 1462 1401 #define PF_UMH 0x02000000 /* I'm an 
Usermodehelper process */ 1463 - #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1402 + #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1464 1403 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1465 1404 #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ 1466 1405 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ ··· 1975 1914 } 1976 1915 1977 1916 #endif 1917 + 1918 + const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); 1919 + char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); 1920 + int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); 1921 + 1922 + const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); 1923 + const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); 1924 + const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); 1925 + 1926 + int sched_trace_rq_cpu(struct rq *rq); 1927 + 1928 + const struct cpumask *sched_trace_rd_span(struct root_domain *rd); 1978 1929 1979 1930 #endif

-8

include/linux/sched/nohz.h

··· 7 7 */ 8 8 9 9 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 10 - extern void cpu_load_update_nohz_start(void); 11 - extern void cpu_load_update_nohz_stop(void); 12 - #else 13 - static inline void cpu_load_update_nohz_start(void) { } 14 - static inline void cpu_load_update_nohz_stop(void) { } 15 - #endif 16 - 17 - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 18 10 extern void nohz_balance_enter_idle(int cpu); 19 11 extern int get_nohz_timer_target(void); 20 12 #else

+11

include/linux/sched/sysctl.h

··· 56 56 extern unsigned int sysctl_sched_rt_period; 57 57 extern int sysctl_sched_rt_runtime; 58 58 59 + #ifdef CONFIG_UCLAMP_TASK 60 + extern unsigned int sysctl_sched_uclamp_util_min; 61 + extern unsigned int sysctl_sched_uclamp_util_max; 62 + #endif 63 + 59 64 #ifdef CONFIG_CFS_BANDWIDTH 60 65 extern unsigned int sysctl_sched_cfs_bandwidth_slice; 61 66 #endif ··· 79 74 extern int sched_rt_handler(struct ctl_table *table, int write, 80 75 void __user *buffer, size_t *lenp, 81 76 loff_t *ppos); 77 + 78 + #ifdef CONFIG_UCLAMP_TASK 79 + extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, 80 + void __user *buffer, size_t *lenp, 81 + loff_t *ppos); 82 + #endif 82 83 83 84 extern int sysctl_numa_balancing(struct ctl_table *table, int write, 84 85 void __user *buffer, size_t *lenp,

+3 -22

include/linux/sched/topology.h

··· 7 7 #include <linux/sched/idle.h> 8 8 9 9 /* 10 - * Increase resolution of cpu_capacity calculations 11 - */ 12 - #define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT 13 - #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) 14 - 15 - /* 16 10 * sched-domains (multiprocessor balancing) declarations: 17 11 */ 18 12 #ifdef CONFIG_SMP ··· 78 84 unsigned int busy_factor; /* less balancing by factor if busy */ 79 85 unsigned int imbalance_pct; /* No balance until over watermark */ 80 86 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 81 - unsigned int busy_idx; 82 - unsigned int idle_idx; 83 - unsigned int newidle_idx; 84 - unsigned int wake_idx; 85 - unsigned int forkexec_idx; 86 87 87 88 int nohz_idle; /* NOHZ IDLE status */ 88 89 int flags; /* See SD_* */ ··· 190 201 # define SD_INIT_NAME(type) 191 202 #endif 192 203 193 - #ifndef arch_scale_cpu_capacity 194 - static __always_inline 195 - unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) 196 - { 197 - return SCHED_CAPACITY_SCALE; 198 - } 199 - #endif 200 - 201 204 #else /* CONFIG_SMP */ 202 205 203 206 struct sched_domain_attr; ··· 205 224 return true; 206 225 } 207 226 227 + #endif /* !CONFIG_SMP */ 228 + 208 229 #ifndef arch_scale_cpu_capacity 209 230 static __always_inline 210 - unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) 231 + unsigned long arch_scale_cpu_capacity(int cpu) 211 232 { 212 233 return SCHED_CAPACITY_SCALE; 213 234 } 214 235 #endif 215 - 216 - #endif /* !CONFIG_SMP */ 217 236 218 237 static inline int task_node(const struct task_struct *p) 219 238 {

+31

include/trace/events/sched.h

··· 594 594 595 595 TP_printk("cpu=%d", __entry->cpu) 596 596 ); 597 + 598 + /* 599 + * Following tracepoints are not exported in tracefs and provide hooking 600 + * mechanisms only for testing and debugging purposes. 601 + * 602 + * Postfixed with _tp to make them easily identifiable in the code. 603 + */ 604 + DECLARE_TRACE(pelt_cfs_tp, 605 + TP_PROTO(struct cfs_rq *cfs_rq), 606 + TP_ARGS(cfs_rq)); 607 + 608 + DECLARE_TRACE(pelt_rt_tp, 609 + TP_PROTO(struct rq *rq), 610 + TP_ARGS(rq)); 611 + 612 + DECLARE_TRACE(pelt_dl_tp, 613 + TP_PROTO(struct rq *rq), 614 + TP_ARGS(rq)); 615 + 616 + DECLARE_TRACE(pelt_irq_tp, 617 + TP_PROTO(struct rq *rq), 618 + TP_ARGS(rq)); 619 + 620 + DECLARE_TRACE(pelt_se_tp, 621 + TP_PROTO(struct sched_entity *se), 622 + TP_ARGS(se)); 623 + 624 + DECLARE_TRACE(sched_overutilized_tp, 625 + TP_PROTO(struct root_domain *rd, bool overutilized), 626 + TP_ARGS(rd, overutilized)); 627 + 597 628 #endif /* _TRACE_SCHED_H */ 598 629 599 630 /* This part must be outside protection */

+13 -1

include/uapi/linux/sched.h

··· 51 51 #define SCHED_FLAG_RESET_ON_FORK 0x01 52 52 #define SCHED_FLAG_RECLAIM 0x02 53 53 #define SCHED_FLAG_DL_OVERRUN 0x04 54 + #define SCHED_FLAG_KEEP_POLICY 0x08 55 + #define SCHED_FLAG_KEEP_PARAMS 0x10 56 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 57 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 58 + 59 + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ 60 + SCHED_FLAG_KEEP_PARAMS) 61 + 62 + #define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ 63 + SCHED_FLAG_UTIL_CLAMP_MAX) 54 64 55 65 #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ 56 66 SCHED_FLAG_RECLAIM | \ 57 - SCHED_FLAG_DL_OVERRUN) 67 + SCHED_FLAG_DL_OVERRUN | \ 68 + SCHED_FLAG_KEEP_ALL | \ 69 + SCHED_FLAG_UTIL_CLAMP) 58 70 59 71 #endif /* _UAPI_LINUX_SCHED_H */

+57 -9

include/uapi/linux/sched/types.h

··· 9 9 }; 10 10 11 11 #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 12 + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ 12 13 13 14 /* 14 15 * Extended scheduling parameters data structure. ··· 22 21 * the tasks may be useful for a wide variety of application fields, e.g., 23 22 * multimedia, streaming, automation and control, and many others. 24 23 * 25 - * This variant (sched_attr) is meant at describing a so-called 26 - * sporadic time-constrained task. In such model a task is specified by: 24 + * This variant (sched_attr) allows to define additional attributes to 25 + * improve the scheduler knowledge about task requirements. 26 + * 27 + * Scheduling Class Attributes 28 + * =========================== 29 + * 30 + * A subset of sched_attr attributes specifies the 31 + * scheduling policy and relative POSIX attributes: 32 + * 33 + * @size size of the structure, for fwd/bwd compat. 34 + * 35 + * @sched_policy task's scheduling policy 36 + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) 37 + * @sched_priority task's static priority (SCHED_FIFO/RR) 38 + * 39 + * Certain more advanced scheduling features can be controlled by a 40 + * predefined set of flags via the attribute: 41 + * 42 + * @sched_flags for customizing the scheduler behaviour 43 + * 44 + * Sporadic Time-Constrained Task Attributes 45 + * ========================================= 46 + * 47 + * A subset of sched_attr attributes allows to describe a so-called 48 + * sporadic time-constrained task. 49 + * 50 + * In such a model a task is specified by: 27 51 * - the activation period or minimum instance inter-arrival time; 28 52 * - the maximum (or average, depending on the actual scheduling 29 53 * discipline) computation time of all instances, a.k.a. runtime; ··· 60 34 * than the runtime and must be completed by time instant t equal to 61 35 * the instance activation time + the deadline. 
62 36 * 63 - * This is reflected by the actual fields of the sched_attr structure: 37 + * This is reflected by the following fields of the sched_attr structure: 64 38 * 65 - * @size size of the structure, for fwd/bwd compat. 66 - * 67 - * @sched_policy task's scheduling policy 68 - * @sched_flags for customizing the scheduler behaviour 69 - * @sched_nice task's nice value (SCHED_NORMAL/BATCH) 70 - * @sched_priority task's static priority (SCHED_FIFO/RR) 71 39 * @sched_deadline representative of the task's deadline 72 40 * @sched_runtime representative of the task's runtime 73 41 * @sched_period representative of the task's period ··· 73 53 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the 74 54 * only user of this new interface. More information about the algorithm 75 55 * available in the scheduling class file or in Documentation/. 56 + * 57 + * Task Utilization Attributes 58 + * =========================== 59 + * 60 + * A subset of sched_attr attributes allows to specify the utilization 61 + * expected for a task. These attributes allow to inform the scheduler about 62 + * the utilization boundaries within which it should schedule the task. These 63 + * boundaries are valuable hints to support scheduler decisions on both task 64 + * placement and frequency selection. 65 + * 66 + * @sched_util_min represents the minimum utilization 67 + * @sched_util_max represents the maximum utilization 68 + * 69 + * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It 70 + * represents the percentage of CPU time used by a task when running at the 71 + * maximum frequency on the highest capacity CPU of the system. For example, a 72 + * 20% utilization task is a task running for 2ms every 10ms at maximum 73 + * frequency. 74 + * 75 + * A task with a min utilization value bigger than 0 is more likely scheduled 76 + * on a CPU with a capacity big enough to fit the specified value. 
77 + * A task with a max utilization value smaller than 1024 is more likely 78 + * scheduled on a CPU with no more capacity than the specified value. 76 79 */ 77 80 struct sched_attr { 78 81 __u32 size; ··· 113 70 __u64 sched_runtime; 114 71 __u64 sched_deadline; 115 72 __u64 sched_period; 73 + 74 + /* Utilization hints */ 75 + __u32 sched_util_min; 76 + __u32 sched_util_max; 77 + 116 78 }; 117 79 118 80 #endif /* _UAPI_LINUX_SCHED_TYPES_H */

+53

init/Kconfig

··· 677 677 config GENERIC_SCHED_CLOCK 678 678 bool 679 679 680 + menu "Scheduler features" 681 + 682 + config UCLAMP_TASK 683 + bool "Enable utilization clamping for RT/FAIR tasks" 684 + depends on CPU_FREQ_GOV_SCHEDUTIL 685 + help 686 + This feature enables the scheduler to track the clamped utilization 687 + of each CPU based on RUNNABLE tasks scheduled on that CPU. 688 + 689 + With this option, the user can specify the min and max CPU 690 + utilization allowed for RUNNABLE tasks. The max utilization defines 691 + the maximum frequency a task should use while the min utilization 692 + defines the minimum frequency it should use. 693 + 694 + Both min and max utilization clamp values are hints to the scheduler, 695 + aiming at improving its frequency selection policy, but they do not 696 + enforce or grant any specific bandwidth for tasks. 697 + 698 + If in doubt, say N. 699 + 700 + config UCLAMP_BUCKETS_COUNT 701 + int "Number of supported utilization clamp buckets" 702 + range 5 20 703 + default 5 704 + depends on UCLAMP_TASK 705 + help 706 + Defines the number of clamp buckets to use. The range of each bucket 707 + will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the 708 + number of clamp buckets the finer their granularity and the higher 709 + the precision of clamping aggregation and tracking at run-time. 710 + 711 + For example, with the minimum configuration value we will have 5 712 + clamp buckets tracking 20% utilization each. A 25% boosted task will 713 + be refcounted in the [20..39]% bucket and will set the bucket clamp 714 + effective value to 25%. 715 + If a second 30% boosted task should be co-scheduled on the same CPU, 716 + that task will be refcounted in the same bucket of the first task and 717 + it will boost the bucket clamp effective value to 30%. 718 + The clamp effective value of a bucket is reset to its nominal value 719 + (20% in the example above) when there are no more tasks refcounted in 720 + that bucket. 
721 + 722 + An additional boost/capping margin can be added to some tasks. In the 723 + example above the 25% task will be boosted to 30% until it exits the 724 + CPU. If that should be considered not acceptable on certain systems, 725 + it's always possible to reduce the margin by increasing the number of 726 + clamp buckets to trade off used memory for run-time tracking 727 + precision. 728 + 729 + If in doubt, use the default value. 730 + 731 + endmenu 732 + 680 733 # 681 734 # For architectures that want to enable the support for NUMA-affine scheduler 682 735 # balancing logic:

+2 -1

init/init_task.c

··· 72 72 .static_prio = MAX_PRIO - 20, 73 73 .normal_prio = MAX_PRIO - 20, 74 74 .policy = SCHED_NORMAL, 75 - .cpus_allowed = CPU_MASK_ALL, 75 + .cpus_ptr = &init_task.cpus_mask, 76 + .cpus_mask = CPU_MASK_ALL, 76 77 .nr_cpus_allowed= NR_CPUS, 77 78 .mm = NULL, 78 79 .active_mm = &init_mm,

+1 -1

kernel/cgroup/cpuset.c

··· 2829 2829 if (task_css_is_root(task, cpuset_cgrp_id)) 2830 2830 return; 2831 2831 2832 - set_cpus_allowed_ptr(task, &current->cpus_allowed); 2832 + set_cpus_allowed_ptr(task, current->cpus_ptr); 2833 2833 task->mems_allowed = current->mems_allowed; 2834 2834 } 2835 2835

+2

kernel/fork.c

··· 898 898 #ifdef CONFIG_STACKPROTECTOR 899 899 tsk->stack_canary = get_random_canary(); 900 900 #endif 901 + if (orig->cpus_ptr == &orig->cpus_mask) 902 + tsk->cpus_ptr = &tsk->cpus_mask; 901 903 902 904 /* 903 905 * One for us, one for whoever does the "release_task()" (usually

+1 -1

kernel/power/energy_model.c

··· 223 223 * All CPUs of a domain must have the same micro-architecture 224 224 * since they all share the same table. 225 225 */ 226 - cap = arch_scale_cpu_capacity(NULL, cpu); 226 + cap = arch_scale_cpu_capacity(cpu); 227 227 if (prev_cap && prev_cap != cap) { 228 228 pr_err("CPUs of %*pbl must have the same capacity\n", 229 229 cpumask_pr_args(span));

-2

kernel/sched/autogroup.c

··· 259 259 } 260 260 #endif /* CONFIG_PROC_FS */ 261 261 262 - #ifdef CONFIG_SCHED_DEBUG 263 262 int autogroup_path(struct task_group *tg, char *buf, int buflen) 264 263 { 265 264 if (!task_group_is_autogroup(tg)) ··· 266 267 267 268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 268 269 } 269 - #endif

+499 -34

kernel/sched/core.c

··· 23 23 #define CREATE_TRACE_POINTS 24 24 #include <trace/events/sched.h> 25 25 26 + /* 27 + * Export tracepoints that act as a bare tracehook (ie: have no trace event 28 + * associated with them) to allow external modules to probe them. 29 + */ 30 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); 31 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); 32 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); 33 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); 34 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); 35 + EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); 36 + 26 37 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 27 38 28 39 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) ··· 772 761 } 773 762 } 774 763 764 + #ifdef CONFIG_UCLAMP_TASK 765 + /* Max allowed minimum utilization */ 766 + unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; 767 + 768 + /* Max allowed maximum utilization */ 769 + unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; 770 + 771 + /* All clamps are required to be less or equal than these values */ 772 + static struct uclamp_se uclamp_default[UCLAMP_CNT]; 773 + 774 + /* Integer rounded range for each bucket */ 775 + #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) 776 + 777 + #define for_each_clamp_id(clamp_id) \ 778 + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) 779 + 780 + static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) 781 + { 782 + return clamp_value / UCLAMP_BUCKET_DELTA; 783 + } 784 + 785 + static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) 786 + { 787 + return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); 788 + } 789 + 790 + static inline unsigned int uclamp_none(int clamp_id) 791 + { 792 + if (clamp_id == UCLAMP_MIN) 793 + return 0; 794 + return SCHED_CAPACITY_SCALE; 795 + } 796 + 797 + static inline void uclamp_se_set(struct uclamp_se *uc_se, 798 + unsigned int value, bool user_defined) 799 + { 800 + 
uc_se->value = value; 801 + uc_se->bucket_id = uclamp_bucket_id(value); 802 + uc_se->user_defined = user_defined; 803 + } 804 + 805 + static inline unsigned int 806 + uclamp_idle_value(struct rq *rq, unsigned int clamp_id, 807 + unsigned int clamp_value) 808 + { 809 + /* 810 + * Avoid blocked utilization pushing up the frequency when we go 811 + * idle (which drops the max-clamp) by retaining the last known 812 + * max-clamp. 813 + */ 814 + if (clamp_id == UCLAMP_MAX) { 815 + rq->uclamp_flags |= UCLAMP_FLAG_IDLE; 816 + return clamp_value; 817 + } 818 + 819 + return uclamp_none(UCLAMP_MIN); 820 + } 821 + 822 + static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, 823 + unsigned int clamp_value) 824 + { 825 + /* Reset max-clamp retention only on idle exit */ 826 + if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) 827 + return; 828 + 829 + WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); 830 + } 831 + 832 + static inline 833 + unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, 834 + unsigned int clamp_value) 835 + { 836 + struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; 837 + int bucket_id = UCLAMP_BUCKETS - 1; 838 + 839 + /* 840 + * Since both min and max clamps are max aggregated, find the 841 + * top most bucket with tasks in. 
842 + */ 843 + for ( ; bucket_id >= 0; bucket_id--) { 844 + if (!bucket[bucket_id].tasks) 845 + continue; 846 + return bucket[bucket_id].value; 847 + } 848 + 849 + /* No tasks -- default clamp values */ 850 + return uclamp_idle_value(rq, clamp_id, clamp_value); 851 + } 852 + 853 + /* 854 + * The effective clamp bucket index of a task depends on, by increasing 855 + * priority: 856 + * - the task specific clamp value, when explicitly requested from userspace 857 + * - the system default clamp value, defined by the sysadmin 858 + */ 859 + static inline struct uclamp_se 860 + uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) 861 + { 862 + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; 863 + struct uclamp_se uc_max = uclamp_default[clamp_id]; 864 + 865 + /* System default restrictions always apply */ 866 + if (unlikely(uc_req.value > uc_max.value)) 867 + return uc_max; 868 + 869 + return uc_req; 870 + } 871 + 872 + unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) 873 + { 874 + struct uclamp_se uc_eff; 875 + 876 + /* Task currently refcounted: use back-annotated (effective) value */ 877 + if (p->uclamp[clamp_id].active) 878 + return p->uclamp[clamp_id].value; 879 + 880 + uc_eff = uclamp_eff_get(p, clamp_id); 881 + 882 + return uc_eff.value; 883 + } 884 + 885 + /* 886 + * When a task is enqueued on a rq, the clamp bucket currently defined by the 887 + * task's uclamp::bucket_id is refcounted on that rq. This also immediately 888 + * updates the rq's clamp value if required. 889 + * 890 + * Tasks can have a task-specific value requested from user-space, track 891 + * within each bucket the maximum value for tasks refcounted in it. 892 + * This "local max aggregation" allows to track the exact "requested" value 893 + * for each bucket when all its RUNNABLE tasks require the same clamp. 
894 + */ 895 + static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, 896 + unsigned int clamp_id) 897 + { 898 + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; 899 + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; 900 + struct uclamp_bucket *bucket; 901 + 902 + lockdep_assert_held(&rq->lock); 903 + 904 + /* Update task effective clamp */ 905 + p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); 906 + 907 + bucket = &uc_rq->bucket[uc_se->bucket_id]; 908 + bucket->tasks++; 909 + uc_se->active = true; 910 + 911 + uclamp_idle_reset(rq, clamp_id, uc_se->value); 912 + 913 + /* 914 + * Local max aggregation: rq buckets always track the max 915 + * "requested" clamp value of its RUNNABLE tasks. 916 + */ 917 + if (bucket->tasks == 1 || uc_se->value > bucket->value) 918 + bucket->value = uc_se->value; 919 + 920 + if (uc_se->value > READ_ONCE(uc_rq->value)) 921 + WRITE_ONCE(uc_rq->value, uc_se->value); 922 + } 923 + 924 + /* 925 + * When a task is dequeued from a rq, the clamp bucket refcounted by the task 926 + * is released. If this is the last task reference counting the rq's max 927 + * active clamp value, then the rq's clamp value is updated. 928 + * 929 + * Both refcounted tasks and rq's cached clamp values are expected to be 930 + * always valid. If it's detected they are not, as defensive programming, 931 + * enforce the expected state and warn. 
932 + */ 933 + static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, 934 + unsigned int clamp_id) 935 + { 936 + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; 937 + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; 938 + struct uclamp_bucket *bucket; 939 + unsigned int bkt_clamp; 940 + unsigned int rq_clamp; 941 + 942 + lockdep_assert_held(&rq->lock); 943 + 944 + bucket = &uc_rq->bucket[uc_se->bucket_id]; 945 + SCHED_WARN_ON(!bucket->tasks); 946 + if (likely(bucket->tasks)) 947 + bucket->tasks--; 948 + uc_se->active = false; 949 + 950 + /* 951 + * Keep "local max aggregation" simple and accept to (possibly) 952 + * overboost some RUNNABLE tasks in the same bucket. 953 + * The rq clamp bucket value is reset to its base value whenever 954 + * there are no more RUNNABLE tasks refcounting it. 955 + */ 956 + if (likely(bucket->tasks)) 957 + return; 958 + 959 + rq_clamp = READ_ONCE(uc_rq->value); 960 + /* 961 + * Defensive programming: this should never happen. If it happens, 962 + * e.g. due to future modification, warn and fixup the expected value. 
963 + */ 964 + SCHED_WARN_ON(bucket->value > rq_clamp); 965 + if (bucket->value >= rq_clamp) { 966 + bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); 967 + WRITE_ONCE(uc_rq->value, bkt_clamp); 968 + } 969 + } 970 + 971 + static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) 972 + { 973 + unsigned int clamp_id; 974 + 975 + if (unlikely(!p->sched_class->uclamp_enabled)) 976 + return; 977 + 978 + for_each_clamp_id(clamp_id) 979 + uclamp_rq_inc_id(rq, p, clamp_id); 980 + 981 + /* Reset clamp idle holding when there is one RUNNABLE task */ 982 + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) 983 + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; 984 + } 985 + 986 + static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) 987 + { 988 + unsigned int clamp_id; 989 + 990 + if (unlikely(!p->sched_class->uclamp_enabled)) 991 + return; 992 + 993 + for_each_clamp_id(clamp_id) 994 + uclamp_rq_dec_id(rq, p, clamp_id); 995 + } 996 + 997 + int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, 998 + void __user *buffer, size_t *lenp, 999 + loff_t *ppos) 1000 + { 1001 + int old_min, old_max; 1002 + static DEFINE_MUTEX(mutex); 1003 + int result; 1004 + 1005 + mutex_lock(&mutex); 1006 + old_min = sysctl_sched_uclamp_util_min; 1007 + old_max = sysctl_sched_uclamp_util_max; 1008 + 1009 + result = proc_dointvec(table, write, buffer, lenp, ppos); 1010 + if (result) 1011 + goto undo; 1012 + if (!write) 1013 + goto done; 1014 + 1015 + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || 1016 + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { 1017 + result = -EINVAL; 1018 + goto undo; 1019 + } 1020 + 1021 + if (old_min != sysctl_sched_uclamp_util_min) { 1022 + uclamp_se_set(&uclamp_default[UCLAMP_MIN], 1023 + sysctl_sched_uclamp_util_min, false); 1024 + } 1025 + if (old_max != sysctl_sched_uclamp_util_max) { 1026 + uclamp_se_set(&uclamp_default[UCLAMP_MAX], 1027 + sysctl_sched_uclamp_util_max, false); 1028 + } 1029 + 1030 + /* 1031 
+ * Updating all the RUNNABLE task is expensive, keep it simple and do 1032 + * just a lazy update at each next enqueue time. 1033 + */ 1034 + goto done; 1035 + 1036 + undo: 1037 + sysctl_sched_uclamp_util_min = old_min; 1038 + sysctl_sched_uclamp_util_max = old_max; 1039 + done: 1040 + mutex_unlock(&mutex); 1041 + 1042 + return result; 1043 + } 1044 + 1045 + static int uclamp_validate(struct task_struct *p, 1046 + const struct sched_attr *attr) 1047 + { 1048 + unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; 1049 + unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; 1050 + 1051 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) 1052 + lower_bound = attr->sched_util_min; 1053 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) 1054 + upper_bound = attr->sched_util_max; 1055 + 1056 + if (lower_bound > upper_bound) 1057 + return -EINVAL; 1058 + if (upper_bound > SCHED_CAPACITY_SCALE) 1059 + return -EINVAL; 1060 + 1061 + return 0; 1062 + } 1063 + 1064 + static void __setscheduler_uclamp(struct task_struct *p, 1065 + const struct sched_attr *attr) 1066 + { 1067 + unsigned int clamp_id; 1068 + 1069 + /* 1070 + * On scheduling class change, reset to default clamps for tasks 1071 + * without a task-specific value. 
1072 + */ 1073 + for_each_clamp_id(clamp_id) { 1074 + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; 1075 + unsigned int clamp_value = uclamp_none(clamp_id); 1076 + 1077 + /* Keep using defined clamps across class changes */ 1078 + if (uc_se->user_defined) 1079 + continue; 1080 + 1081 + /* By default, RT tasks always get 100% boost */ 1082 + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) 1083 + clamp_value = uclamp_none(UCLAMP_MAX); 1084 + 1085 + uclamp_se_set(uc_se, clamp_value, false); 1086 + } 1087 + 1088 + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) 1089 + return; 1090 + 1091 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { 1092 + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], 1093 + attr->sched_util_min, true); 1094 + } 1095 + 1096 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { 1097 + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], 1098 + attr->sched_util_max, true); 1099 + } 1100 + } 1101 + 1102 + static void uclamp_fork(struct task_struct *p) 1103 + { 1104 + unsigned int clamp_id; 1105 + 1106 + for_each_clamp_id(clamp_id) 1107 + p->uclamp[clamp_id].active = false; 1108 + 1109 + if (likely(!p->sched_reset_on_fork)) 1110 + return; 1111 + 1112 + for_each_clamp_id(clamp_id) { 1113 + unsigned int clamp_value = uclamp_none(clamp_id); 1114 + 1115 + /* By default, RT tasks always get 100% boost */ 1116 + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) 1117 + clamp_value = uclamp_none(UCLAMP_MAX); 1118 + 1119 + uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); 1120 + } 1121 + } 1122 + 1123 + static void __init init_uclamp(void) 1124 + { 1125 + struct uclamp_se uc_max = {}; 1126 + unsigned int clamp_id; 1127 + int cpu; 1128 + 1129 + for_each_possible_cpu(cpu) { 1130 + memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); 1131 + cpu_rq(cpu)->uclamp_flags = 0; 1132 + } 1133 + 1134 + for_each_clamp_id(clamp_id) { 1135 + uclamp_se_set(&init_task.uclamp_req[clamp_id], 1136 + uclamp_none(clamp_id), false); 1137 + } 1138 + 1139 
+ /* System defaults allow max clamp values for both indexes */ 1140 + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); 1141 + for_each_clamp_id(clamp_id) 1142 + uclamp_default[clamp_id] = uc_max; 1143 + } 1144 + 1145 + #else /* CONFIG_UCLAMP_TASK */ 1146 + static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } 1147 + static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } 1148 + static inline int uclamp_validate(struct task_struct *p, 1149 + const struct sched_attr *attr) 1150 + { 1151 + return -EOPNOTSUPP; 1152 + } 1153 + static void __setscheduler_uclamp(struct task_struct *p, 1154 + const struct sched_attr *attr) { } 1155 + static inline void uclamp_fork(struct task_struct *p) { } 1156 + static inline void init_uclamp(void) { } 1157 + #endif /* CONFIG_UCLAMP_TASK */ 1158 + 775 1159 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 776 1160 { 777 1161 if (!(flags & ENQUEUE_NOCLOCK)) ··· 1177 771 psi_enqueue(p, flags & ENQUEUE_WAKEUP); 1178 772 } 1179 773 774 + uclamp_rq_inc(rq, p); 1180 775 p->sched_class->enqueue_task(rq, p, flags); 1181 776 } 1182 777 ··· 1191 784 psi_dequeue(p, flags & DEQUEUE_SLEEP); 1192 785 } 1193 786 787 + uclamp_rq_dec(rq, p); 1194 788 p->sched_class->dequeue_task(rq, p, flags); 1195 789 } 1196 790 ··· 1338 930 */ 1339 931 static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1340 932 { 1341 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 933 + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1342 934 return false; 1343 935 1344 936 if (is_per_cpu_kthread(p)) ··· 1433 1025 local_irq_disable(); 1434 1026 /* 1435 1027 * We need to explicitly wake pending tasks before running 1436 - * __migrate_task() such that we will not miss enforcing cpus_allowed 1028 + * __migrate_task() such that we will not miss enforcing cpus_ptr 1437 1029 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 
1438 1030 */ 1439 1031 sched_ttwu_pending(); ··· 1464 1056 */ 1465 1057 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1466 1058 { 1467 - cpumask_copy(&p->cpus_allowed, new_mask); 1059 + cpumask_copy(&p->cpus_mask, new_mask); 1468 1060 p->nr_cpus_allowed = cpumask_weight(new_mask); 1469 1061 } 1470 1062 ··· 1534 1126 goto out; 1535 1127 } 1536 1128 1537 - if (cpumask_equal(&p->cpus_allowed, new_mask)) 1129 + if (cpumask_equal(p->cpus_ptr, new_mask)) 1538 1130 goto out; 1539 1131 1540 1132 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ··· 1694 1286 if (task_cpu(arg->src_task) != arg->src_cpu) 1695 1287 goto unlock; 1696 1288 1697 - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1289 + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) 1698 1290 goto unlock; 1699 1291 1700 - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1292 + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) 1701 1293 goto unlock; 1702 1294 1703 1295 __migrate_swap_task(arg->src_task, arg->dst_cpu); ··· 1739 1331 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1740 1332 goto out; 1741 1333 1742 - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1334 + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) 1743 1335 goto out; 1744 1336 1745 - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1337 + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) 1746 1338 goto out; 1747 1339 1748 1340 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ··· 1887 1479 EXPORT_SYMBOL_GPL(kick_process); 1888 1480 1889 1481 /* 1890 - * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1482 + * ->cpus_ptr is protected by both rq->lock and p->pi_lock 1891 1483 * 1892 1484 * A few notes on cpu_active vs cpu_online: 1893 1485 * ··· 1927 1519 for_each_cpu(dest_cpu, nodemask) { 1928 1520 if (!cpu_active(dest_cpu)) 1929 1521 continue; 1930 - if 
(cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1522 + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) 1931 1523 return dest_cpu; 1932 1524 } 1933 1525 } 1934 1526 1935 1527 for (;;) { 1936 1528 /* Any allowed, online CPU? */ 1937 - for_each_cpu(dest_cpu, &p->cpus_allowed) { 1529 + for_each_cpu(dest_cpu, p->cpus_ptr) { 1938 1530 if (!is_cpu_allowed(p, dest_cpu)) 1939 1531 continue; 1940 1532 ··· 1978 1570 } 1979 1571 1980 1572 /* 1981 - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1573 + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 1982 1574 */ 1983 1575 static inline 1984 1576 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) ··· 1988 1580 if (p->nr_cpus_allowed > 1) 1989 1581 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1990 1582 else 1991 - cpu = cpumask_any(&p->cpus_allowed); 1583 + cpu = cpumask_any(p->cpus_ptr); 1992 1584 1993 1585 /* 1994 1586 * In order not to call set_task_cpu() on a blocking task we need 1995 - * to rely on ttwu() to place the task on a valid ->cpus_allowed 1587 + * to rely on ttwu() to place the task on a valid ->cpus_ptr 1996 1588 * CPU. 1997 1589 * 1998 1590 * Since this is common to all placement strategies, this lives here. ··· 2399 1991 unsigned long flags; 2400 1992 int cpu, success = 0; 2401 1993 1994 + if (p == current) { 1995 + /* 1996 + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) 1997 + * == smp_processor_id()'. Together this means we can special 1998 + * case the whole 'p->on_rq && ttwu_remote()' case below 1999 + * without taking any locks. 2000 + * 2001 + * In particular: 2002 + * - we rely on Program-Order guarantees for all the ordering, 2003 + * - we're serialized against set_special_state() by virtue of 2004 + * it disabling IRQs (this allows not taking ->pi_lock). 
2005 + */ 2006 + if (!(p->state & state)) 2007 + return false; 2008 + 2009 + success = 1; 2010 + cpu = task_cpu(p); 2011 + trace_sched_waking(p); 2012 + p->state = TASK_RUNNING; 2013 + trace_sched_wakeup(p); 2014 + goto out; 2015 + } 2016 + 2402 2017 /* 2403 2018 * If we are going to wake up a thread waiting for CONDITION we 2404 2019 * need to ensure that CONDITION=1 done by the caller can not be ··· 2431 2000 raw_spin_lock_irqsave(&p->pi_lock, flags); 2432 2001 smp_mb__after_spinlock(); 2433 2002 if (!(p->state & state)) 2434 - goto out; 2003 + goto unlock; 2435 2004 2436 2005 trace_sched_waking(p); 2437 2006 ··· 2461 2030 */ 2462 2031 smp_rmb(); 2463 2032 if (p->on_rq && ttwu_remote(p, wake_flags)) 2464 - goto stat; 2033 + goto unlock; 2465 2034 2466 2035 #ifdef CONFIG_SMP 2467 2036 /* ··· 2521 2090 #endif /* CONFIG_SMP */ 2522 2091 2523 2092 ttwu_queue(p, cpu, wake_flags); 2524 - stat: 2525 - ttwu_stat(p, cpu, wake_flags); 2526 - out: 2093 + unlock: 2527 2094 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2095 + out: 2096 + if (success) 2097 + ttwu_stat(p, cpu, wake_flags); 2528 2098 2529 2099 return success; 2530 2100 } ··· 2732 2300 */ 2733 2301 p->prio = current->normal_prio; 2734 2302 2303 + uclamp_fork(p); 2304 + 2735 2305 /* 2736 2306 * Revert to default priority/policy on fork if requested. 
2737 2307 */ ··· 2829 2395 #ifdef CONFIG_SMP 2830 2396 /* 2831 2397 * Fork balancing, do it here and not earlier because: 2832 - * - cpus_allowed can change in the fork path 2398 + * - cpus_ptr can change in the fork path 2833 2399 * - any previously selected CPU might disappear through hotplug 2834 2400 * 2835 2401 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ··· 3467 3033 3468 3034 update_rq_clock(rq); 3469 3035 curr->sched_class->task_tick(rq, curr, 0); 3470 - cpu_load_update_active(rq); 3471 3036 calc_global_load_tick(rq); 3472 3037 psi_task_tick(rq); 3473 3038 ··· 4504 4071 static void __setscheduler(struct rq *rq, struct task_struct *p, 4505 4072 const struct sched_attr *attr, bool keep_boost) 4506 4073 { 4074 + /* 4075 + * If params can't change scheduling class changes aren't allowed 4076 + * either. 4077 + */ 4078 + if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) 4079 + return; 4080 + 4507 4081 __setscheduler_params(p, attr); 4508 4082 4509 4083 /* ··· 4648 4208 return retval; 4649 4209 } 4650 4210 4211 + /* Update task specific "requested" clamps */ 4212 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { 4213 + retval = uclamp_validate(p, attr); 4214 + if (retval) 4215 + return retval; 4216 + } 4217 + 4651 4218 /* 4652 4219 * Make sure no PI-waiters arrive (or leave) while we are 4653 4220 * changing the priority of the task: ··· 4683 4236 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 4684 4237 goto change; 4685 4238 if (dl_policy(policy) && dl_param_changed(p, attr)) 4239 + goto change; 4240 + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) 4686 4241 goto change; 4687 4242 4688 4243 p->sched_reset_on_fork = reset_on_fork; ··· 4716 4267 * the entire root_domain to become SCHED_DEADLINE. We 4717 4268 * will also fail if there's no bandwidth available. 
4718 4269 */ 4719 - if (!cpumask_subset(span, &p->cpus_allowed) || 4270 + if (!cpumask_subset(span, p->cpus_ptr) || 4720 4271 rq->rd->dl_bw.bw == 0) { 4721 4272 task_rq_unlock(rq, p, &rf); 4722 4273 return -EPERM; ··· 4766 4317 put_prev_task(rq, p); 4767 4318 4768 4319 prev_class = p->sched_class; 4320 + 4769 4321 __setscheduler(rq, p, attr, pi); 4322 + __setscheduler_uclamp(p, attr); 4770 4323 4771 4324 if (queued) { 4772 4325 /* ··· 4944 4493 if (ret) 4945 4494 return -EFAULT; 4946 4495 4496 + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && 4497 + size < SCHED_ATTR_SIZE_VER1) 4498 + return -EINVAL; 4499 + 4947 4500 /* 4948 4501 * XXX: Do we want to be lenient like existing syscalls; or do we want 4949 4502 * to be strict and return an error on out-of-bounds values? ··· 5011 4556 5012 4557 if ((int)attr.sched_policy < 0) 5013 4558 return -EINVAL; 4559 + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) 4560 + attr.sched_policy = SETPARAM_POLICY; 5014 4561 5015 4562 rcu_read_lock(); 5016 4563 retval = -ESRCH; 5017 4564 p = find_process_by_pid(pid); 5018 - if (p != NULL) 5019 - retval = sched_setattr(p, &attr); 4565 + if (likely(p)) 4566 + get_task_struct(p); 5020 4567 rcu_read_unlock(); 4568 + 4569 + if (likely(p)) { 4570 + retval = sched_setattr(p, &attr); 4571 + put_task_struct(p); 4572 + } 5021 4573 5022 4574 return retval; 5023 4575 } ··· 5175 4713 attr.sched_priority = p->rt_priority; 5176 4714 else 5177 4715 attr.sched_nice = task_nice(p); 4716 + 4717 + #ifdef CONFIG_UCLAMP_TASK 4718 + attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; 4719 + attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; 4720 + #endif 5178 4721 5179 4722 rcu_read_unlock(); 5180 4723 ··· 5333 4866 goto out_unlock; 5334 4867 5335 4868 raw_spin_lock_irqsave(&p->pi_lock, flags); 5336 - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4869 + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); 5337 4870 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5338 4871 5339 4872 out_unlock: 
··· 5590 5123 } 5591 5124 EXPORT_SYMBOL(io_schedule_timeout); 5592 5125 5593 - void io_schedule(void) 5126 + void __sched io_schedule(void) 5594 5127 { 5595 5128 int token; 5596 5129 ··· 5910 5443 * allowed nodes is unnecessary. Thus, cpusets are not 5911 5444 * applicable for such threads. This prevents checking for 5912 5445 * success of set_cpus_allowed_ptr() on all attached tasks 5913 - * before cpus_allowed may be changed. 5446 + * before cpus_mask may be changed. 5914 5447 */ 5915 5448 if (p->flags & PF_NO_SETAFFINITY) { 5916 5449 ret = -EINVAL; ··· 5937 5470 if (curr_cpu == target_cpu) 5938 5471 return 0; 5939 5472 5940 - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5473 + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) 5941 5474 return -EINVAL; 5942 5475 5943 5476 /* TODO: This is not properly updating schedstats */ ··· 6075 5608 put_prev_task(rq, next); 6076 5609 6077 5610 /* 6078 - * Rules for changing task_struct::cpus_allowed are holding 5611 + * Rules for changing task_struct::cpus_mask are holding 6079 5612 * both pi_lock and rq->lock, such that holding either 6080 5613 * stabilizes the mask. 6081 5614 * ··· 6369 5902 6370 5903 void __init sched_init(void) 6371 5904 { 6372 - int i, j; 6373 5905 unsigned long alloc_size = 0, ptr; 5906 + int i; 6374 5907 6375 5908 wait_bit_init(); 6376 5909 ··· 6472 6005 #ifdef CONFIG_RT_GROUP_SCHED 6473 6006 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6474 6007 #endif 6475 - 6476 - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6477 - rq->cpu_load[j] = 0; 6478 - 6479 6008 #ifdef CONFIG_SMP 6480 6009 rq->sd = NULL; 6481 6010 rq->rd = NULL; ··· 6525 6062 init_schedstats(); 6526 6063 6527 6064 psi_init(); 6065 + 6066 + init_uclamp(); 6528 6067 6529 6068 scheduler_running = 1; 6530 6069 }

+2 -2

kernel/sched/cpudeadline.c

··· 120 120 const struct sched_dl_entity *dl_se = &p->dl; 121 121 122 122 if (later_mask && 123 - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { 123 + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { 124 124 return 1; 125 125 } else { 126 126 int best_cpu = cpudl_maximum(cp); 127 127 128 128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 129 129 130 - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 130 + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && 131 131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 132 132 if (later_mask) 133 133 cpumask_set_cpu(best_cpu, later_mask);

+17 -7

kernel/sched/cpufreq_schedutil.c

··· 196 196 * based on the task model parameters and gives the minimal utilization 197 197 * required to meet deadlines. 198 198 */ 199 - unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 200 - unsigned long max, enum schedutil_type type) 199 + unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, 200 + unsigned long max, enum schedutil_type type, 201 + struct task_struct *p) 201 202 { 202 203 unsigned long dl_util, util, irq; 203 204 struct rq *rq = cpu_rq(cpu); 204 205 205 - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) 206 + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && 207 + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { 206 208 return max; 209 + } 207 210 208 211 /* 209 212 * Early check to see if IRQ/steal time saturates the CPU, can be ··· 222 219 * CFS tasks and we use the same metric to track the effective 223 220 * utilization (PELT windows are synchronized) we can directly add them 224 221 * to obtain the CPU's actual utilization. 222 + * 223 + * CFS and RT utilization can be boosted or capped, depending on 224 + * utilization clamp constraints requested by currently RUNNABLE 225 + * tasks. 226 + * When there are no CFS RUNNABLE tasks, clamps are released and 227 + * frequency will be gracefully reduced with the utilization decay. 
225 228 */ 226 - util = util_cfs; 227 - util += cpu_util_rt(rq); 229 + util = util_cfs + cpu_util_rt(rq); 230 + if (type == FREQUENCY_UTIL) 231 + util = uclamp_util_with(rq, util, p); 228 232 229 233 dl_util = cpu_util_dl(rq); 230 234 ··· 286 276 { 287 277 struct rq *rq = cpu_rq(sg_cpu->cpu); 288 278 unsigned long util = cpu_util_cfs(rq); 289 - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 279 + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); 290 280 291 281 sg_cpu->max = max; 292 282 sg_cpu->bw_dl = cpu_bw_dl(rq); 293 283 294 - return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); 284 + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); 295 285 } 296 286 297 287 /**

+2 -2

kernel/sched/cpupri.c

··· 94 94 if (skip) 95 95 continue; 96 96 97 - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 97 + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) 98 98 continue; 99 99 100 100 if (lowest_mask) { 101 - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 101 + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); 102 102 103 103 /* 104 104 * We have to ensure that we have at least one bit

+4 -4

kernel/sched/deadline.c

··· 538 538 * If we cannot preempt any rq, fall back to pick any 539 539 * online CPU: 540 540 */ 541 - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); 542 542 if (cpu >= nr_cpu_ids) { 543 543 /* 544 544 * Failed to find any suitable CPU. ··· 1195 1195 &curr->dl); 1196 1196 } else { 1197 1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu); 1198 - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 1198 + unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); 1199 1199 1200 1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq); 1201 1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); ··· 1824 1824 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1825 1825 { 1826 1826 if (!task_running(rq, p) && 1827 - cpumask_test_cpu(cpu, &p->cpus_allowed)) 1827 + cpumask_test_cpu(cpu, p->cpus_ptr)) 1828 1828 return 1; 1829 1829 return 0; 1830 1830 } ··· 1974 1974 /* Retry if something changed. */ 1975 1975 if (double_lock_balance(rq, later_rq)) { 1976 1976 if (unlikely(task_rq(task) != rq || 1977 - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || 1977 + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || 1978 1978 task_running(rq, task) || 1979 1979 !dl_task(task) || 1980 1980 !task_on_rq_queued(task))) {

+11 -32

kernel/sched/debug.c

··· 233 233 *tablep = NULL; 234 234 } 235 235 236 - static int min_load_idx = 0; 237 - static int max_load_idx = CPU_LOAD_IDX_MAX-1; 238 - 239 236 static void 240 237 set_table_entry(struct ctl_table *entry, 241 238 const char *procname, void *data, int maxlen, 242 - umode_t mode, proc_handler *proc_handler, 243 - bool load_idx) 239 + umode_t mode, proc_handler *proc_handler) 244 240 { 245 241 entry->procname = procname; 246 242 entry->data = data; 247 243 entry->maxlen = maxlen; 248 244 entry->mode = mode; 249 245 entry->proc_handler = proc_handler; 250 - 251 - if (load_idx) { 252 - entry->extra1 = &min_load_idx; 253 - entry->extra2 = &max_load_idx; 254 - } 255 246 } 256 247 257 248 static struct ctl_table * 258 249 sd_alloc_ctl_domain_table(struct sched_domain *sd) 259 250 { 260 - struct ctl_table *table = sd_alloc_ctl_entry(14); 251 + struct ctl_table *table = sd_alloc_ctl_entry(9); 261 252 262 253 if (table == NULL) 263 254 return NULL; 264 255 265 - set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 266 - set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 267 - set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 268 - set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 269 - set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 270 - set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 271 - set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 272 - set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); 273 - set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, 
proc_dointvec_minmax, false); 274 - set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); 275 - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); 276 - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); 277 - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); 278 - /* &table[13] is terminator */ 256 + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); 257 + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); 258 + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); 259 + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); 260 + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); 261 + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); 262 + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); 263 + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); 264 + /* &table[8] is terminator */ 279 265 280 266 return table; 281 267 } ··· 639 653 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 640 654 641 655 P(nr_running); 642 - SEQ_printf(m, " .%-30s: %lu\n", "load", 643 - rq->load.weight); 644 656 P(nr_switches); 645 657 P(nr_load_updates); 646 658 P(nr_uninterruptible); ··· 646 662 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 647 663 PN(clock); 648 664 PN(clock_task); 649 - P(cpu_load[0]); 650 - P(cpu_load[1]); 651 - P(cpu_load[2]); 652 - P(cpu_load[3]); 653 - P(cpu_load[4]); 654 665 
#undef P 655 666 #undef PN 656 667

+219 -404

kernel/sched/fair.c

··· 275 275 return grp->my_q; 276 276 } 277 277 278 + static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) 279 + { 280 + if (!path) 281 + return; 282 + 283 + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) 284 + autogroup_path(cfs_rq->tg, path, len); 285 + else if (cfs_rq && cfs_rq->tg->css.cgroup) 286 + cgroup_path(cfs_rq->tg->css.cgroup, path, len); 287 + else 288 + strlcpy(path, "(null)", len); 289 + } 290 + 278 291 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 279 292 { 280 293 struct rq *rq = rq_of(cfs_rq); ··· 460 447 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 461 448 { 462 449 return NULL; 450 + } 451 + 452 + static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) 453 + { 454 + if (path) 455 + strlcpy(path, "(null)", len); 463 456 } 464 457 465 458 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) ··· 783 764 struct sched_entity *se = &p->se; 784 765 struct cfs_rq *cfs_rq = cfs_rq_of(se); 785 766 struct sched_avg *sa = &se->avg; 786 - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 767 + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); 787 768 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 788 769 789 770 if (cap > 0) { ··· 1485 1466 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1486 1467 } 1487 1468 1488 - static unsigned long weighted_cpuload(struct rq *rq); 1489 - static unsigned long source_load(int cpu, int type); 1490 - static unsigned long target_load(int cpu, int type); 1469 + static unsigned long cpu_runnable_load(struct rq *rq); 1491 1470 1492 1471 /* Cached statistics for all CPUs within a node */ 1493 1472 struct numa_stats { ··· 1506 1489 for_each_cpu(cpu, cpumask_of_node(nid)) { 1507 1490 struct rq *rq = cpu_rq(cpu); 1508 1491 1509 - ns->load += weighted_cpuload(rq); 1492 + ns->load += cpu_runnable_load(rq); 1510 1493 ns->compute_capacity += capacity_of(cpu); 1511 1494 } 1512 
1495 ··· 1638 1621 * be incurred if the tasks were swapped. 1639 1622 */ 1640 1623 /* Skip this swap candidate if cannot move to the source cpu */ 1641 - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1624 + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) 1642 1625 goto unlock; 1643 1626 1644 1627 /* ··· 1735 1718 1736 1719 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1737 1720 /* Skip this CPU if the source task cannot migrate */ 1738 - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1721 + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) 1739 1722 continue; 1740 1723 1741 1724 env->dst_cpu = cpu; ··· 2703 2686 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2704 2687 { 2705 2688 update_load_add(&cfs_rq->load, se->load.weight); 2706 - if (!parent_entity(se)) 2707 - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 2708 2689 #ifdef CONFIG_SMP 2709 2690 if (entity_is_task(se)) { 2710 2691 struct rq *rq = rq_of(cfs_rq); ··· 2718 2703 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2719 2704 { 2720 2705 update_load_sub(&cfs_rq->load, se->load.weight); 2721 - if (!parent_entity(se)) 2722 - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 2723 2706 #ifdef CONFIG_SMP 2724 2707 if (entity_is_task(se)) { 2725 2708 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ··· 3347 3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3348 3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3349 3336 3337 + trace_pelt_cfs_tp(cfs_rq); 3338 + trace_pelt_se_tp(se); 3339 + 3350 3340 return 1; 3351 3341 } 3352 3342 ··· 3502 3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3503 3487 3504 3488 cfs_rq_util_change(cfs_rq, flags); 3489 + 3490 + trace_pelt_cfs_tp(cfs_rq); 3505 3491 } 3506 3492 3507 3493 /** ··· 3523 3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3524 3506 3525 3507 cfs_rq_util_change(cfs_rq, 0); 3508 + 3509 + trace_pelt_cfs_tp(cfs_rq); 3526 3510 } 3527 3511 3528 3512 /* ··· 4120 4100 * least twice 
that of our own weight (i.e. dont track it 4121 4101 * when there are only lesser-weight tasks around): 4122 4102 */ 4123 - if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4103 + if (schedstat_enabled() && 4104 + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { 4124 4105 schedstat_set(se->statistics.slice_max, 4125 4106 max((u64)schedstat_val(se->statistics.slice_max), 4126 4107 se->sum_exec_runtime - se->prev_sum_exec_runtime)); ··· 4755 4734 if (runtime_refresh_within(cfs_b, min_left)) 4756 4735 return; 4757 4736 4737 + /* don't push forwards an existing deferred unthrottle */ 4738 + if (cfs_b->slack_started) 4739 + return; 4740 + cfs_b->slack_started = true; 4741 + 4758 4742 hrtimer_start(&cfs_b->slack_timer, 4759 4743 ns_to_ktime(cfs_bandwidth_slack_period), 4760 4744 HRTIMER_MODE_REL); ··· 4813 4787 4814 4788 /* confirm we're still not at a refresh boundary */ 4815 4789 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4790 + cfs_b->slack_started = false; 4816 4791 if (cfs_b->distribute_running) { 4817 4792 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4818 4793 return; ··· 4977 4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4978 4951 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4979 4952 cfs_b->distribute_running = 0; 4953 + cfs_b->slack_started = false; 4980 4954 } 4981 4955 4982 4956 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) ··· 5181 5153 5182 5154 static inline void update_overutilized_status(struct rq *rq) 5183 5155 { 5184 - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 5156 + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { 5185 5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 5158 + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); 5159 + } 5186 5160 } 5187 5161 #else 5188 5162 static inline void update_overutilized_status(struct rq *rq) { } ··· 5355 5325 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5356 5326 5357 
5327 #ifdef CONFIG_NO_HZ_COMMON 5358 - /* 5359 - * per rq 'load' arrray crap; XXX kill this. 5360 - */ 5361 - 5362 - /* 5363 - * The exact cpuload calculated at every tick would be: 5364 - * 5365 - * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5366 - * 5367 - * If a CPU misses updates for n ticks (as it was idle) and update gets 5368 - * called on the n+1-th tick when CPU may be busy, then we have: 5369 - * 5370 - * load_n = (1 - 1/2^i)^n * load_0 5371 - * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5372 - * 5373 - * decay_load_missed() below does efficient calculation of 5374 - * 5375 - * load' = (1 - 1/2^i)^n * load 5376 - * 5377 - * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. 5378 - * This allows us to precompute the above in said factors, thereby allowing the 5379 - * reduction of an arbitrary n in O(log_2 n) steps. (See also 5380 - * fixed_power_int()) 5381 - * 5382 - * The calculation is approximated on a 128 point scale. 5383 - */ 5384 - #define DEGRADE_SHIFT 7 5385 - 5386 - static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 5387 - static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 5388 - { 0, 0, 0, 0, 0, 0, 0, 0 }, 5389 - { 64, 32, 8, 0, 0, 0, 0, 0 }, 5390 - { 96, 72, 40, 12, 1, 0, 0, 0 }, 5391 - { 112, 98, 75, 43, 15, 1, 0, 0 }, 5392 - { 120, 112, 98, 76, 45, 16, 2, 0 } 5393 - }; 5394 - 5395 - /* 5396 - * Update cpu_load for any missed ticks, due to tickless idle. The backlog 5397 - * would be when CPU is idle and so we just decay the old load without 5398 - * adding any new load. 
5399 - */ 5400 - static unsigned long 5401 - decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 5402 - { 5403 - int j = 0; 5404 - 5405 - if (!missed_updates) 5406 - return load; 5407 - 5408 - if (missed_updates >= degrade_zero_ticks[idx]) 5409 - return 0; 5410 - 5411 - if (idx == 1) 5412 - return load >> missed_updates; 5413 - 5414 - while (missed_updates) { 5415 - if (missed_updates % 2) 5416 - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 5417 - 5418 - missed_updates >>= 1; 5419 - j++; 5420 - } 5421 - return load; 5422 - } 5423 5328 5424 5329 static struct { 5425 5330 cpumask_var_t idle_cpus_mask; ··· 5366 5401 5367 5402 #endif /* CONFIG_NO_HZ_COMMON */ 5368 5403 5369 - /** 5370 - * __cpu_load_update - update the rq->cpu_load[] statistics 5371 - * @this_rq: The rq to update statistics for 5372 - * @this_load: The current load 5373 - * @pending_updates: The number of missed updates 5374 - * 5375 - * Update rq->cpu_load[] statistics. This function is usually called every 5376 - * scheduler tick (TICK_NSEC). 5377 - * 5378 - * This function computes a decaying average: 5379 - * 5380 - * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load 5381 - * 5382 - * Because of NOHZ it might not get called on every tick which gives need for 5383 - * the @pending_updates argument. 5384 - * 5385 - * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 5386 - * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load 5387 - * = A * (A * load[i]_n-2 + B) + B 5388 - * = A * (A * (A * load[i]_n-3 + B) + B) + B 5389 - * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B 5390 - * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B 5391 - * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B 5392 - * = (1 - 1/2^i)^n * (load[i]_0 - load) + load 5393 - * 5394 - * In the above we've assumed load_n := load, which is true for NOHZ_FULL as 5395 - * any change in load would have resulted in the tick being turned back on. 
5396 - * 5397 - * For regular NOHZ, this reduces to: 5398 - * 5399 - * load[i]_n = (1 - 1/2^i)^n * load[i]_0 5400 - * 5401 - * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra 5402 - * term. 5403 - */ 5404 - static void cpu_load_update(struct rq *this_rq, unsigned long this_load, 5405 - unsigned long pending_updates) 5406 - { 5407 - unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; 5408 - int i, scale; 5409 - 5410 - this_rq->nr_load_updates++; 5411 - 5412 - /* Update our load: */ 5413 - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 5414 - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 5415 - unsigned long old_load, new_load; 5416 - 5417 - /* scale is effectively 1 << i now, and >> i divides by scale */ 5418 - 5419 - old_load = this_rq->cpu_load[i]; 5420 - #ifdef CONFIG_NO_HZ_COMMON 5421 - old_load = decay_load_missed(old_load, pending_updates - 1, i); 5422 - if (tickless_load) { 5423 - old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); 5424 - /* 5425 - * old_load can never be a negative value because a 5426 - * decayed tickless_load cannot be greater than the 5427 - * original tickless_load. 5428 - */ 5429 - old_load += tickless_load; 5430 - } 5431 - #endif 5432 - new_load = this_load; 5433 - /* 5434 - * Round up the averaging division if load is increasing. This 5435 - * prevents us from getting stuck on 9 if the load is 10, for 5436 - * example. 
5437 - */ 5438 - if (new_load > old_load) 5439 - new_load += scale - 1; 5440 - 5441 - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 5442 - } 5443 - } 5444 - 5445 - /* Used instead of source_load when we know the type == 0 */ 5446 - static unsigned long weighted_cpuload(struct rq *rq) 5404 + static unsigned long cpu_runnable_load(struct rq *rq) 5447 5405 { 5448 5406 return cfs_rq_runnable_load_avg(&rq->cfs); 5449 - } 5450 - 5451 - #ifdef CONFIG_NO_HZ_COMMON 5452 - /* 5453 - * There is no sane way to deal with nohz on smp when using jiffies because the 5454 - * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading 5455 - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5456 - * 5457 - * Therefore we need to avoid the delta approach from the regular tick when 5458 - * possible since that would seriously skew the load calculation. This is why we 5459 - * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on 5460 - * jiffies deltas for updates happening while in nohz mode (idle ticks, idle 5461 - * loop exit, nohz_idle_balance, nohz full exit...) 5462 - * 5463 - * This means we might still be one tick off for nohz periods. 5464 - */ 5465 - 5466 - static void cpu_load_update_nohz(struct rq *this_rq, 5467 - unsigned long curr_jiffies, 5468 - unsigned long load) 5469 - { 5470 - unsigned long pending_updates; 5471 - 5472 - pending_updates = curr_jiffies - this_rq->last_load_update_tick; 5473 - if (pending_updates) { 5474 - this_rq->last_load_update_tick = curr_jiffies; 5475 - /* 5476 - * In the regular NOHZ case, we were idle, this means load 0. 5477 - * In the NOHZ_FULL case, we were non-idle, we should consider 5478 - * its weighted load. 5479 - */ 5480 - cpu_load_update(this_rq, load, pending_updates); 5481 - } 5482 - } 5483 - 5484 - /* 5485 - * Called from nohz_idle_balance() to update the load ratings before doing the 5486 - * idle balance. 
5487 - */ 5488 - static void cpu_load_update_idle(struct rq *this_rq) 5489 - { 5490 - /* 5491 - * bail if there's load or we're actually up-to-date. 5492 - */ 5493 - if (weighted_cpuload(this_rq)) 5494 - return; 5495 - 5496 - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); 5497 - } 5498 - 5499 - /* 5500 - * Record CPU load on nohz entry so we know the tickless load to account 5501 - * on nohz exit. cpu_load[0] happens then to be updated more frequently 5502 - * than other cpu_load[idx] but it should be fine as cpu_load readers 5503 - * shouldn't rely into synchronized cpu_load[*] updates. 5504 - */ 5505 - void cpu_load_update_nohz_start(void) 5506 - { 5507 - struct rq *this_rq = this_rq(); 5508 - 5509 - /* 5510 - * This is all lockless but should be fine. If weighted_cpuload changes 5511 - * concurrently we'll exit nohz. And cpu_load write can race with 5512 - * cpu_load_update_idle() but both updater would be writing the same. 5513 - */ 5514 - this_rq->cpu_load[0] = weighted_cpuload(this_rq); 5515 - } 5516 - 5517 - /* 5518 - * Account the tickless load in the end of a nohz frame. 
5519 - */ 5520 - void cpu_load_update_nohz_stop(void) 5521 - { 5522 - unsigned long curr_jiffies = READ_ONCE(jiffies); 5523 - struct rq *this_rq = this_rq(); 5524 - unsigned long load; 5525 - struct rq_flags rf; 5526 - 5527 - if (curr_jiffies == this_rq->last_load_update_tick) 5528 - return; 5529 - 5530 - load = weighted_cpuload(this_rq); 5531 - rq_lock(this_rq, &rf); 5532 - update_rq_clock(this_rq); 5533 - cpu_load_update_nohz(this_rq, curr_jiffies, load); 5534 - rq_unlock(this_rq, &rf); 5535 - } 5536 - #else /* !CONFIG_NO_HZ_COMMON */ 5537 - static inline void cpu_load_update_nohz(struct rq *this_rq, 5538 - unsigned long curr_jiffies, 5539 - unsigned long load) { } 5540 - #endif /* CONFIG_NO_HZ_COMMON */ 5541 - 5542 - static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) 5543 - { 5544 - #ifdef CONFIG_NO_HZ_COMMON 5545 - /* See the mess around cpu_load_update_nohz(). */ 5546 - this_rq->last_load_update_tick = READ_ONCE(jiffies); 5547 - #endif 5548 - cpu_load_update(this_rq, load, 1); 5549 - } 5550 - 5551 - /* 5552 - * Called from scheduler_tick() 5553 - */ 5554 - void cpu_load_update_active(struct rq *this_rq) 5555 - { 5556 - unsigned long load = weighted_cpuload(this_rq); 5557 - 5558 - if (tick_nohz_tick_stopped()) 5559 - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); 5560 - else 5561 - cpu_load_update_periodic(this_rq, load); 5562 - } 5563 - 5564 - /* 5565 - * Return a low guess at the load of a migration-source CPU weighted 5566 - * according to the scheduling class and "nice" value. 5567 - * 5568 - * We want to under-estimate the load of migration sources, to 5569 - * balance conservatively. 
5570 - */ 5571 - static unsigned long source_load(int cpu, int type) 5572 - { 5573 - struct rq *rq = cpu_rq(cpu); 5574 - unsigned long total = weighted_cpuload(rq); 5575 - 5576 - if (type == 0 || !sched_feat(LB_BIAS)) 5577 - return total; 5578 - 5579 - return min(rq->cpu_load[type-1], total); 5580 - } 5581 - 5582 - /* 5583 - * Return a high guess at the load of a migration-target CPU weighted 5584 - * according to the scheduling class and "nice" value. 5585 - */ 5586 - static unsigned long target_load(int cpu, int type) 5587 - { 5588 - struct rq *rq = cpu_rq(cpu); 5589 - unsigned long total = weighted_cpuload(rq); 5590 - 5591 - if (type == 0 || !sched_feat(LB_BIAS)) 5592 - return total; 5593 - 5594 - return max(rq->cpu_load[type-1], total); 5595 5407 } 5596 5408 5597 5409 static unsigned long capacity_of(int cpu) ··· 5380 5638 { 5381 5639 struct rq *rq = cpu_rq(cpu); 5382 5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5383 - unsigned long load_avg = weighted_cpuload(rq); 5641 + unsigned long load_avg = cpu_runnable_load(rq); 5384 5642 5385 5643 if (nr_running) 5386 5644 return load_avg / nr_running; ··· 5478 5736 s64 this_eff_load, prev_eff_load; 5479 5737 unsigned long task_load; 5480 5738 5481 - this_eff_load = target_load(this_cpu, sd->wake_idx); 5739 + this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); 5482 5740 5483 5741 if (sync) { 5484 5742 unsigned long current_load = task_h_load(current); ··· 5496 5754 this_eff_load *= 100; 5497 5755 this_eff_load *= capacity_of(prev_cpu); 5498 5756 5499 - prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5757 + prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); 5500 5758 prev_eff_load -= task_load; 5501 5759 if (sched_feat(WA_BIAS)) 5502 5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; ··· 5557 5815 unsigned long this_runnable_load = ULONG_MAX; 5558 5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5559 5817 unsigned long most_spare = 0, this_spare = 0; 5560 - int 
load_idx = sd->forkexec_idx; 5561 5818 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5562 5819 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5563 5820 (sd->imbalance_pct-100) / 100; 5564 - 5565 - if (sd_flag & SD_BALANCE_WAKE) 5566 - load_idx = sd->wake_idx; 5567 5821 5568 5822 do { 5569 5823 unsigned long load, avg_load, runnable_load; ··· 5569 5831 5570 5832 /* Skip over this group if it has no CPUs allowed */ 5571 5833 if (!cpumask_intersects(sched_group_span(group), 5572 - &p->cpus_allowed)) 5834 + p->cpus_ptr)) 5573 5835 continue; 5574 5836 5575 5837 local_group = cpumask_test_cpu(this_cpu, ··· 5584 5846 max_spare_cap = 0; 5585 5847 5586 5848 for_each_cpu(i, sched_group_span(group)) { 5587 - /* Bias balancing toward CPUs of our domain */ 5588 - if (local_group) 5589 - load = source_load(i, load_idx); 5590 - else 5591 - load = target_load(i, load_idx); 5592 - 5849 + load = cpu_runnable_load(cpu_rq(i)); 5593 5850 runnable_load += load; 5594 5851 5595 5852 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); ··· 5696 5963 return cpumask_first(sched_group_span(group)); 5697 5964 5698 5965 /* Traverse only the allowed CPUs */ 5699 - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5966 + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { 5700 5967 if (available_idle_cpu(i)) { 5701 5968 struct rq *rq = cpu_rq(i); 5702 5969 struct cpuidle_state *idle = idle_get_state(rq); ··· 5720 5987 shallowest_idle_cpu = i; 5721 5988 } 5722 5989 } else if (shallowest_idle_cpu == -1) { 5723 - load = weighted_cpuload(cpu_rq(i)); 5990 + load = cpu_runnable_load(cpu_rq(i)); 5724 5991 if (load < min_load) { 5725 5992 min_load = load; 5726 5993 least_loaded_cpu = i; ··· 5736 6003 { 5737 6004 int new_cpu = cpu; 5738 6005 5739 - if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 6006 + if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) 5740 6007 return prev_cpu; 5741 6008 5742 6009 /* ··· 5853 6120 if (!test_idle_cores(target, 
false)) 5854 6121 return -1; 5855 6122 5856 - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 6123 + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 5857 6124 5858 6125 for_each_cpu_wrap(core, cpus, target) { 5859 6126 bool idle = true; ··· 5887 6154 return -1; 5888 6155 5889 6156 for_each_cpu(cpu, cpu_smt_mask(target)) { 5890 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6157 + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 5891 6158 continue; 5892 6159 if (available_idle_cpu(cpu)) 5893 6160 return cpu; ··· 5951 6218 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5952 6219 if (!--nr) 5953 6220 return -1; 5954 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6221 + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 5955 6222 continue; 5956 6223 if (available_idle_cpu(cpu)) 5957 6224 break; ··· 5988 6255 recent_used_cpu != target && 5989 6256 cpus_share_cache(recent_used_cpu, target) && 5990 6257 available_idle_cpu(recent_used_cpu) && 5991 - cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6258 + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { 5992 6259 /* 5993 6260 * Replace recent_used_cpu with prev as it is a potential 5994 6261 * candidate for the next wake: ··· 6232 6499 static long 6233 6500 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6234 6501 { 6235 - long util, max_util, sum_util, energy = 0; 6502 + unsigned int max_util, util_cfs, cpu_util, cpu_cap; 6503 + unsigned long sum_util, energy = 0; 6504 + struct task_struct *tsk; 6236 6505 int cpu; 6237 6506 6238 6507 for (; pd; pd = pd->next) { 6508 + struct cpumask *pd_mask = perf_domain_span(pd); 6509 + 6510 + /* 6511 + * The energy model mandates all the CPUs of a performance 6512 + * domain have the same capacity. 
6513 + */ 6514 + cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); 6239 6515 max_util = sum_util = 0; 6516 + 6240 6517 /* 6241 6518 * The capacity state of CPUs of the current rd can be driven by 6242 6519 * CPUs of another rd if they belong to the same performance ··· 6257 6514 * it will not appear in its pd list and will not be accounted 6258 6515 * by compute_energy(). 6259 6516 */ 6260 - for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { 6261 - util = cpu_util_next(cpu, p, dst_cpu); 6262 - util = schedutil_energy_util(cpu, util); 6263 - max_util = max(util, max_util); 6264 - sum_util += util; 6517 + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { 6518 + util_cfs = cpu_util_next(cpu, p, dst_cpu); 6519 + 6520 + /* 6521 + * Busy time computation: utilization clamping is not 6522 + * required since the ratio (sum_util / cpu_capacity) 6523 + * is already enough to scale the EM reported power 6524 + * consumption at the (eventually clamped) cpu_capacity. 6525 + */ 6526 + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, 6527 + ENERGY_UTIL, NULL); 6528 + 6529 + /* 6530 + * Performance domain frequency: utilization clamping 6531 + * must be considered since it affects the selection 6532 + * of the performance domain frequency. 6533 + * NOTE: in case RT tasks are running, by default the 6534 + * FREQUENCY_UTIL's utilization can be max OPP. 6535 + */ 6536 + tsk = cpu == dst_cpu ? p : NULL; 6537 + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, 6538 + FREQUENCY_UTIL, tsk); 6539 + max_util = max(max_util, cpu_util); 6265 6540 } 6266 6541 6267 6542 energy += em_pd_energy(pd->em_pd, max_util, sum_util); ··· 6362 6601 int max_spare_cap_cpu = -1; 6363 6602 6364 6603 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6365 - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6604 + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 6366 6605 continue; 6367 6606 6368 6607 /* Skip CPUs that will be overutilized. 
*/ ··· 6451 6690 } 6452 6691 6453 6692 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6454 - cpumask_test_cpu(cpu, &p->cpus_allowed); 6693 + cpumask_test_cpu(cpu, p->cpus_ptr); 6455 6694 } 6456 6695 6457 6696 rcu_read_lock(); ··· 7207 7446 /* 7208 7447 * We do not migrate tasks that are: 7209 7448 * 1) throttled_lb_pair, or 7210 - * 2) cannot be migrated to this CPU due to cpus_allowed, or 7449 + * 2) cannot be migrated to this CPU due to cpus_ptr, or 7211 7450 * 3) running (obviously), or 7212 7451 * 4) are cache-hot on their current CPU. 7213 7452 */ 7214 7453 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7215 7454 return 0; 7216 7455 7217 - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7456 + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { 7218 7457 int cpu; 7219 7458 7220 7459 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); ··· 7234 7473 7235 7474 /* Prevent to re-select dst_cpu via env's CPUs: */ 7236 7475 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7237 - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7476 + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { 7238 7477 env->flags |= LBF_DST_PINNED; 7239 7478 env->new_dst_cpu = cpu; 7240 7479 break; ··· 7320 7559 static const unsigned int sched_nr_migrate_break = 32; 7321 7560 7322 7561 /* 7323 - * detach_tasks() -- tries to detach up to imbalance weighted load from 7562 + * detach_tasks() -- tries to detach up to imbalance runnable load from 7324 7563 * busiest_rq, as part of a balancing operation within domain "sd". 7325 7564 * 7326 7565 * Returns number of detached tasks if successful and 0 otherwise. ··· 7388 7627 7389 7628 /* 7390 7629 * We only want to steal up to the prescribed amount of 7391 - * weighted load. 7630 + * runnable load. 
7392 7631 */ 7393 7632 if (env->imbalance <= 0) 7394 7633 break; ··· 7457 7696 rq_unlock(env->dst_rq, &rf); 7458 7697 } 7459 7698 7699 + #ifdef CONFIG_NO_HZ_COMMON 7460 7700 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 7461 7701 { 7462 7702 if (cfs_rq->avg.load_avg) ··· 7484 7722 7485 7723 return false; 7486 7724 } 7725 + 7726 + static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 7727 + { 7728 + rq->last_blocked_load_update_tick = jiffies; 7729 + 7730 + if (!has_blocked) 7731 + rq->has_blocked_load = 0; 7732 + } 7733 + #else 7734 + static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } 7735 + static inline bool others_have_blocked(struct rq *rq) { return false; } 7736 + static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} 7737 + #endif 7487 7738 7488 7739 #ifdef CONFIG_FAIR_GROUP_SCHED 7489 7740 ··· 7563 7788 if (others_have_blocked(rq)) 7564 7789 done = false; 7565 7790 7566 - #ifdef CONFIG_NO_HZ_COMMON 7567 - rq->last_blocked_load_update_tick = jiffies; 7568 - if (done) 7569 - rq->has_blocked_load = 0; 7570 - #endif 7791 + update_blocked_load_status(rq, !done); 7571 7792 rq_unlock_irqrestore(rq, &rf); 7572 7793 } 7573 7794 ··· 7629 7858 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); 7630 7859 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); 7631 7860 update_irq_load_avg(rq, 0); 7632 - #ifdef CONFIG_NO_HZ_COMMON 7633 - rq->last_blocked_load_update_tick = jiffies; 7634 - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) 7635 - rq->has_blocked_load = 0; 7636 - #endif 7861 + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); 7637 7862 rq_unlock_irqrestore(rq, &rf); 7638 7863 } 7639 7864 ··· 7647 7880 struct sg_lb_stats { 7648 7881 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7649 7882 unsigned long group_load; /* Total load over the CPUs of the 
group */ 7650 - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 7651 7883 unsigned long load_per_task; 7652 7884 unsigned long group_capacity; 7653 7885 unsigned long group_util; /* Total utilization of the group */ ··· 7700 7934 }; 7701 7935 } 7702 7936 7703 - /** 7704 - * get_sd_load_idx - Obtain the load index for a given sched domain. 7705 - * @sd: The sched_domain whose load_idx is to be obtained. 7706 - * @idle: The idle status of the CPU for whose sd load_idx is obtained. 7707 - * 7708 - * Return: The load index. 7709 - */ 7710 - static inline int get_sd_load_idx(struct sched_domain *sd, 7711 - enum cpu_idle_type idle) 7712 - { 7713 - int load_idx; 7714 - 7715 - switch (idle) { 7716 - case CPU_NOT_IDLE: 7717 - load_idx = sd->busy_idx; 7718 - break; 7719 - 7720 - case CPU_NEWLY_IDLE: 7721 - load_idx = sd->newidle_idx; 7722 - break; 7723 - default: 7724 - load_idx = sd->idle_idx; 7725 - break; 7726 - } 7727 - 7728 - return load_idx; 7729 - } 7730 - 7731 7937 static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) 7732 7938 { 7733 7939 struct rq *rq = cpu_rq(cpu); 7734 - unsigned long max = arch_scale_cpu_capacity(sd, cpu); 7940 + unsigned long max = arch_scale_cpu_capacity(cpu); 7735 7941 unsigned long used, free; 7736 7942 unsigned long irq; 7737 7943 ··· 7728 7990 unsigned long capacity = scale_rt_capacity(sd, cpu); 7729 7991 struct sched_group *sdg = sd->groups; 7730 7992 7731 - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); 7993 + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); 7732 7994 7733 7995 if (!capacity) 7734 7996 capacity = 1; ··· 7838 8100 7839 8101 /* 7840 8102 * Group imbalance indicates (and tries to solve) the problem where balancing 7841 - * groups is inadequate due to ->cpus_allowed constraints. 8103 + * groups is inadequate due to ->cpus_ptr constraints. 
7842 8104 * 7843 8105 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 7844 8106 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. ··· 7988 8250 struct sg_lb_stats *sgs, 7989 8251 int *sg_status) 7990 8252 { 7991 - int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); 7992 - int load_idx = get_sd_load_idx(env->sd, env->idle); 7993 - unsigned long load; 7994 8253 int i, nr_running; 7995 8254 7996 8255 memset(sgs, 0, sizeof(*sgs)); ··· 7998 8263 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) 7999 8264 env->flags |= LBF_NOHZ_AGAIN; 8000 8265 8001 - /* Bias balancing toward CPUs of our domain: */ 8002 - if (local_group) 8003 - load = target_load(i, load_idx); 8004 - else 8005 - load = source_load(i, load_idx); 8006 - 8007 - sgs->group_load += load; 8266 + sgs->group_load += cpu_runnable_load(rq); 8008 8267 sgs->group_util += cpu_util(i); 8009 8268 sgs->sum_nr_running += rq->cfs.h_nr_running; 8010 8269 ··· 8013 8284 sgs->nr_numa_running += rq->nr_numa_running; 8014 8285 sgs->nr_preferred_running += rq->nr_preferred_running; 8015 8286 #endif 8016 - sgs->sum_weighted_load += weighted_cpuload(rq); 8017 8287 /* 8018 8288 * No need to call idle_cpu() if nr_running is not 0 8019 8289 */ ··· 8031 8303 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 8032 8304 8033 8305 if (sgs->sum_nr_running) 8034 - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 8306 + sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; 8035 8307 8036 8308 sgs->group_weight = group->group_weight; 8037 8309 ··· 8245 8517 8246 8518 /* Update over-utilization (tipping point, U >= 0) indicator */ 8247 8519 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 8520 + trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); 8248 8521 } else if (sg_status & SG_OVERUTILIZED) { 8249 - WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); 8522 + struct 
root_domain *rd = env->dst_rq->rd; 8523 + 8524 + WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); 8525 + trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); 8250 8526 } 8251 8527 } 8252 8528 ··· 8456 8724 * find_busiest_group - Returns the busiest group within the sched_domain 8457 8725 * if there is an imbalance. 8458 8726 * 8459 - * Also calculates the amount of weighted load which should be moved 8727 + * Also calculates the amount of runnable load which should be moved 8460 8728 * to restore balance. 8461 8729 * 8462 8730 * @env: The load balancing environment. ··· 8501 8769 /* 8502 8770 * If the busiest group is imbalanced the below checks don't 8503 8771 * work because they assume all things are equal, which typically 8504 - * isn't true due to cpus_allowed constraints and the like. 8772 + * isn't true due to cpus_ptr constraints and the like. 8505 8773 */ 8506 8774 if (busiest->group_type == group_imbalanced) 8507 8775 goto force_balance; ··· 8575 8843 int i; 8576 8844 8577 8845 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8578 - unsigned long capacity, wl; 8846 + unsigned long capacity, load; 8579 8847 enum fbq_type rt; 8580 8848 8581 8849 rq = cpu_rq(i); ··· 8629 8897 rq->nr_running == 1) 8630 8898 continue; 8631 8899 8632 - wl = weighted_cpuload(rq); 8900 + load = cpu_runnable_load(rq); 8633 8901 8634 8902 /* 8635 - * When comparing with imbalance, use weighted_cpuload() 8903 + * When comparing with imbalance, use cpu_runnable_load() 8636 8904 * which is not scaled with the CPU capacity. 
8637 8905 */ 8638 8906 8639 - if (rq->nr_running == 1 && wl > env->imbalance && 8907 + if (rq->nr_running == 1 && load > env->imbalance && 8640 8908 !check_cpu_capacity(rq, env->sd)) 8641 8909 continue; 8642 8910 8643 8911 /* 8644 8912 * For the load comparisons with the other CPU's, consider 8645 - * the weighted_cpuload() scaled with the CPU capacity, so 8913 + * the cpu_runnable_load() scaled with the CPU capacity, so 8646 8914 * that the load can be moved away from the CPU that is 8647 8915 * potentially running at a lower capacity. 8648 8916 * 8649 - * Thus we're looking for max(wl_i / capacity_i), crosswise 8917 + * Thus we're looking for max(load_i / capacity_i), crosswise 8650 8918 * multiplication to rid ourselves of the division works out 8651 - * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8919 + * to: load_i * capacity_j > load_j * capacity_i; where j is 8652 8920 * our previous maximum. 8653 8921 */ 8654 - if (wl * busiest_capacity > busiest_load * capacity) { 8655 - busiest_load = wl; 8922 + if (load * busiest_capacity > busiest_load * capacity) { 8923 + busiest_load = load; 8656 8924 busiest_capacity = capacity; 8657 8925 busiest = rq; 8658 8926 } ··· 8943 9211 * if the curr task on busiest CPU can't be 8944 9212 * moved to this_cpu: 8945 9213 */ 8946 - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 9214 + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { 8947 9215 raw_spin_unlock_irqrestore(&busiest->lock, 8948 9216 flags); 8949 9217 env.flags |= LBF_ALL_PINNED; ··· 9612 9880 9613 9881 rq_lock_irqsave(rq, &rf); 9614 9882 update_rq_clock(rq); 9615 - cpu_load_update_idle(rq); 9616 9883 rq_unlock_irqrestore(rq, &rf); 9617 9884 9618 9885 if (flags & NOHZ_BALANCE_KICK) ··· 10422 10691 #ifdef CONFIG_FAIR_GROUP_SCHED 10423 10692 .task_change_group = task_change_group_fair, 10424 10693 #endif 10694 + 10695 + #ifdef CONFIG_UCLAMP_TASK 10696 + .uclamp_enabled = 1, 10697 + #endif 10425 10698 }; 10426 10699 10427 10700 
#ifdef CONFIG_SCHED_DEBUG ··· 10473 10738 #endif /* SMP */ 10474 10739 10475 10740 } 10741 + 10742 + /* 10743 + * Helper functions to facilitate extracting info from tracepoints. 10744 + */ 10745 + 10746 + const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) 10747 + { 10748 + #ifdef CONFIG_SMP 10749 + return cfs_rq ? &cfs_rq->avg : NULL; 10750 + #else 10751 + return NULL; 10752 + #endif 10753 + } 10754 + EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); 10755 + 10756 + char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) 10757 + { 10758 + if (!cfs_rq) { 10759 + if (str) 10760 + strlcpy(str, "(null)", len); 10761 + else 10762 + return NULL; 10763 + } 10764 + 10765 + cfs_rq_tg_path(cfs_rq, str, len); 10766 + return str; 10767 + } 10768 + EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); 10769 + 10770 + int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) 10771 + { 10772 + return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; 10773 + } 10774 + EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); 10775 + 10776 + const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) 10777 + { 10778 + #ifdef CONFIG_SMP 10779 + return rq ? &rq->avg_rt : NULL; 10780 + #else 10781 + return NULL; 10782 + #endif 10783 + } 10784 + EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); 10785 + 10786 + const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) 10787 + { 10788 + #ifdef CONFIG_SMP 10789 + return rq ? &rq->avg_dl : NULL; 10790 + #else 10791 + return NULL; 10792 + #endif 10793 + } 10794 + EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); 10795 + 10796 + const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) 10797 + { 10798 + #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) 10799 + return rq ? &rq->avg_irq : NULL; 10800 + #else 10801 + return NULL; 10802 + #endif 10803 + } 10804 + EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); 10805 + 10806 + int sched_trace_rq_cpu(struct rq *rq) 10807 + { 10808 + return rq ? 
cpu_of(rq) : -1; 10809 + } 10810 + EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); 10811 + 10812 + const struct cpumask *sched_trace_rd_span(struct root_domain *rd) 10813 + { 10814 + #ifdef CONFIG_SMP 10815 + return rd ? rd->span : NULL; 10816 + #else 10817 + return NULL; 10818 + #endif 10819 + } 10820 + EXPORT_SYMBOL_GPL(sched_trace_rd_span);

-1

kernel/sched/features.h

··· 39 39 40 40 SCHED_FEAT(HRTICK, false) 41 41 SCHED_FEAT(DOUBLE_TICK, false) 42 - SCHED_FEAT(LB_BIAS, false) 43 42 44 43 /* 45 44 * Decrement CPU capacity based on time not spent running tasks

+11 -2

kernel/sched/pelt.c

··· 28 28 #include "sched.h" 29 29 #include "pelt.h" 30 30 31 + #include <trace/events/sched.h> 32 + 31 33 /* 32 34 * Approximate: 33 35 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) ··· 267 265 { 268 266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 269 267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 268 + trace_pelt_se_tp(se); 270 269 return 1; 271 270 } 272 271 ··· 281 278 282 279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 283 280 cfs_se_util_change(&se->avg); 281 + trace_pelt_se_tp(se); 284 282 return 1; 285 283 } 286 284 ··· 296 292 cfs_rq->curr != NULL)) { 297 293 298 294 ___update_load_avg(&cfs_rq->avg, 1, 1); 295 + trace_pelt_cfs_tp(cfs_rq); 299 296 return 1; 300 297 } 301 298 ··· 322 317 running)) { 323 318 324 319 ___update_load_avg(&rq->avg_rt, 1, 1); 320 + trace_pelt_rt_tp(rq); 325 321 return 1; 326 322 } 327 323 ··· 346 340 running)) { 347 341 348 342 ___update_load_avg(&rq->avg_dl, 1, 1); 343 + trace_pelt_dl_tp(rq); 349 344 return 1; 350 345 } 351 346 ··· 373 366 * reflect the real amount of computation 374 367 */ 375 368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); 376 - running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 369 + running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); 377 370 378 371 /* 379 372 * We know the time that has been used by interrupt since last update ··· 395 388 1, 396 389 1); 397 390 398 - if (ret) 391 + if (ret) { 399 392 ___update_load_avg(&rq->avg_irq, 1, 1); 393 + trace_pelt_irq_tp(rq); 394 + } 400 395 401 396 return ret; 402 397 }

+1 -1

kernel/sched/pelt.h

··· 79 79 * Scale the elapsed time to reflect the real amount of 80 80 * computation 81 81 */ 82 - delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 82 + delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); 83 83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); 84 84 85 85 rq->clock_pelt += delta;

+6 -2

kernel/sched/rt.c

··· 1614 1614 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1615 1615 { 1616 1616 if (!task_running(rq, p) && 1617 - cpumask_test_cpu(cpu, &p->cpus_allowed)) 1617 + cpumask_test_cpu(cpu, p->cpus_ptr)) 1618 1618 return 1; 1619 1619 1620 1620 return 0; ··· 1751 1751 * Also make sure that it wasn't scheduled on its rq. 1752 1752 */ 1753 1753 if (unlikely(task_rq(task) != rq || 1754 - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1754 + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || 1755 1755 task_running(rq, task) || 1756 1756 !rt_task(task) || 1757 1757 !task_on_rq_queued(task))) { ··· 2400 2400 .switched_to = switched_to_rt, 2401 2401 2402 2402 .update_curr = update_curr_rt, 2403 + 2404 + #ifdef CONFIG_UCLAMP_TASK 2405 + .uclamp_enabled = 1, 2406 + #endif 2403 2407 }; 2404 2408 2405 2409 #ifdef CONFIG_RT_GROUP_SCHED

+1 -1

kernel/sched/sched-pelt.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 3 3 4 - static const u32 runnable_avg_yN_inv[] = { 4 + static const u32 runnable_avg_yN_inv[] __maybe_unused = { 5 5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 6 6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 7 7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,

+108 -26

kernel/sched/sched.h

··· 96 96 extern void calc_global_load_tick(struct rq *this_rq); 97 97 extern long calc_load_fold_active(struct rq *this_rq, long adjust); 98 98 99 - #ifdef CONFIG_SMP 100 - extern void cpu_load_update_active(struct rq *this_rq); 101 - #else 102 - static inline void cpu_load_update_active(struct rq *this_rq) { } 103 - #endif 104 - 105 99 /* 106 100 * Helpers for converting nanosecond timing to jiffy resolution 107 101 */ ··· 338 344 u64 runtime_expires; 339 345 int expires_seq; 340 346 341 - short idle; 342 - short period_active; 347 + u8 idle; 348 + u8 period_active; 349 + u8 distribute_running; 350 + u8 slack_started; 343 351 struct hrtimer period_timer; 344 352 struct hrtimer slack_timer; 345 353 struct list_head throttled_cfs_rq; ··· 350 354 int nr_periods; 351 355 int nr_throttled; 352 356 u64 throttled_time; 353 - 354 - bool distribute_running; 355 357 #endif 356 358 }; 357 359 ··· 791 797 #endif 792 798 #endif /* CONFIG_SMP */ 793 799 800 + #ifdef CONFIG_UCLAMP_TASK 801 + /* 802 + * struct uclamp_bucket - Utilization clamp bucket 803 + * @value: utilization clamp value for tasks on this clamp bucket 804 + * @tasks: number of RUNNABLE tasks on this clamp bucket 805 + * 806 + * Keep track of how many tasks are RUNNABLE for a given utilization 807 + * clamp value. 808 + */ 809 + struct uclamp_bucket { 810 + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); 811 + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); 812 + }; 813 + 814 + /* 815 + * struct uclamp_rq - rq's utilization clamp 816 + * @value: currently active clamp values for a rq 817 + * @bucket: utilization clamp buckets affecting a rq 818 + * 819 + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. 820 + * A clamp value is affecting a rq when there is at least one task RUNNABLE 821 + * (or actually running) with that value. 
822 + * 823 + * There are up to UCLAMP_CNT possible different clamp values, currently there 824 + * are only two: minimum utilization and maximum utilization. 825 + * 826 + * All utilization clamping values are MAX aggregated, since: 827 + * - for util_min: we want to run the CPU at least at the max of the minimum 828 + * utilization required by its currently RUNNABLE tasks. 829 + * - for util_max: we want to allow the CPU to run up to the max of the 830 + * maximum utilization allowed by its currently RUNNABLE tasks. 831 + * 832 + * Since on each system we expect only a limited number of different 833 + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track 834 + * the metrics required to compute all the per-rq utilization clamp values. 835 + */ 836 + struct uclamp_rq { 837 + unsigned int value; 838 + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; 839 + }; 840 + #endif /* CONFIG_UCLAMP_TASK */ 841 + 794 842 /* 795 843 * This is the main, per-CPU runqueue data structure. 
796 844 * ··· 854 818 unsigned int nr_preferred_running; 855 819 unsigned int numa_migrate_on; 856 820 #endif 857 - #define CPU_LOAD_IDX_MAX 5 858 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 859 821 #ifdef CONFIG_NO_HZ_COMMON 860 822 #ifdef CONFIG_SMP 861 823 unsigned long last_load_update_tick; ··· 864 830 atomic_t nohz_flags; 865 831 #endif /* CONFIG_NO_HZ_COMMON */ 866 832 867 - /* capture load from *all* tasks on this CPU: */ 868 - struct load_weight load; 869 833 unsigned long nr_load_updates; 870 834 u64 nr_switches; 835 + 836 + #ifdef CONFIG_UCLAMP_TASK 837 + /* Utilization clamp values based on CPU's RUNNABLE tasks */ 838 + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; 839 + unsigned int uclamp_flags; 840 + #define UCLAMP_FLAG_IDLE 0x01 841 + #endif 871 842 872 843 struct cfs_rq cfs; 873 844 struct rt_rq rt; ··· 1688 1649 struct sched_class { 1689 1650 const struct sched_class *next; 1690 1651 1652 + #ifdef CONFIG_UCLAMP_TASK 1653 + int uclamp_enabled; 1654 + #endif 1655 + 1691 1656 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1692 1657 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1693 1658 void (*yield_task) (struct rq *rq); ··· 2265 2222 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} 2266 2223 #endif /* CONFIG_CPU_FREQ */ 2267 2224 2225 + #ifdef CONFIG_UCLAMP_TASK 2226 + unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); 2227 + 2228 + static __always_inline 2229 + unsigned int uclamp_util_with(struct rq *rq, unsigned int util, 2230 + struct task_struct *p) 2231 + { 2232 + unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); 2233 + unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); 2234 + 2235 + if (p) { 2236 + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); 2237 + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); 2238 + } 2239 + 2240 + /* 2241 + * Since CPU's {min,max}_util clamps are MAX 
aggregated considering 2242 + * RUNNABLE tasks with _different_ clamps, we can end up with an 2243 + * inversion. Fix it now when the clamps are applied. 2244 + */ 2245 + if (unlikely(min_util >= max_util)) 2246 + return min_util; 2247 + 2248 + return clamp(util, min_util, max_util); 2249 + } 2250 + 2251 + static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) 2252 + { 2253 + return uclamp_util_with(rq, util, NULL); 2254 + } 2255 + #else /* CONFIG_UCLAMP_TASK */ 2256 + static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, 2257 + struct task_struct *p) 2258 + { 2259 + return util; 2260 + } 2261 + static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) 2262 + { 2263 + return util; 2264 + } 2265 + #endif /* CONFIG_UCLAMP_TASK */ 2266 + 2268 2267 #ifdef arch_scale_freq_capacity 2269 2268 # ifndef arch_scale_freq_invariant 2270 2269 # define arch_scale_freq_invariant() true ··· 2322 2237 } 2323 2238 #endif 2324 2239 2325 - #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2326 2240 /** 2327 2241 * enum schedutil_type - CPU utilization type 2328 2242 * @FREQUENCY_UTIL: Utilization used to select frequency ··· 2337 2253 ENERGY_UTIL, 2338 2254 }; 2339 2255 2340 - unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 2341 - unsigned long max, enum schedutil_type type); 2256 + #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2342 2257 2343 - static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2344 - { 2345 - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); 2346 - 2347 - return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); 2348 - } 2258 + unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, 2259 + unsigned long max, enum schedutil_type type, 2260 + struct task_struct *p); 2349 2261 2350 2262 static inline unsigned long cpu_bw_dl(struct rq *rq) 2351 2263 { ··· 2370 2290 return READ_ONCE(rq->avg_rt.util_avg); 2371 2291 } 2372 2292 #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2373 - 
static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2293 + static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, 2294 + unsigned long max, enum schedutil_type type, 2295 + struct task_struct *p) 2374 2296 { 2375 - return cfs; 2297 + return 0; 2376 2298 } 2377 - #endif 2299 + #endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2378 2300 2379 2301 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 2380 2302 static inline unsigned long cpu_util_irq(struct rq *rq)

+4 -14

kernel/sched/topology.c

··· 1344 1344 .imbalance_pct = 125, 1345 1345 1346 1346 .cache_nice_tries = 0, 1347 - .busy_idx = 0, 1348 - .idle_idx = 0, 1349 - .newidle_idx = 0, 1350 - .wake_idx = 0, 1351 - .forkexec_idx = 0, 1352 1347 1353 1348 .flags = 1*SD_LOAD_BALANCE 1354 1349 | 1*SD_BALANCE_NEWIDLE ··· 1395 1400 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1396 1401 sd->imbalance_pct = 117; 1397 1402 sd->cache_nice_tries = 1; 1398 - sd->busy_idx = 2; 1399 1403 1400 1404 #ifdef CONFIG_NUMA 1401 1405 } else if (sd->flags & SD_NUMA) { 1402 1406 sd->cache_nice_tries = 2; 1403 - sd->busy_idx = 3; 1404 - sd->idle_idx = 2; 1405 1407 1406 1408 sd->flags &= ~SD_PREFER_SIBLING; 1407 1409 sd->flags |= SD_SERIALIZE; ··· 1411 1419 #endif 1412 1420 } else { 1413 1421 sd->cache_nice_tries = 1; 1414 - sd->busy_idx = 2; 1415 - sd->idle_idx = 1; 1416 1422 } 1417 1423 1418 1424 /* ··· 1874 1884 unsigned long cap; 1875 1885 1876 1886 /* Is there any asymmetry? */ 1877 - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); 1887 + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); 1878 1888 1879 1889 for_each_cpu(i, cpu_map) { 1880 - if (arch_scale_cpu_capacity(NULL, i) != cap) { 1890 + if (arch_scale_cpu_capacity(i) != cap) { 1881 1891 asym = true; 1882 1892 break; 1883 1893 } ··· 1892 1902 * to everyone. 1893 1903 */ 1894 1904 for_each_cpu(i, cpu_map) { 1895 - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); 1905 + unsigned long max_capacity = arch_scale_cpu_capacity(i); 1896 1906 int tl_id = 0; 1897 1907 1898 1908 for_each_sd_topology(tl) { ··· 1902 1912 for_each_cpu_and(j, tl->mask(i), cpu_map) { 1903 1913 unsigned long capacity; 1904 1914 1905 - capacity = arch_scale_cpu_capacity(NULL, j); 1915 + capacity = arch_scale_cpu_capacity(j); 1906 1916 1907 1917 if (capacity <= max_capacity) 1908 1918 continue;

+2 -6

kernel/sched/wait.c

··· 118 118 bookmark.func = NULL; 119 119 INIT_LIST_HEAD(&bookmark.entry); 120 120 121 - spin_lock_irqsave(&wq_head->lock, flags); 122 - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); 123 - spin_unlock_irqrestore(&wq_head->lock, flags); 124 - 125 - while (bookmark.flags & WQ_FLAG_BOOKMARK) { 121 + do { 126 122 spin_lock_irqsave(&wq_head->lock, flags); 127 123 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, 128 124 wake_flags, key, &bookmark); 129 125 spin_unlock_irqrestore(&wq_head->lock, flags); 130 - } 126 + } while (bookmark.flags & WQ_FLAG_BOOKMARK); 131 127 } 132 128 133 129 /**

+16

kernel/sysctl.c

··· 452 452 .mode = 0644, 453 453 .proc_handler = sched_rr_handler, 454 454 }, 455 + #ifdef CONFIG_UCLAMP_TASK 456 + { 457 + .procname = "sched_util_clamp_min", 458 + .data = &sysctl_sched_uclamp_util_min, 459 + .maxlen = sizeof(unsigned int), 460 + .mode = 0644, 461 + .proc_handler = sysctl_sched_uclamp_handler, 462 + }, 463 + { 464 + .procname = "sched_util_clamp_max", 465 + .data = &sysctl_sched_uclamp_util_max, 466 + .maxlen = sizeof(unsigned int), 467 + .mode = 0644, 468 + .proc_handler = sysctl_sched_uclamp_handler, 469 + }, 470 + #endif 455 471 #ifdef CONFIG_SCHED_AUTOGROUP 456 472 { 457 473 .procname = "sched_autogroup_enabled",

-2

kernel/time/tick-sched.c

··· 782 782 */ 783 783 if (!ts->tick_stopped) { 784 784 calc_load_nohz_start(); 785 - cpu_load_update_nohz_start(); 786 785 quiet_vmstat(); 787 786 788 787 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ··· 828 829 { 829 830 /* Update jiffies first */ 830 831 tick_do_update_jiffies64(now); 831 - cpu_load_update_nohz_stop(); 832 832 833 833 /* 834 834 * Clear the timer idle flag, so we avoid IPIs on remote queueing and

+1 -1

kernel/trace/trace_hwlat.c

··· 277 277 * of this thread, than stop migrating for the duration 278 278 * of the current test. 279 279 */ 280 - if (!cpumask_equal(current_mask, &current->cpus_allowed)) 280 + if (!cpumask_equal(current_mask, current->cpus_ptr)) 281 281 goto disable; 282 282 283 283 get_online_cpus();

+1 -1

lib/smp_processor_id.c

··· 23 23 * Kernel threads bound to a single CPU can safely use 24 24 * smp_processor_id(): 25 25 */ 26 - if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) 26 + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) 27 27 goto out; 28 28 29 29 /*

+1 -1

samples/trace_events/trace-events-sample.c

··· 34 34 35 35 /* Silly tracepoints */ 36 36 trace_foo_bar("hello", cnt, array, random_strings[len], 37 - &current->cpus_allowed); 37 + current->cpus_ptr); 38 38 39 39 trace_foo_with_template_simple("HELLO", cnt); 40 40

Configure Feed

Configure Feed