Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/fair: Skip SCHED_IDLE rq for SCHED_IDLE task

CPUs whose rq only has SCHED_IDLE tasks running are considered to be
equivalent to truly idle CPUs in the wakeup path. For fork and exec, a
SCHED_IDLE CPU is even preferred over a truly idle one.
This is based on the assumption that the SCHED_IDLE CPU is not in an
idle state and might be in a higher P-state, allowing the task/wakee
to run immediately without sharing the rq.

However, this assumption doesn't hold if the wakee has SCHED_IDLE policy
itself, as it will share the rq with existing SCHED_IDLE tasks. In this
case, we are better off continuing to look for a truly idle CPU.

On an Intel Xeon 2-socket system with 64 logical cores in total, this yields
for kernel compilation using SCHED_IDLE:

+---------+----------------------+----------------------+--------+
| workers | mainline (seconds) | patch (seconds) | delta% |
+=========+======================+======================+========+
| 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 |
| 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 |
| 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 |
| 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 |
| 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 |
| 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 |
| 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 |
| 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 |
+---------+----------------------+----------------------+--------+

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260203184939.2138022-1-christian.loehle@arm.com

authored by

Christian Loehle and committed by
Peter Zijlstra
fd54d81c c2a57380

+19 -13
+19 -13
kernel/sched/fair.c
··· 7064 7064 rq->nr_running); 7065 7065 } 7066 7066 7067 - static int sched_idle_cpu(int cpu) 7067 + static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) 7068 7068 { 7069 - return sched_idle_rq(cpu_rq(cpu)); 7069 + return sched_idle_rq(rq) && !task_has_idle_policy(p); 7070 + } 7071 + 7072 + static int choose_idle_cpu(int cpu, struct task_struct *p) 7073 + { 7074 + return available_idle_cpu(cpu) || 7075 + choose_sched_idle_rq(cpu_rq(cpu), p); 7070 7076 } 7071 7077 7072 7078 static void ··· 7637 7631 if (!sched_core_cookie_match(rq, p)) 7638 7632 continue; 7639 7633 7640 - if (sched_idle_cpu(i)) 7634 + if (choose_sched_idle_rq(rq, p)) 7641 7635 return i; 7642 7636 7643 7637 if (available_idle_cpu(i)) { ··· 7728 7722 7729 7723 static inline int __select_idle_cpu(int cpu, struct task_struct *p) 7730 7724 { 7731 - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && 7732 - sched_cpu_cookie_match(cpu_rq(cpu), p)) 7725 + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) 7733 7726 return cpu; 7734 7727 7735 7728 return -1; ··· 7801 7796 if (!available_idle_cpu(cpu)) { 7802 7797 idle = false; 7803 7798 if (*idle_cpu == -1) { 7804 - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { 7799 + if (choose_sched_idle_rq(cpu_rq(cpu), p) && 7800 + cpumask_test_cpu(cpu, cpus)) { 7805 7801 *idle_cpu = cpu; 7806 7802 break; 7807 7803 } ··· 7837 7831 */ 7838 7832 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 7839 7833 continue; 7840 - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) 7834 + if (choose_idle_cpu(cpu, p)) 7841 7835 return cpu; 7842 7836 } 7843 7837 ··· 7959 7953 for_each_cpu_wrap(cpu, cpus, target) { 7960 7954 unsigned long cpu_cap = capacity_of(cpu); 7961 7955 7962 - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) 7956 + if (!choose_idle_cpu(cpu, p)) 7963 7957 continue; 7964 7958 7965 7959 fits = util_fits_cpu(task_util, util_min, util_max, cpu); ··· 8030 8024 */ 8031 8025 lockdep_assert_irqs_disabled(); 8032 
8026 8033 - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && 8027 + if (choose_idle_cpu(target, p) && 8034 8028 asym_fits_cpu(task_util, util_min, util_max, target)) 8035 8029 return target; 8036 8030 ··· 8038 8032 * If the previous CPU is cache affine and idle, don't be stupid: 8039 8033 */ 8040 8034 if (prev != target && cpus_share_cache(prev, target) && 8041 - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && 8035 + choose_idle_cpu(prev, p) && 8042 8036 asym_fits_cpu(task_util, util_min, util_max, prev)) { 8043 8037 8044 8038 if (!static_branch_unlikely(&sched_cluster_active) || ··· 8070 8064 if (recent_used_cpu != prev && 8071 8065 recent_used_cpu != target && 8072 8066 cpus_share_cache(recent_used_cpu, target) && 8073 - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && 8067 + choose_idle_cpu(recent_used_cpu, p) && 8074 8068 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && 8075 8069 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { 8076 8070 ··· 12537 12531 { 12538 12532 int continue_balancing = 1; 12539 12533 int cpu = rq->cpu; 12540 - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); 12534 + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); 12541 12535 unsigned long interval; 12542 12536 struct sched_domain *sd; 12543 12537 /* Earliest time when we have to do rebalance again */ ··· 12575 12569 * state even if we migrated tasks. Update it. 12576 12570 */ 12577 12571 idle = idle_cpu(cpu); 12578 - busy = !idle && !sched_idle_cpu(cpu); 12572 + busy = !idle && !sched_idle_rq(rq); 12579 12573 } 12580 12574 sd->last_balance = jiffies; 12581 12575 interval = get_sd_balance_interval(sd, busy);