Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/fair: Add EAS checks before updating root_domain::overutilized

root_domain::overutilized is only used for EAS (energy aware scheduler)
to decide whether to do load balance or not. It is not used if EAS is
not possible.

Currently enqueue_task_fair and task_tick_fair access, and sometimes update,
this field. In update_sd_lb_stats it is updated often. This causes cache
contention due to true sharing and burns a lot of cycles. ::overload and
::overutilized are part of the same cacheline. Updating it often invalidates
the cacheline. That causes access to ::overload to slow down due to
false sharing. Hence add an EAS check before accessing/updating this field.
The EAS check is either resolved at compile time or is a static branch.
Hence it shouldn't cost much.

With the patch, both enqueue_task_fair and newidle_balance don't show
up as hot routines in perf profile.

6.8-rc4:
7.18% swapper [kernel.vmlinux] [k] enqueue_task_fair
6.78% s [kernel.vmlinux] [k] newidle_balance

+patch:
0.14% swapper [kernel.vmlinux] [k] enqueue_task_fair
0.00% swapper [kernel.vmlinux] [k] newidle_balance

While at it: trace_sched_overutilized_tp expects its second argument to
be a bool. So do an int to bool conversion for that.

Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator")
Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Srikar Dronamraju <srikar@linux.ibm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20240307085725.444486-2-sshegde@linux.ibm.com

authored by

Shrikanth Hegde and committed by
Ingo Molnar
be3a51e6 58eeb2d7

+34 -19
+34 -19
kernel/sched/fair.c
··· 6673 6673 #ifdef CONFIG_SMP 6674 6674 static inline bool cpu_overutilized(int cpu) 6675 6675 { 6676 - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 6677 - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 6676 + unsigned long rq_util_min, rq_util_max; 6677 + 6678 + if (!sched_energy_enabled()) 6679 + return false; 6680 + 6681 + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 6682 + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 6678 6683 6679 6684 /* Return true only if the utilization doesn't fit CPU's capacity */ 6680 6685 return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 6681 6686 } 6682 6687 6683 - static inline void update_overutilized_status(struct rq *rq) 6688 + static inline void set_rd_overutilized_status(struct root_domain *rd, 6689 + unsigned int status) 6684 6690 { 6685 - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { 6686 - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 6687 - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); 6688 - } 6691 + if (!sched_energy_enabled()) 6692 + return; 6693 + 6694 + WRITE_ONCE(rd->overutilized, status); 6695 + trace_sched_overutilized_tp(rd, !!status); 6696 + } 6697 + 6698 + static inline void check_update_overutilized_status(struct rq *rq) 6699 + { 6700 + /* 6701 + * overutilized field is used for load balancing decisions only 6702 + * if energy aware scheduler is being used 6703 + */ 6704 + if (!sched_energy_enabled()) 6705 + return; 6706 + 6707 + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 6708 + set_rd_overutilized_status(rq->rd, SG_OVERUTILIZED); 6689 6709 } 6690 6710 #else 6691 - static inline void update_overutilized_status(struct rq *rq) { } 6711 + static inline void check_update_overutilized_status(struct rq *rq) { } 6692 6712 #endif 6693 6713 6694 6714 /* Runqueue only has SCHED_IDLE tasks enqueued */ ··· 6809 6789 * and the following generally works well enough in practice. 
6810 6790 */ 6811 6791 if (!task_new) 6812 - update_overutilized_status(rq); 6792 + check_update_overutilized_status(rq); 6813 6793 6814 6794 enqueue_throttle: 6815 6795 assert_list_leaf_cfs_rq(rq); ··· 10650 10630 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 10651 10631 10652 10632 if (!env->sd->parent) { 10653 - struct root_domain *rd = env->dst_rq->rd; 10654 - 10655 10633 /* update overload indicator if we are at root domain */ 10656 - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); 10634 + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); 10657 10635 10658 10636 /* Update over-utilization (tipping point, U >= 0) indicator */ 10659 - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 10660 - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); 10637 + set_rd_overutilized_status(env->dst_rq->rd, 10638 + sg_status & SG_OVERUTILIZED); 10661 10639 } else if (sg_status & SG_OVERUTILIZED) { 10662 - struct root_domain *rd = env->dst_rq->rd; 10663 - 10664 - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); 10665 - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); 10640 + set_rd_overutilized_status(env->dst_rq->rd, SG_OVERUTILIZED); 10666 10641 } 10667 10642 10668 10643 update_idle_cpu_scan(env, sum_util); ··· 12682 12667 task_tick_numa(rq, curr); 12683 12668 12684 12669 update_misfit_status(curr, rq); 12685 - update_overutilized_status(task_rq(curr)); 12670 + check_update_overutilized_status(task_rq(curr)); 12686 12671 12687 12672 task_tick_core(rq, curr); 12688 12673 }