Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched_ext: Implement load balancer for bypass mode

In bypass mode, tasks are queued on per-CPU bypass DSQs. While this works well
in most cases, there is a failure mode where a BPF scheduler can skew task
placement severely before triggering bypass in highly over-saturated systems.
If most tasks end up concentrated on a few CPUs, those CPUs can accumulate
queues that are too long to drain in a reasonable time, leading to RCU stalls
and hung tasks.

Implement a simple timer-based load balancer that redistributes tasks across
CPUs within each NUMA node. The balancer runs periodically (default 500ms,
tunable via bypass_lb_intv_us module parameter) and moves tasks from overloaded
CPUs to underloaded ones.

When moving tasks between bypass DSQs, the load balancer holds nested DSQ locks
to avoid dropping and reacquiring the donor DSQ lock on each iteration, as
donor DSQs can be very long and highly contended. Add the SCX_ENQ_NESTED flag
and use raw_spin_lock_nested() in dispatch_enqueue() to support this. The load
balancer timer function reads scx_bypass_depth locklessly to check whether
bypass mode is active. Use WRITE_ONCE() when updating scx_bypass_depth to pair
with the READ_ONCE() in the timer function.

This has been tested on a 192 CPU dual socket AMD EPYC machine with ~20k
runnable tasks running scx_cpu0. As scx_cpu0 queues all tasks to CPU0, almost
all tasks end up on CPU0, creating a severe imbalance. Without the load balancer,
disabling the scheduler can take a very long time to complete and can lead to
RCU stalls and hung tasks. With the load balancer, disabling completes in about
a second.

The load balancing operation can be monitored using the sched_ext_bypass_lb
tracepoint and disabled by setting bypass_lb_intv_us to 0.

v2: Lock both rq and DSQ in bypass_lb_cpu() and use dispatch_dequeue_locked()
to prevent races with dispatch_dequeue() (Andrea Righi).

Cc: Andrea Righi <arighi@nvidia.com>
Cc: Dan Schatzberg <schatzberg.dan@gmail.com>
Cc: Emil Tsalapatis <etsal@meta.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo 95d1df61 d18b96ce

+281 -3
+39
include/trace/events/sched_ext.h
··· 45 45 ) 46 46 ); 47 47 48 + TRACE_EVENT(sched_ext_bypass_lb, 49 + 50 + TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced, 51 + __u32 before_min, __u32 before_max, 52 + __u32 after_min, __u32 after_max), 53 + 54 + TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced, 55 + before_min, before_max, after_min, after_max), 56 + 57 + TP_STRUCT__entry( 58 + __field( __u32, node ) 59 + __field( __u32, nr_cpus ) 60 + __field( __u32, nr_tasks ) 61 + __field( __u32, nr_balanced ) 62 + __field( __u32, before_min ) 63 + __field( __u32, before_max ) 64 + __field( __u32, after_min ) 65 + __field( __u32, after_max ) 66 + ), 67 + 68 + TP_fast_assign( 69 + __entry->node = node; 70 + __entry->nr_cpus = nr_cpus; 71 + __entry->nr_tasks = nr_tasks; 72 + __entry->nr_balanced = nr_balanced; 73 + __entry->before_min = before_min; 74 + __entry->before_max = before_max; 75 + __entry->after_min = after_min; 76 + __entry->after_max = after_max; 77 + ), 78 + 79 + TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u", 80 + __entry->node, __entry->nr_cpus, 81 + __entry->nr_tasks, __entry->nr_balanced, 82 + __entry->before_min, __entry->after_min, 83 + __entry->before_max, __entry->after_max 84 + ) 85 + ); 86 + 48 87 #endif /* _TRACE_SCHED_EXT_H */ 49 88 50 89 /* This part must be outside protection */
+236 -3
kernel/sched/ext.c
··· 34 34 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 35 35 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 36 36 static int scx_bypass_depth; 37 + static cpumask_var_t scx_bypass_lb_donee_cpumask; 38 + static cpumask_var_t scx_bypass_lb_resched_cpumask; 37 39 static bool scx_aborting; 38 40 static bool scx_init_task_enabled; 39 41 static bool scx_switching_all; ··· 151 149 */ 152 150 static u64 scx_slice_dfl = SCX_SLICE_DFL; 153 151 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 152 + static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 154 153 155 154 static int set_slice_us(const char *val, const struct kernel_param *kp) 156 155 { ··· 163 160 .get = param_get_uint, 164 161 }; 165 162 163 + static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 164 + { 165 + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 166 + } 167 + 168 + static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 169 + .set = set_bypass_lb_intv_us, 170 + .get = param_get_uint, 171 + }; 172 + 166 173 #undef MODULE_PARAM_PREFIX 167 174 #define MODULE_PARAM_PREFIX "sched_ext." 168 175 169 176 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 170 177 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 178 + module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 179 + MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 171 180 172 181 #undef MODULE_PARAM_PREFIX 173 182 ··· 977 962 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 978 963 979 964 if (!is_local) { 980 - raw_spin_lock(&dsq->lock); 965 + raw_spin_lock_nested(&dsq->lock, 966 + (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); 967 + 981 968 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 982 969 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 983 970 /* fall back to the global dsq */ ··· 3761 3744 return true; 3762 3745 } 3763 3746 3747 + static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, 3748 + struct cpumask *donee_mask, struct cpumask *resched_mask, 3749 + u32 nr_donor_target, u32 nr_donee_target) 3750 + { 3751 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3752 + struct task_struct *p, *n; 3753 + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); 3754 + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 3755 + u32 nr_balanced = 0, min_delta_us; 3756 + 3757 + /* 3758 + * All we want to guarantee is reasonable forward progress. No reason to 3759 + * fine tune. Assuming every task on @donor_dsq runs their full slice, 3760 + * consider offloading iff the total queued duration is over the 3761 + * threshold. 3762 + */ 3763 + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; 3764 + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) 3765 + return 0; 3766 + 3767 + raw_spin_rq_lock_irq(rq); 3768 + raw_spin_lock(&donor_dsq->lock); 3769 + list_add(&cursor.node, &donor_dsq->list); 3770 + resume: 3771 + n = container_of(&cursor, struct task_struct, scx.dsq_list); 3772 + n = nldsq_next_task(donor_dsq, n, false); 3773 + 3774 + while ((p = n)) { 3775 + struct rq *donee_rq; 3776 + struct scx_dispatch_q *donee_dsq; 3777 + int donee; 3778 + 3779 + n = nldsq_next_task(donor_dsq, n, false); 3780 + 3781 + if (donor_dsq->nr <= nr_donor_target) 3782 + break; 3783 + 3784 + if (cpumask_empty(donee_mask)) 3785 + break; 3786 + 3787 + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 3788 + if (donee >= nr_cpu_ids) 3789 + continue; 3790 + 3791 + donee_rq = cpu_rq(donee); 3792 + donee_dsq = &donee_rq->scx.bypass_dsq; 3793 + 3794 + /* 3795 + * $p's rq is not locked but $p's DSQ lock protects 
its 3796 + * scheduling properties making this test safe. 3797 + */ 3798 + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) 3799 + continue; 3800 + 3801 + /* 3802 + * Moving $p from one non-local DSQ to another. The source rq 3803 + * and DSQ are already locked. Do an abbreviated dequeue and 3804 + * then perform enqueue without unlocking $donor_dsq. 3805 + * 3806 + * We don't want to drop and reacquire the lock on each 3807 + * iteration as @donor_dsq can be very long and potentially 3808 + * highly contended. Donee DSQs are less likely to be contended. 3809 + * The nested locking is safe as only this LB moves tasks 3810 + * between bypass DSQs. 3811 + */ 3812 + dispatch_dequeue_locked(p, donor_dsq); 3813 + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); 3814 + 3815 + /* 3816 + * $donee might have been idle and need to be woken up. No need 3817 + * to be clever. Kick every CPU that receives tasks. 3818 + */ 3819 + cpumask_set_cpu(donee, resched_mask); 3820 + 3821 + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 3822 + cpumask_clear_cpu(donee, donee_mask); 3823 + 3824 + nr_balanced++; 3825 + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 3826 + list_move_tail(&cursor.node, &n->scx.dsq_list.node); 3827 + raw_spin_unlock(&donor_dsq->lock); 3828 + raw_spin_rq_unlock_irq(rq); 3829 + cpu_relax(); 3830 + raw_spin_rq_lock_irq(rq); 3831 + raw_spin_lock(&donor_dsq->lock); 3832 + goto resume; 3833 + } 3834 + } 3835 + 3836 + list_del_init(&cursor.node); 3837 + raw_spin_unlock(&donor_dsq->lock); 3838 + raw_spin_rq_unlock_irq(rq); 3839 + 3840 + return nr_balanced; 3841 + } 3842 + 3843 + static void bypass_lb_node(struct scx_sched *sch, int node) 3844 + { 3845 + const struct cpumask *node_mask = cpumask_of_node(node); 3846 + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; 3847 + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; 3848 + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 3849 + u32 nr_target, nr_donor_target; 3850 + u32 
before_min = U32_MAX, before_max = 0; 3851 + u32 after_min = U32_MAX, after_max = 0; 3852 + int cpu; 3853 + 3854 + /* count the target tasks and CPUs */ 3855 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3856 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3857 + 3858 + nr_tasks += nr; 3859 + nr_cpus++; 3860 + 3861 + before_min = min(nr, before_min); 3862 + before_max = max(nr, before_max); 3863 + } 3864 + 3865 + if (!nr_cpus) 3866 + return; 3867 + 3868 + /* 3869 + * We don't want CPUs to have more than $nr_donor_target tasks and 3870 + * balancing to fill donee CPUs upto $nr_target. Once targets are 3871 + * calculated, find the donee CPUs. 3872 + */ 3873 + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 3874 + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 3875 + 3876 + cpumask_clear(donee_mask); 3877 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3878 + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) 3879 + cpumask_set_cpu(cpu, donee_mask); 3880 + } 3881 + 3882 + /* iterate !donee CPUs and see if they should be offloaded */ 3883 + cpumask_clear(resched_mask); 3884 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3885 + struct rq *rq = cpu_rq(cpu); 3886 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3887 + 3888 + if (cpumask_empty(donee_mask)) 3889 + break; 3890 + if (cpumask_test_cpu(cpu, donee_mask)) 3891 + continue; 3892 + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) 3893 + continue; 3894 + 3895 + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, 3896 + nr_donor_target, nr_target); 3897 + } 3898 + 3899 + for_each_cpu(cpu, resched_mask) { 3900 + struct rq *rq = cpu_rq(cpu); 3901 + 3902 + raw_spin_rq_lock_irq(rq); 3903 + resched_curr(rq); 3904 + raw_spin_rq_unlock_irq(rq); 3905 + } 3906 + 3907 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3908 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3909 + 3910 + after_min = min(nr, after_min); 3911 + after_max = 
max(nr, after_max); 3912 + 3913 + } 3914 + 3915 + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 3916 + before_min, before_max, after_min, after_max); 3917 + } 3918 + 3919 + /* 3920 + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 3921 + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 3922 + * bypass DSQs can be overloaded. If there are enough tasks to saturate other 3923 + * lightly loaded CPUs, such imbalance can lead to very high execution latency 3924 + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 3925 + * outcomes, a simple load balancing mechanism is implemented by the following 3926 + * timer which runs periodically while bypass mode is in effect. 3927 + */ 3928 + static void scx_bypass_lb_timerfn(struct timer_list *timer) 3929 + { 3930 + struct scx_sched *sch; 3931 + int node; 3932 + u32 intv_us; 3933 + 3934 + sch = rcu_dereference_all(scx_root); 3935 + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) 3936 + return; 3937 + 3938 + for_each_node_with_cpus(node) 3939 + bypass_lb_node(sch, node); 3940 + 3941 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 3942 + if (intv_us) 3943 + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 3944 + } 3945 + 3946 + static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); 3947 + 3764 3948 /** 3765 3949 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 3766 3950 * @bypass: true for bypass, false for unbypass ··· 4005 3787 sch = rcu_dereference_bh(scx_root); 4006 3788 4007 3789 if (bypass) { 4008 - scx_bypass_depth++; 3790 + u32 intv_us; 3791 + 3792 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); 4009 3793 WARN_ON_ONCE(scx_bypass_depth <= 0); 4010 3794 if (scx_bypass_depth != 1) 4011 3795 goto unlock; ··· 4015 3795 bypass_timestamp = ktime_get_ns(); 4016 3796 if (sch) 4017 3797 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 3798 + 3799 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 
3800 + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { 3801 + scx_bypass_lb_timer.expires = 3802 + jiffies + usecs_to_jiffies(intv_us); 3803 + add_timer_global(&scx_bypass_lb_timer); 3804 + } 4018 3805 } else { 4019 - scx_bypass_depth--; 3806 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); 4020 3807 WARN_ON_ONCE(scx_bypass_depth < 0); 4021 3808 if (scx_bypass_depth != 0) 4022 3809 goto unlock; ··· 7277 7050 if (ret < 0) { 7278 7051 pr_err("sched_ext: Failed to add global attributes\n"); 7279 7052 return ret; 7053 + } 7054 + 7055 + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || 7056 + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { 7057 + pr_err("sched_ext: Failed to allocate cpumasks\n"); 7058 + return -ENOMEM; 7280 7059 } 7281 7060 7282 7061 return 0;
+6
kernel/sched/ext_internal.h
··· 23 23 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 24 24 */ 25 25 SCX_TASK_ITER_BATCH = 32, 26 + 27 + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, 28 + SCX_BYPASS_LB_DONOR_PCT = 125, 29 + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, 30 + SCX_BYPASS_LB_BATCH = 256, 26 31 }; 27 32 28 33 enum scx_exit_kind { ··· 968 963 969 964 SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 970 965 SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 966 + SCX_ENQ_NESTED = 1LLU << 58, 971 967 }; 972 968 973 969 enum scx_deq_flags {