sched_ext: Implement load balancer for bypass mode

+39

include/trace/events/sched_ext.h

··· 45 45 ) 46 46 ); 47 47 48 + TRACE_EVENT(sched_ext_bypass_lb, 49 + 50 + TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced, 51 + __u32 before_min, __u32 before_max, 52 + __u32 after_min, __u32 after_max), 53 + 54 + TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced, 55 + before_min, before_max, after_min, after_max), 56 + 57 + TP_STRUCT__entry( 58 + __field( __u32, node ) 59 + __field( __u32, nr_cpus ) 60 + __field( __u32, nr_tasks ) 61 + __field( __u32, nr_balanced ) 62 + __field( __u32, before_min ) 63 + __field( __u32, before_max ) 64 + __field( __u32, after_min ) 65 + __field( __u32, after_max ) 66 + ), 67 + 68 + TP_fast_assign( 69 + __entry->node = node; 70 + __entry->nr_cpus = nr_cpus; 71 + __entry->nr_tasks = nr_tasks; 72 + __entry->nr_balanced = nr_balanced; 73 + __entry->before_min = before_min; 74 + __entry->before_max = before_max; 75 + __entry->after_min = after_min; 76 + __entry->after_max = after_max; 77 + ), 78 + 79 + TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u", 80 + __entry->node, __entry->nr_cpus, 81 + __entry->nr_tasks, __entry->nr_balanced, 82 + __entry->before_min, __entry->after_min, 83 + __entry->before_max, __entry->after_max 84 + ) 85 + ); 86 + 48 87 #endif /* _TRACE_SCHED_EXT_H */ 49 88 50 89 /* This part must be outside protection */

+236 -3

kernel/sched/ext.c

··· 34 34 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 35 35 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 36 36 static int scx_bypass_depth; 37 + static cpumask_var_t scx_bypass_lb_donee_cpumask; 38 + static cpumask_var_t scx_bypass_lb_resched_cpumask; 37 39 static bool scx_aborting; 38 40 static bool scx_init_task_enabled; 39 41 static bool scx_switching_all; ··· 151 149 */ 152 150 static u64 scx_slice_dfl = SCX_SLICE_DFL; 153 151 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 152 + static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 154 153 155 154 static int set_slice_us(const char *val, const struct kernel_param *kp) 156 155 { ··· 163 160 .get = param_get_uint, 164 161 }; 165 162 163 + static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 164 + { 165 + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 166 + } 167 + 168 + static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 169 + .set = set_bypass_lb_intv_us, 170 + .get = param_get_uint, 171 + }; 172 + 166 173 #undef MODULE_PARAM_PREFIX 167 174 #define MODULE_PARAM_PREFIX "sched_ext." 168 175 169 176 module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 170 177 MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 178 + module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 179 + MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 171 180 172 181 #undef MODULE_PARAM_PREFIX 173 182 ··· 977 962 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 978 963 979 964 if (!is_local) { 980 - raw_spin_lock(&dsq->lock); 965 + raw_spin_lock_nested(&dsq->lock, 966 + (enq_flags & SCX_ENQ_NESTED) ? 
SINGLE_DEPTH_NESTING : 0); 967 + 981 968 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 982 969 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 983 970 /* fall back to the global dsq */ ··· 3761 3744 return true; 3762 3745 } 3763 3746 3747 + static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, 3748 + struct cpumask *donee_mask, struct cpumask *resched_mask, 3749 + u32 nr_donor_target, u32 nr_donee_target) 3750 + { 3751 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3752 + struct task_struct *p, *n; 3753 + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); 3754 + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 3755 + u32 nr_balanced = 0, min_delta_us; 3756 + 3757 + /* 3758 + * All we want to guarantee is reasonable forward progress. No reason to 3759 + * fine tune. Assuming every task on @donor_dsq runs their full slice, 3760 + * consider offloading iff the total queued duration is over the 3761 + * threshold. 3762 + */ 3763 + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; 3764 + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) 3765 + return 0; 3766 + 3767 + raw_spin_rq_lock_irq(rq); 3768 + raw_spin_lock(&donor_dsq->lock); 3769 + list_add(&cursor.node, &donor_dsq->list); 3770 + resume: 3771 + n = container_of(&cursor, struct task_struct, scx.dsq_list); 3772 + n = nldsq_next_task(donor_dsq, n, false); 3773 + 3774 + while ((p = n)) { 3775 + struct rq *donee_rq; 3776 + struct scx_dispatch_q *donee_dsq; 3777 + int donee; 3778 + 3779 + n = nldsq_next_task(donor_dsq, n, false); 3780 + 3781 + if (donor_dsq->nr <= nr_donor_target) 3782 + break; 3783 + 3784 + if (cpumask_empty(donee_mask)) 3785 + break; 3786 + 3787 + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 3788 + if (donee >= nr_cpu_ids) 3789 + continue; 3790 + 3791 + donee_rq = cpu_rq(donee); 3792 + donee_dsq = &donee_rq->scx.bypass_dsq; 3793 + 3794 + /* 3795 + * $p's rq is not locked but $p's DSQ lock protects 
its 3796 + * scheduling properties making this test safe. 3797 + */ 3798 + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) 3799 + continue; 3800 + 3801 + /* 3802 + * Moving $p from one non-local DSQ to another. The source rq 3803 + * and DSQ are already locked. Do an abbreviated dequeue and 3804 + * then perform enqueue without unlocking $donor_dsq. 3805 + * 3806 + * We don't want to drop and reacquire the lock on each 3807 + * iteration as @donor_dsq can be very long and potentially 3808 + * highly contended. Donee DSQs are less likely to be contended. 3809 + * The nested locking is safe as only this LB moves tasks 3810 + * between bypass DSQs. 3811 + */ 3812 + dispatch_dequeue_locked(p, donor_dsq); 3813 + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); 3814 + 3815 + /* 3816 + * $donee might have been idle and need to be woken up. No need 3817 + * to be clever. Kick every CPU that receives tasks. 3818 + */ 3819 + cpumask_set_cpu(donee, resched_mask); 3820 + 3821 + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 3822 + cpumask_clear_cpu(donee, donee_mask); 3823 + 3824 + nr_balanced++; 3825 + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 3826 + list_move_tail(&cursor.node, &n->scx.dsq_list.node); 3827 + raw_spin_unlock(&donor_dsq->lock); 3828 + raw_spin_rq_unlock_irq(rq); 3829 + cpu_relax(); 3830 + raw_spin_rq_lock_irq(rq); 3831 + raw_spin_lock(&donor_dsq->lock); 3832 + goto resume; 3833 + } 3834 + } 3835 + 3836 + list_del_init(&cursor.node); 3837 + raw_spin_unlock(&donor_dsq->lock); 3838 + raw_spin_rq_unlock_irq(rq); 3839 + 3840 + return nr_balanced; 3841 + } 3842 + 3843 + static void bypass_lb_node(struct scx_sched *sch, int node) 3844 + { 3845 + const struct cpumask *node_mask = cpumask_of_node(node); 3846 + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; 3847 + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; 3848 + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 3849 + u32 nr_target, nr_donor_target; 3850 + u32 
before_min = U32_MAX, before_max = 0; 3851 + u32 after_min = U32_MAX, after_max = 0; 3852 + int cpu; 3853 + 3854 + /* count the target tasks and CPUs */ 3855 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3856 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3857 + 3858 + nr_tasks += nr; 3859 + nr_cpus++; 3860 + 3861 + before_min = min(nr, before_min); 3862 + before_max = max(nr, before_max); 3863 + } 3864 + 3865 + if (!nr_cpus) 3866 + return; 3867 + 3868 + /* 3869 + * We don't want CPUs to have more than $nr_donor_target tasks and 3870 + * want balancing to fill donee CPUs up to $nr_target. Once targets are 3871 + * calculated, find the donee CPUs. 3872 + */ 3873 + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 3874 + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 3875 + 3876 + cpumask_clear(donee_mask); 3877 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3878 + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) 3879 + cpumask_set_cpu(cpu, donee_mask); 3880 + } 3881 + 3882 + /* iterate !donee CPUs and see if they should be offloaded */ 3883 + cpumask_clear(resched_mask); 3884 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3885 + struct rq *rq = cpu_rq(cpu); 3886 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3887 + 3888 + if (cpumask_empty(donee_mask)) 3889 + break; 3890 + if (cpumask_test_cpu(cpu, donee_mask)) 3891 + continue; 3892 + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) 3893 + continue; 3894 + 3895 + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, 3896 + nr_donor_target, nr_target); 3897 + } 3898 + 3899 + for_each_cpu(cpu, resched_mask) { 3900 + struct rq *rq = cpu_rq(cpu); 3901 + 3902 + raw_spin_rq_lock_irq(rq); 3903 + resched_curr(rq); 3904 + raw_spin_rq_unlock_irq(rq); 3905 + } 3906 + 3907 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3908 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3909 + 3910 + after_min = min(nr, after_min); 3911 + after_max = 
max(nr, after_max); 3912 + 3913 + } 3914 + 3915 + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 3916 + before_min, before_max, after_min, after_max); 3917 + } 3918 + 3919 + /* 3920 + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 3921 + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 3922 + * bypass DSQs can be overloaded. If there are enough tasks to saturate other 3923 + * lightly loaded CPUs, such imbalance can lead to very high execution latency 3924 + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 3925 + * outcomes, a simple load balancing mechanism is implemented by the following 3926 + * timer which runs periodically while bypass mode is in effect. 3927 + */ 3928 + static void scx_bypass_lb_timerfn(struct timer_list *timer) 3929 + { 3930 + struct scx_sched *sch; 3931 + int node; 3932 + u32 intv_us; 3933 + 3934 + sch = rcu_dereference_all(scx_root); 3935 + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) 3936 + return; 3937 + 3938 + for_each_node_with_cpus(node) 3939 + bypass_lb_node(sch, node); 3940 + 3941 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 3942 + if (intv_us) 3943 + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 3944 + } 3945 + 3946 + static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); 3947 + 3764 3948 /** 3765 3949 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress 3766 3950 * @bypass: true for bypass, false for unbypass ··· 4005 3787 sch = rcu_dereference_bh(scx_root); 4006 3788 4007 3789 if (bypass) { 4008 - scx_bypass_depth++; 3790 + u32 intv_us; 3791 + 3792 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); 4009 3793 WARN_ON_ONCE(scx_bypass_depth <= 0); 4010 3794 if (scx_bypass_depth != 1) 4011 3795 goto unlock; ··· 4015 3795 bypass_timestamp = ktime_get_ns(); 4016 3796 if (sch) 4017 3797 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 3798 + 3799 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 
3800 + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { 3801 + scx_bypass_lb_timer.expires = 3802 + jiffies + usecs_to_jiffies(intv_us); 3803 + add_timer_global(&scx_bypass_lb_timer); 3804 + } 4018 3805 } else { 4019 - scx_bypass_depth--; 3806 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); 4020 3807 WARN_ON_ONCE(scx_bypass_depth < 0); 4021 3808 if (scx_bypass_depth != 0) 4022 3809 goto unlock; ··· 7277 7050 if (ret < 0) { 7278 7051 pr_err("sched_ext: Failed to add global attributes\n"); 7279 7052 return ret; 7053 + } 7054 + 7055 + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || 7056 + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { 7057 + pr_err("sched_ext: Failed to allocate cpumasks\n"); 7058 + return -ENOMEM; 7280 7059 } 7281 7060 7282 7061 return 0;

+6

kernel/sched/ext_internal.h

··· 23 23 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 24 24 */ 25 25 SCX_TASK_ITER_BATCH = 32, 26 + 27 + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, 28 + SCX_BYPASS_LB_DONOR_PCT = 125, 29 + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, 30 + SCX_BYPASS_LB_BATCH = 256, 26 31 }; 27 32 28 33 enum scx_exit_kind { ··· 968 963 969 964 SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 970 965 SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 966 + SCX_ENQ_NESTED = 1LLU << 58, 971 967 }; 972 968 973 969 enum scx_deq_flags {

Configure Feed

Configure Feed