sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs

+6

include/linux/sched/ext.h

···
  62   62   	SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
  63   63   };
  64   64
       65 + struct scx_deferred_reenq_user {
       66 + 	struct list_head node;
       67 + 	u64 flags;
       68 + };
       69 +
  65   70   struct scx_dsq_pcpu {
  66   71   	struct scx_dispatch_q *dsq;
       72 + 	struct scx_deferred_reenq_user deferred_reenq_user;
  67   73   };
  68   74
  69   75   /*

+128

kernel/sched/ext.c

···
1181 1181   		}
1182 1182
1183 1183   		schedule_deferred(rq);
     1184 + 	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
     1185 + 		struct rq *rq = this_rq();
     1186 + 		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
     1187 + 		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
     1188 +
     1189 + 		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
     1190 + 			if (list_empty(&dru->node))
     1191 + 				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
     1192 + 			dru->flags |= reenq_flags;
     1193 + 		}
     1194 +
     1195 + 		schedule_deferred(rq);
1184 1196   	} else {
1185 1197   		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
1186 1198   	}
···
3796 3784   	}
3797 3785   }
3798 3786
     3787 + static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
     3788 + {
     3789 + 	struct rq *locked_rq = rq;
     3790 + 	struct scx_sched *sch = dsq->sched;
     3791 + 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
     3792 + 	struct task_struct *p;
     3793 + 	s32 nr_enqueued = 0;
     3794 +
     3795 + 	lockdep_assert_rq_held(rq);
     3796 +
     3797 + 	raw_spin_lock(&dsq->lock);
     3798 +
     3799 + 	while (likely(!READ_ONCE(sch->bypass_depth))) {
     3800 + 		struct rq *task_rq;
     3801 +
     3802 + 		p = nldsq_cursor_next_task(&cursor, dsq);
     3803 + 		if (!p)
     3804 + 			break;
     3805 +
     3806 + 		if (!task_should_reenq(p, reenq_flags))
     3807 + 			continue;
     3808 +
     3809 + 		task_rq = task_rq(p);
     3810 +
     3811 + 		if (locked_rq != task_rq) {
     3812 + 			if (locked_rq)
     3813 + 				raw_spin_rq_unlock(locked_rq);
     3814 + 			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
     3815 + 				raw_spin_unlock(&dsq->lock);
     3816 + 				raw_spin_rq_lock(task_rq);
     3817 + 				raw_spin_lock(&dsq->lock);
     3818 + 			}
     3819 + 			locked_rq = task_rq;
     3820 +
     3821 + 			/* did we lose @p while switching locks? */
     3822 + 			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
     3823 + 				continue;
     3824 + 		}
     3825 +
     3826 + 		/* @p is on @dsq, its rq and @dsq are locked */
     3827 + 		dispatch_dequeue_locked(p, dsq);
     3828 + 		raw_spin_unlock(&dsq->lock);
     3829 + 		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
     3830 +
     3831 + 		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
     3832 + 			raw_spin_rq_unlock(locked_rq);
     3833 + 			locked_rq = NULL;
     3834 + 			cpu_relax();
     3835 + 		}
     3836 +
     3837 + 		raw_spin_lock(&dsq->lock);
     3838 + 	}
     3839 +
     3840 + 	list_del_init(&cursor.node);
     3841 + 	raw_spin_unlock(&dsq->lock);
     3842 +
     3843 + 	if (locked_rq != rq) {
     3844 + 		if (locked_rq)
     3845 + 			raw_spin_rq_unlock(locked_rq);
     3846 + 		raw_spin_rq_lock(rq);
     3847 + 	}
     3848 + }
     3849 +
     3850 + static void process_deferred_reenq_users(struct rq *rq)
     3851 + {
     3852 + 	lockdep_assert_rq_held(rq);
     3853 +
     3854 + 	while (true) {
     3855 + 		struct scx_dispatch_q *dsq;
     3856 + 		u64 reenq_flags = 0;
     3857 +
     3858 + 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
     3859 + 			struct scx_deferred_reenq_user *dru =
     3860 + 				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
     3861 + 							 struct scx_deferred_reenq_user,
     3862 + 							 node);
     3863 + 			struct scx_dsq_pcpu *dsq_pcpu;
     3864 +
     3865 + 			if (!dru)
     3866 + 				return;
     3867 +
     3868 + 			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
     3869 + 						deferred_reenq_user);
     3870 + 			dsq = dsq_pcpu->dsq;
     3871 + 			swap(dru->flags, reenq_flags);
     3872 + 			list_del_init(&dru->node);
     3873 + 		}
     3874 +
     3875 + 		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
     3876 + 		reenq_user(rq, dsq, reenq_flags);
     3877 + 	}
     3878 + }
     3879 +
3799 3880   static void run_deferred(struct rq *rq)
3800 3881   {
3801 3882   	process_ddsp_deferred_locals(rq);
3802 3883
3803 3884   	if (!list_empty(&rq->scx.deferred_reenq_locals))
3804 3885   		process_deferred_reenq_locals(rq);
     3886 +
     3887 + 	if (!list_empty(&rq->scx.deferred_reenq_users))
     3888 + 		process_deferred_reenq_users(rq);
3805 3889   }
3806 3890
3807 3891   #ifdef CONFIG_NO_HZ_FULL
···
4227 4119   		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
4228 4120
4229 4121   		pcpu->dsq = dsq;
     4122 + 		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
4230 4123   	}
4231 4124
4232 4125   	return 0;
···
4235 4126
4236 4127   static void exit_dsq(struct scx_dispatch_q *dsq)
4237 4128   {
     4129 + 	s32 cpu;
     4130 +
     4131 + 	for_each_possible_cpu(cpu) {
     4132 + 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
     4133 + 		struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
     4134 + 		struct rq *rq = cpu_rq(cpu);
     4135 +
     4136 + 		/*
     4137 + 		 * There must have been a RCU grace period since the last
     4138 + 		 * insertion and @dsq should be off the deferred list by now.
     4139 + 		 */
     4140 + 		if (WARN_ON_ONCE(!list_empty(&dru->node))) {
     4141 + 			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
     4142 + 			list_del_init(&dru->node);
     4143 + 		}
     4144 + 	}
     4145 +
4238 4146   	free_percpu(dsq->pcpu);
4239 4147   }
4240 4148
···
7434 7308   		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
7435 7309   		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
7436 7310   		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
     7311 + 		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
7437 7312   		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
7438 7313   		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
7439 7314
···
8481 8354    * supported:
8482 8355    *
8483 8356    * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
     8357 +  * - User DSQs
8484 8358    *
8485 8359    * Re-enqueues are performed asynchronously. Can be called from anywhere.
8486 8360    */

+1

kernel/sched/sched.h

···
 810  810
 811  811   	raw_spinlock_t deferred_reenq_lock;
 812  812   	struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
      813 + 	struct list_head deferred_reenq_users; /* user DSQs requesting reenq */
 813  814   	struct balance_callback deferred_bal_cb;
 814  815   	struct irq_work deferred_irq_work;
 815  816   	struct irq_work kick_cpus_irq_work;

+55 -2

tools/sched_ext/scx_qmap.bpf.c

···
  26   26
  27   27   enum consts {
  28   28   	ONE_SEC_IN_NS = 1000000000,
       29 + 	ONE_MSEC_IN_NS = 1000000,
       30 + 	LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS,
  29   31   	SHARED_DSQ = 0,
  30   32   	HIGHPRI_DSQ = 1,
       33 + 	LOWPRI_DSQ = 2,
  31   34   	HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
  32   35   };
  33   36
···
 175  172   	if (!(tctx = lookup_task_ctx(p)))
 176  173   		return -ESRCH;
 177  174
      175 + 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
      176 + 		return prev_cpu;
      177 +
 178  178   	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
 179  179
 180  180   	if (cpu >= 0) {
···
 245  239   	if (tctx->force_local) {
 246  240   		tctx->force_local = false;
 247  241   		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
      242 + 		return;
      243 + 	}
      244 +
      245 + 	/* see lowpri_timerfn() */
      246 + 	if (__COMPAT_has_generic_reenq() &&
      247 + 	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
      248 + 		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
 248  249   		return;
 249  250   	}
 250  251
···
 886  873   	return 0;
 887  874   }
 888  875
      876 + struct lowpri_timer {
      877 + 	struct bpf_timer timer;
      878 + };
      879 +
      880 + struct {
      881 + 	__uint(type, BPF_MAP_TYPE_ARRAY);
      882 + 	__uint(max_entries, 1);
      883 + 	__type(key, u32);
      884 + 	__type(value, struct lowpri_timer);
      885 + } lowpri_timer SEC(".maps");
      886 +
      887 + /*
      888 +  * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
      889 +  * the tasks are transferred to SHARED_DSQ.
      890 +  */
      891 + static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
      892 + {
      893 + 	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
      894 + 	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
      895 + 	return 0;
      896 + }
      897 +
 889  898   s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 890  899   {
 891  900   	u32 key = 0;
···
 929  894   		return ret;
 930  895   	}
 931  896
      897 + 	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
      898 + 	if (ret)
      899 + 		return ret;
      900 +
 932  901   	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 933  902   	if (!timer)
 934  903   		return -ESRCH;
 935       -
 936  904   	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
 937  905   	bpf_timer_set_callback(timer, monitor_timerfn);
      906 + 	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
      907 + 	if (ret)
      908 + 		return ret;
 938  909
 939       - 	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
      910 + 	if (__COMPAT_has_generic_reenq()) {
      911 + 		/* see lowpri_timerfn() */
      912 + 		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
      913 + 		if (!timer)
      914 + 			return -ESRCH;
      915 + 		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
      916 + 		bpf_timer_set_callback(timer, lowpri_timerfn);
      917 + 		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
      918 + 		if (ret)
      919 + 			return ret;
      920 + 	}
      921 +
      922 + 	return 0;
 940  923   }
 941  924
 942  925   void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)

Configure Feed

Configure Feed