Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched_ext: Add lockless peek operation for DSQs

The builtin DSQ queue data structures are meant to be used by a wide
range of different sched_ext schedulers with different demands on these
data structures. They might be per-CPU queues with low contention, or
shared queues with high contention. Unfortunately, DSQs have a coarse-grained
lock around the whole data structure. Without going all the way to a
lock-free, more scalable implementation, a small step we can take to
reduce lock contention is to allow a lockless, small-fixed-cost peek at
the head of the queue.

This change allows certain custom SCX schedulers to cheaply peek at
queues, e.g. during load balancing, before locking them. The cost is a
few extra memory operations to update the pointer each time the DSQ is
modified, including a memory barrier on ARM so that the write appears
correctly ordered.

This commit adds a first_task pointer field which is updated
atomically when the DSQ is modified, and allows any thread to peek at
the head of the queue without holding the lock.

Signed-off-by: Ryan Newton <newton@meta.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Ryan Newton and committed by
Tejun Heo
44f5c8ec 347ed2d5

+76 -2
+1
include/linux/sched/ext.h
··· 58 58 */ 59 59 struct scx_dispatch_q { 60 60 raw_spinlock_t lock; 61 + struct task_struct __rcu *first_task; /* lockless peek at head */ 61 62 struct list_head list; /* tasks in dispatch order */ 62 63 struct rb_root priq; /* used to order by p->scx.dsq_vtime */ 63 64 u32 nr;
+56 -2
kernel/sched/ext.c
··· 965 965 container_of(rbp, struct task_struct, 966 966 scx.dsq_priq); 967 967 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 968 + /* first task unchanged - no update needed */ 968 969 } else { 969 970 list_add(&p->scx.dsq_list.node, &dsq->list); 971 + /* not builtin and new task is at head - use fastpath */ 972 + rcu_assign_pointer(dsq->first_task, p); 970 973 } 971 974 } else { 972 975 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ ··· 977 974 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 978 975 dsq->id); 979 976 980 - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 977 + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { 981 978 list_add(&p->scx.dsq_list.node, &dsq->list); 982 - else 979 + /* new task inserted at head - use fastpath */ 980 + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 981 + rcu_assign_pointer(dsq->first_task, p); 982 + } else { 983 + bool was_empty; 984 + 985 + was_empty = list_empty(&dsq->list); 983 986 list_add_tail(&p->scx.dsq_list.node, &dsq->list); 987 + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 988 + rcu_assign_pointer(dsq->first_task, p); 989 + } 984 990 } 985 991 986 992 /* seq records the order tasks are queued, used by BPF DSQ iterator */ ··· 1042 1030 rb_erase(&p->scx.dsq_priq, &dsq->priq); 1043 1031 RB_CLEAR_NODE(&p->scx.dsq_priq); 1044 1032 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; 1033 + } 1034 + 1035 + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { 1036 + struct task_struct *first_task; 1037 + 1038 + first_task = nldsq_next_task(dsq, NULL, false); 1039 + rcu_assign_pointer(dsq->first_task, first_task); 1045 1040 } 1046 1041 1047 1042 list_del_init(&p->scx.dsq_list.node); ··· 6311 6292 kit->dsq = NULL; 6312 6293 } 6313 6294 6295 + /** 6296 + * scx_bpf_dsq_peek - Lockless peek at the first element. 6297 + * @dsq_id: DSQ to examine. 6298 + * 6299 + * Read the first element in the DSQ. 
This is semantically equivalent to using 6300 + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 6301 + * this provides only a point-in-time snapshot, and the contents may change 6302 + * by the time any subsequent locking operation reads the queue. 6303 + * 6304 + * Returns the pointer, or NULL indicates an empty queue OR internal error. 6305 + */ 6306 + __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) 6307 + { 6308 + struct scx_sched *sch; 6309 + struct scx_dispatch_q *dsq; 6310 + 6311 + sch = rcu_dereference(scx_root); 6312 + if (unlikely(!sch)) 6313 + return NULL; 6314 + 6315 + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 6316 + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 6317 + return NULL; 6318 + } 6319 + 6320 + dsq = find_user_dsq(sch, dsq_id); 6321 + if (unlikely(!dsq)) { 6322 + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 6323 + return NULL; 6324 + } 6325 + 6326 + return rcu_dereference(dsq->first_task); 6327 + } 6328 + 6314 6329 __bpf_kfunc_end_defs(); 6315 6330 6316 6331 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, ··· 6904 6851 BTF_ID_FLAGS(func, scx_bpf_kick_cpu) 6905 6852 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) 6906 6853 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) 6854 + BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) 6907 6855 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) 6908 6856 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 6909 6857 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
+1
tools/sched_ext/include/scx/common.bpf.h
··· 74 74 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; 75 75 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; 76 76 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; 77 + struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; 77 78 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; 78 79 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; 79 80 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
+18
tools/sched_ext/include/scx/compat.bpf.h
··· 26 26 (bpf_ksym_exists(bpf_cpumask_populate) ? \ 27 27 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) 28 28 29 + /* 30 + * v6.19: Introduce lockless peek API for user DSQs. 31 + * 32 + * Preserve the following macro until v6.21. 33 + */ 34 + static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) 35 + { 36 + struct task_struct *p = NULL; 37 + struct bpf_iter_scx_dsq it; 38 + 39 + if (bpf_ksym_exists(scx_bpf_dsq_peek)) 40 + return scx_bpf_dsq_peek(dsq_id); 41 + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) 42 + p = bpf_iter_scx_dsq_next(&it); 43 + bpf_iter_scx_dsq_destroy(&it); 44 + return p; 45 + } 46 + 29 47 /** 30 48 * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on 31 49 * in a compatible way. We will preserve this __COMPAT helper until v6.16.