Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched_ext-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo:

- Move C example schedulers back from the external scx repo to
tools/sched_ext as the authoritative source. scx_userland and
scx_pair are returning while scx_sdt (BPF arena-based task data
management) is new. These schedulers will be dropped from the
external repo.

- Improve error reporting by adding scx_bpf_error() calls when DSQ
creation fails across all in-tree schedulers

- Avoid redundant irq_work_queue() calls in destroy_dsq() by only
queueing when llist_add() indicates an empty list

- Fix flaky init_enable_count selftest by properly synchronizing
pre-forked children using a pipe instead of sleep()

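The selftest fix in the last bullet replaces a sleep()-based guess with an explicit handshake over a pipe: the parent forks its children up front and then blocks until every child has written a readiness byte. The sketch below shows the general shape of that pattern only; the child count, the direction of the handshake, and the helper names are assumptions for illustration, not the literal init_enable_count selftest code.

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define NR_CHILDREN 4

int main(void)
{
        int pipefd[2];
        char byte = 0;

        if (pipe(pipefd))
                return 1;

        for (int i = 0; i < NR_CHILDREN; i++) {
                pid_t pid = fork();

                if (pid < 0)
                        return 1;
                if (pid == 0) {
                        /* child: announce readiness over the pipe, then run its part of the test */
                        close(pipefd[0]);
                        if (write(pipefd[1], &byte, 1) != 1)
                                _exit(1);
                        /* ... per-child test work would go here ... */
                        _exit(0);
                }
        }

        close(pipefd[1]);               /* parent keeps only the read end */

        /* Block until every child has checked in, instead of sleeping and hoping. */
        for (int i = 0; i < NR_CHILDREN; i++) {
                if (read(pipefd[0], &byte, 1) != 1)
                        return 1;
        }

        /* All children are demonstrably past the readiness point; reap them. */
        for (int i = 0; i < NR_CHILDREN; i++)
                wait(NULL);
        return 0;
}

The point of the pattern is that the parent cannot proceed early: read() does not return until the corresponding write() has happened, so the test's init/enable counts are observed only after every pre-forked child is actually up.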
* tag 'sched_ext-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
selftests/sched_ext: Fix init_enable_count flakiness
tools/sched_ext: Fix data header access during free in scx_sdt
tools/sched_ext: Add error logging for dsq creation failures in remaining schedulers
tools/sched_ext: add arena based scheduler
tools/sched_ext: add scx_pair scheduler
tools/sched_ext: add scx_userland scheduler
sched_ext: Add error logging for dsq creation failures
sched_ext: Avoid multiple irq_work_queue() calls in destroy_dsq()

+2592 -21
+2 -2
kernel/sched/ext.c
···
         * operations inside scheduler locks.
         */
        dsq->id = SCX_DSQ_INVALID;
-       llist_add(&dsq->free_node, &dsqs_to_free);
-       irq_work_queue(&free_dsq_irq_work);
+       if (llist_add(&dsq->free_node, &dsqs_to_free))
+               irq_work_queue(&free_dsq_irq_work);
 
 out_unlock_dsq:
        raw_spin_unlock_irqrestore(&dsq->lock, flags);
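llist_add() returns true only when the list was empty beforehand, so the hunk above queues the irq_work exactly once per batch of DSQs being freed; the work function can then drain the whole batch in one pass. The following is a sketch of what such a consumer side typically looks like, reusing the dsqs_to_free and free_dsq_irq_work names from the hunk; it is not a verbatim copy of ext.c, and struct scx_dispatch_q with its free_node/rcu members is assumed from the surrounding kernel code.

#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

static LLIST_HEAD(dsqs_to_free);

/* Drain every DSQ queued since the last run; one irq_work covers the whole batch. */
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
        struct llist_node *to_free = llist_del_all(&dsqs_to_free);
        struct scx_dispatch_q *dsq, *tmp_dsq;

        llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
                kfree_rcu(dsq, rcu);    /* assumes an rcu_head member named "rcu" */
}

static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);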
+1 -1
tools/sched_ext/Makefile
···
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
        $(BINDIR)/%: \
+3 -1
tools/sched_ext/scx_central.bpf.c
···
        int ret;
 
        ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
-       if (ret)
+       if (ret) {
+               scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret);
                return ret;
+       }
 
        timer = bpf_map_lookup_elem(&central_timer, &key);
        if (!timer)
+9 -1
tools/sched_ext/scx_cpu0.bpf.c
···
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
 {
-       return scx_bpf_create_dsq(DSQ_CPU0, -1);
+       int ret;
+
+       ret = scx_bpf_create_dsq(DSQ_CPU0, -1);
+       if (ret) {
+               scx_bpf_error("failed to create DSQ %d (%d)", DSQ_CPU0, ret);
+               return ret;
+       }
+
+       return 0;
 }
 
 void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
+12 -2
tools/sched_ext/scx_flatcg.bpf.c
···
         * unlikely case that it breaks.
         */
        ret = scx_bpf_create_dsq(cgid, -1);
-       if (ret)
+       if (ret) {
+               scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret);
                return ret;
+       }
 
        cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
                                   BPF_LOCAL_STORAGE_GET_F_CREATE);
···
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
 {
-       return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
+       int ret;
+
+       ret = scx_bpf_create_dsq(FALLBACK_DSQ, -1);
+       if (ret) {
+               scx_bpf_error("failed to create DSQ %d (%d)", FALLBACK_DSQ, ret);
+               return ret;
+       }
+
+       return 0;
 }
 
 void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
+610
tools/sched_ext/scx_pair.bpf.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * A demo sched_ext core-scheduler which always makes every sibling CPU pair 4 + * execute from the same CPU cgroup. 5 + * 6 + * This scheduler is a minimal implementation and would need some form of 7 + * priority handling both inside each cgroup and across the cgroups to be 8 + * practically useful. 9 + * 10 + * Each CPU in the system is paired with exactly one other CPU, according to a 11 + * "stride" value that can be specified when the BPF scheduler program is first 12 + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee 13 + * that they will only ever schedule tasks that belong to the same CPU cgroup. 14 + * 15 + * Scheduler Initialization 16 + * ------------------------ 17 + * 18 + * The scheduler BPF program is first initialized from user space, before it is 19 + * enabled. During this initialization process, each CPU on the system is 20 + * assigned several values that are constant throughout its runtime: 21 + * 22 + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling 23 + * decisions. Paired CPUs always schedule tasks from the same 24 + * CPU cgroup, and synchronize with each other to guarantee 25 + * that this constraint is not violated. 26 + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access 27 + * a struct pair_ctx object that is shared between the pair. 28 + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the 29 + * pair. Each struct pair_ctx has an active_mask field, 30 + * which is a bitmap used to indicate whether each core 31 + * in the pair currently has an actively running task. 32 + * This index specifies which entry in the bitmap corresponds 33 + * to each CPU in the pair. 34 + * 35 + * During this initialization, the CPUs are paired according to a "stride" that 36 + * may be specified when invoking the user space program that initializes and 37 + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. 38 + * 39 + * Tasks and cgroups 40 + * ----------------- 41 + * 42 + * Every cgroup in the system is registered with the scheduler using the 43 + * pair_cgroup_init() callback, and every task in the system is associated with 44 + * exactly one cgroup. At a high level, the idea with the pair scheduler is to 45 + * always schedule tasks from the same cgroup within a given CPU pair. When a 46 + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its 47 + * cgroup ID is read from its task struct, and then a corresponding queue map 48 + * is used to FIFO-enqueue the task for that cgroup. 49 + * 50 + * If you look through the implementation of the scheduler, you'll notice that 51 + * there is quite a bit of complexity involved with looking up the per-cgroup 52 + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash 53 + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's 54 + * allocated in the BPF program. This is done because we use separate maps to 55 + * store the FIFO queue of tasks, and the length of that map, per cgroup. This 56 + * complexity is only present because of current deficiencies in BPF that will 57 + * soon be addressed. The main point to keep in mind is that newly enqueued 58 + * tasks are added to their cgroup's FIFO queue. 59 + * 60 + * Dispatching tasks 61 + * ----------------- 62 + * 63 + * This section will describe how enqueued tasks are dispatched and scheduled. 
64 + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is 65 + * as follows: 66 + * 67 + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is 68 + * the structure that's used to synchronize amongst the two pair CPUs in their 69 + * scheduling decisions. After any of the following events have occurred: 70 + * 71 + * - The cgroup's slice run has expired, or 72 + * - The cgroup becomes empty, or 73 + * - Either CPU in the pair is preempted by a higher priority scheduling class 74 + * 75 + * The cgroup transitions to the draining state and stops executing new tasks 76 + * from the cgroup. 77 + * 78 + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and 79 + * wait for the pair CPU to be preempted. 80 + * 81 + * 3. Otherwise, if the pair CPU is not running a task, we can move onto 82 + * scheduling new tasks. Pop the next cgroup id from the top_q queue. 83 + * 84 + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. 85 + * 86 + * Note again that this scheduling behavior is simple, but the implementation 87 + * is complex mostly because this it hits several BPF shortcomings and has to 88 + * work around in often awkward ways. Most of the shortcomings are expected to 89 + * be resolved in the near future which should allow greatly simplifying this 90 + * scheduler. 91 + * 92 + * Dealing with preemption 93 + * ----------------------- 94 + * 95 + * SCX is the lowest priority sched_class, and could be preempted by them at 96 + * any time. To address this, the scheduler implements pair_cpu_release() and 97 + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when 98 + * the scheduler loses and gains control of the CPU respectively. 99 + * 100 + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and 101 + * then invoke: 102 + * 103 + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); 104 + * 105 + * This preempts the pair CPU, and waits until it has re-entered the scheduler 106 + * before returning. This is necessary to ensure that the higher priority 107 + * sched_class that preempted our scheduler does not schedule a task 108 + * concurrently with our pair CPU. 109 + * 110 + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption 111 + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable 112 + * pair scheduling. 113 + * 114 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
115 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 116 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 117 + */ 118 + #include <scx/common.bpf.h> 119 + #include "scx_pair.h" 120 + 121 + char _license[] SEC("license") = "GPL"; 122 + 123 + /* !0 for veristat, set during init */ 124 + const volatile u32 nr_cpu_ids = 1; 125 + 126 + /* a pair of CPUs stay on a cgroup for this duration */ 127 + const volatile u32 pair_batch_dur_ns; 128 + 129 + /* cpu ID -> pair cpu ID */ 130 + const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); 131 + 132 + /* cpu ID -> pair_id */ 133 + const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); 134 + 135 + /* CPU ID -> CPU # in the pair (0 or 1) */ 136 + const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); 137 + 138 + struct pair_ctx { 139 + struct bpf_spin_lock lock; 140 + 141 + /* the cgroup the pair is currently executing */ 142 + u64 cgid; 143 + 144 + /* the pair started executing the current cgroup at */ 145 + u64 started_at; 146 + 147 + /* whether the current cgroup is draining */ 148 + bool draining; 149 + 150 + /* the CPUs that are currently active on the cgroup */ 151 + u32 active_mask; 152 + 153 + /* 154 + * the CPUs that are currently preempted and running tasks in a 155 + * different scheduler. 156 + */ 157 + u32 preempted_mask; 158 + }; 159 + 160 + struct { 161 + __uint(type, BPF_MAP_TYPE_ARRAY); 162 + __type(key, u32); 163 + __type(value, struct pair_ctx); 164 + } pair_ctx SEC(".maps"); 165 + 166 + /* queue of cgrp_q's possibly with tasks on them */ 167 + struct { 168 + __uint(type, BPF_MAP_TYPE_QUEUE); 169 + /* 170 + * Because it's difficult to build strong synchronization encompassing 171 + * multiple non-trivial operations in BPF, this queue is managed in an 172 + * opportunistic way so that we guarantee that a cgroup w/ active tasks 173 + * is always on it but possibly multiple times. Once we have more robust 174 + * synchronization constructs and e.g. linked list, we should be able to 175 + * do this in a prettier way but for now just size it big enough. 176 + */ 177 + __uint(max_entries, 4 * MAX_CGRPS); 178 + __type(value, u64); 179 + } top_q SEC(".maps"); 180 + 181 + /* per-cgroup q which FIFOs the tasks from the cgroup */ 182 + struct cgrp_q { 183 + __uint(type, BPF_MAP_TYPE_QUEUE); 184 + __uint(max_entries, MAX_QUEUED); 185 + __type(value, u32); 186 + }; 187 + 188 + /* 189 + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local 190 + * storage; however, a cgroup local storage can only be accessed from the BPF 191 + * progs attached to the cgroup. For now, work around by allocating array of 192 + * cgrp_q's and then allocating per-cgroup indices. 193 + * 194 + * Another caveat: It's difficult to populate a large array of maps statically 195 + * or from BPF. Initialize it from userland. 196 + */ 197 + struct { 198 + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); 199 + __uint(max_entries, MAX_CGRPS); 200 + __type(key, s32); 201 + __array(values, struct cgrp_q); 202 + } cgrp_q_arr SEC(".maps"); 203 + 204 + static u64 cgrp_q_len[MAX_CGRPS]; 205 + 206 + /* 207 + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be 208 + * useful to have as a map type. 209 + */ 210 + static u32 cgrp_q_idx_cursor; 211 + static u64 cgrp_q_idx_busy[MAX_CGRPS]; 212 + 213 + /* 214 + * All added up, the following is what we do: 215 + * 216 + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking 217 + * for a free ID. If not found, fail cgroup creation with -EBUSY. 218 + * 219 + * 2. 
Hash the cgroup ID to the allocated cgrp_q_idx in the following 220 + * cgrp_q_idx_hash. 221 + * 222 + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from 223 + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. 224 + * 225 + * This is sadly complicated for something pretty simple. Hopefully, we should 226 + * be able to simplify in the future. 227 + */ 228 + struct { 229 + __uint(type, BPF_MAP_TYPE_HASH); 230 + __uint(max_entries, MAX_CGRPS); 231 + __uint(key_size, sizeof(u64)); /* cgrp ID */ 232 + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ 233 + } cgrp_q_idx_hash SEC(".maps"); 234 + 235 + /* statistics */ 236 + u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; 237 + u64 nr_exps, nr_exp_waits, nr_exp_empty; 238 + u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; 239 + 240 + UEI_DEFINE(uei); 241 + 242 + void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) 243 + { 244 + struct cgroup *cgrp; 245 + struct cgrp_q *cgq; 246 + s32 pid = p->pid; 247 + u64 cgid; 248 + u32 *q_idx; 249 + u64 *cgq_len; 250 + 251 + __sync_fetch_and_add(&nr_total, 1); 252 + 253 + cgrp = scx_bpf_task_cgroup(p); 254 + cgid = cgrp->kn->id; 255 + bpf_cgroup_release(cgrp); 256 + 257 + /* find the cgroup's q and push @p into it */ 258 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 259 + if (!q_idx) { 260 + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); 261 + return; 262 + } 263 + 264 + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); 265 + if (!cgq) { 266 + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", 267 + cgid, *q_idx); 268 + return; 269 + } 270 + 271 + if (bpf_map_push_elem(cgq, &pid, 0)) { 272 + scx_bpf_error("cgroup[%llu] queue overflow", cgid); 273 + return; 274 + } 275 + 276 + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ 277 + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); 278 + if (!cgq_len) { 279 + scx_bpf_error("MEMBER_VTPR malfunction"); 280 + return; 281 + } 282 + 283 + if (!__sync_fetch_and_add(cgq_len, 1) && 284 + bpf_map_push_elem(&top_q, &cgid, 0)) { 285 + scx_bpf_error("top_q overflow"); 286 + return; 287 + } 288 + } 289 + 290 + static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) 291 + { 292 + u32 *vptr; 293 + 294 + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); 295 + if (!vptr) 296 + return -EINVAL; 297 + 298 + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); 299 + if (!(*pairc)) 300 + return -EINVAL; 301 + 302 + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); 303 + if (!vptr) 304 + return -EINVAL; 305 + 306 + *mask = 1U << *vptr; 307 + 308 + return 0; 309 + } 310 + 311 + __attribute__((noinline)) 312 + static int try_dispatch(s32 cpu) 313 + { 314 + struct pair_ctx *pairc; 315 + struct bpf_map *cgq_map; 316 + struct task_struct *p; 317 + u64 now = scx_bpf_now(); 318 + bool kick_pair = false; 319 + bool expired, pair_preempted; 320 + u32 *vptr, in_pair_mask; 321 + s32 pid, q_idx; 322 + u64 cgid; 323 + int ret; 324 + 325 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 326 + if (ret) { 327 + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", 328 + cpu); 329 + return -ENOENT; 330 + } 331 + 332 + bpf_spin_lock(&pairc->lock); 333 + pairc->active_mask &= ~in_pair_mask; 334 + 335 + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); 336 + if (expired || pairc->draining) { 337 + u64 new_cgid = 0; 338 + 339 + __sync_fetch_and_add(&nr_exps, 1); 340 + 341 + /* 342 + * We're done with 
the current cgid. An obvious optimization 343 + * would be not draining if the next cgroup is the current one. 344 + * For now, be dumb and always expire. 345 + */ 346 + pairc->draining = true; 347 + 348 + pair_preempted = pairc->preempted_mask; 349 + if (pairc->active_mask || pair_preempted) { 350 + /* 351 + * The other CPU is still active, or is no longer under 352 + * our control due to e.g. being preempted by a higher 353 + * priority sched_class. We want to wait until this 354 + * cgroup expires, or until control of our pair CPU has 355 + * been returned to us. 356 + * 357 + * If the pair controls its CPU, and the time already 358 + * expired, kick. When the other CPU arrives at 359 + * dispatch and clears its active mask, it'll push the 360 + * pair to the next cgroup and kick this CPU. 361 + */ 362 + __sync_fetch_and_add(&nr_exp_waits, 1); 363 + bpf_spin_unlock(&pairc->lock); 364 + if (expired && !pair_preempted) 365 + kick_pair = true; 366 + goto out_maybe_kick; 367 + } 368 + 369 + bpf_spin_unlock(&pairc->lock); 370 + 371 + /* 372 + * Pick the next cgroup. It'd be easier / cleaner to not drop 373 + * pairc->lock and use stronger synchronization here especially 374 + * given that we'll be switching cgroups significantly less 375 + * frequently than tasks. Unfortunately, bpf_spin_lock can't 376 + * really protect anything non-trivial. Let's do opportunistic 377 + * operations instead. 378 + */ 379 + bpf_repeat(BPF_MAX_LOOPS) { 380 + u32 *q_idx; 381 + u64 *cgq_len; 382 + 383 + if (bpf_map_pop_elem(&top_q, &new_cgid)) { 384 + /* no active cgroup, go idle */ 385 + __sync_fetch_and_add(&nr_exp_empty, 1); 386 + return 0; 387 + } 388 + 389 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); 390 + if (!q_idx) 391 + continue; 392 + 393 + /* 394 + * This is the only place where empty cgroups are taken 395 + * off the top_q. 396 + */ 397 + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); 398 + if (!cgq_len || !*cgq_len) 399 + continue; 400 + 401 + /* 402 + * If it has any tasks, requeue as we may race and not 403 + * execute it. 404 + */ 405 + bpf_map_push_elem(&top_q, &new_cgid, 0); 406 + break; 407 + } 408 + 409 + bpf_spin_lock(&pairc->lock); 410 + 411 + /* 412 + * The other CPU may already have started on a new cgroup while 413 + * we dropped the lock. Make sure that we're still draining and 414 + * start on the new cgroup. 
415 + */ 416 + if (pairc->draining && !pairc->active_mask) { 417 + __sync_fetch_and_add(&nr_cgrp_next, 1); 418 + pairc->cgid = new_cgid; 419 + pairc->started_at = now; 420 + pairc->draining = false; 421 + kick_pair = true; 422 + } else { 423 + __sync_fetch_and_add(&nr_cgrp_coll, 1); 424 + } 425 + } 426 + 427 + cgid = pairc->cgid; 428 + pairc->active_mask |= in_pair_mask; 429 + bpf_spin_unlock(&pairc->lock); 430 + 431 + /* again, it'd be better to do all these with the lock held, oh well */ 432 + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 433 + if (!vptr) { 434 + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); 435 + return -ENOENT; 436 + } 437 + q_idx = *vptr; 438 + 439 + /* claim one task from cgrp_q w/ q_idx */ 440 + bpf_repeat(BPF_MAX_LOOPS) { 441 + u64 *cgq_len, len; 442 + 443 + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); 444 + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { 445 + /* the cgroup must be empty, expire and repeat */ 446 + __sync_fetch_and_add(&nr_cgrp_empty, 1); 447 + bpf_spin_lock(&pairc->lock); 448 + pairc->draining = true; 449 + pairc->active_mask &= ~in_pair_mask; 450 + bpf_spin_unlock(&pairc->lock); 451 + return -EAGAIN; 452 + } 453 + 454 + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) 455 + continue; 456 + 457 + break; 458 + } 459 + 460 + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); 461 + if (!cgq_map) { 462 + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", 463 + cgid, q_idx); 464 + return -ENOENT; 465 + } 466 + 467 + if (bpf_map_pop_elem(cgq_map, &pid)) { 468 + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", 469 + cgid, q_idx); 470 + return -ENOENT; 471 + } 472 + 473 + p = bpf_task_from_pid(pid); 474 + if (p) { 475 + __sync_fetch_and_add(&nr_dispatched, 1); 476 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 477 + bpf_task_release(p); 478 + } else { 479 + /* we don't handle dequeues, retry on lost tasks */ 480 + __sync_fetch_and_add(&nr_missing, 1); 481 + return -EAGAIN; 482 + } 483 + 484 + out_maybe_kick: 485 + if (kick_pair) { 486 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 487 + if (pair) { 488 + __sync_fetch_and_add(&nr_kicks, 1); 489 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); 490 + } 491 + } 492 + return 0; 493 + } 494 + 495 + void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) 496 + { 497 + bpf_repeat(BPF_MAX_LOOPS) { 498 + if (try_dispatch(cpu) != -EAGAIN) 499 + break; 500 + } 501 + } 502 + 503 + void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) 504 + { 505 + int ret; 506 + u32 in_pair_mask; 507 + struct pair_ctx *pairc; 508 + bool kick_pair; 509 + 510 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 511 + if (ret) 512 + return; 513 + 514 + bpf_spin_lock(&pairc->lock); 515 + pairc->preempted_mask &= ~in_pair_mask; 516 + /* Kick the pair CPU, unless it was also preempted. 
*/ 517 + kick_pair = !pairc->preempted_mask; 518 + bpf_spin_unlock(&pairc->lock); 519 + 520 + if (kick_pair) { 521 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 522 + 523 + if (pair) { 524 + __sync_fetch_and_add(&nr_kicks, 1); 525 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); 526 + } 527 + } 528 + } 529 + 530 + void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 531 + { 532 + int ret; 533 + u32 in_pair_mask; 534 + struct pair_ctx *pairc; 535 + bool kick_pair; 536 + 537 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 538 + if (ret) 539 + return; 540 + 541 + bpf_spin_lock(&pairc->lock); 542 + pairc->preempted_mask |= in_pair_mask; 543 + pairc->active_mask &= ~in_pair_mask; 544 + /* Kick the pair CPU if it's still running. */ 545 + kick_pair = pairc->active_mask; 546 + pairc->draining = true; 547 + bpf_spin_unlock(&pairc->lock); 548 + 549 + if (kick_pair) { 550 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 551 + 552 + if (pair) { 553 + __sync_fetch_and_add(&nr_kicks, 1); 554 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); 555 + } 556 + } 557 + __sync_fetch_and_add(&nr_preemptions, 1); 558 + } 559 + 560 + s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) 561 + { 562 + u64 cgid = cgrp->kn->id; 563 + s32 i, q_idx; 564 + 565 + bpf_for(i, 0, MAX_CGRPS) { 566 + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; 567 + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) 568 + break; 569 + } 570 + if (i == MAX_CGRPS) 571 + return -EBUSY; 572 + 573 + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { 574 + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); 575 + if (busy) 576 + *busy = 0; 577 + return -EBUSY; 578 + } 579 + 580 + return 0; 581 + } 582 + 583 + void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) 584 + { 585 + u64 cgid = cgrp->kn->id; 586 + s32 *q_idx; 587 + 588 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 589 + if (q_idx) { 590 + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); 591 + if (busy) 592 + *busy = 0; 593 + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); 594 + } 595 + } 596 + 597 + void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) 598 + { 599 + UEI_RECORD(uei, ei); 600 + } 601 + 602 + SCX_OPS_DEFINE(pair_ops, 603 + .enqueue = (void *)pair_enqueue, 604 + .dispatch = (void *)pair_dispatch, 605 + .cpu_acquire = (void *)pair_cpu_acquire, 606 + .cpu_release = (void *)pair_cpu_release, 607 + .cgroup_init = (void *)pair_cgroup_init, 608 + .cgroup_exit = (void *)pair_cgroup_exit, 609 + .exit = (void *)pair_exit, 610 + .name = "pair");
+180
tools/sched_ext/scx_pair.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 + */ 7 + #include <stdio.h> 8 + #include <unistd.h> 9 + #include <inttypes.h> 10 + #include <signal.h> 11 + #include <assert.h> 12 + #include <libgen.h> 13 + #include <bpf/bpf.h> 14 + #include <scx/common.h> 15 + #include "scx_pair.h" 16 + #include "scx_pair.bpf.skel.h" 17 + 18 + const char help_fmt[] = 19 + "A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" 20 + "execute from the same CPU cgroup.\n" 21 + "\n" 22 + "See the top-level comment in .bpf.c for more details.\n" 23 + "\n" 24 + "Usage: %s [-S STRIDE]\n" 25 + "\n" 26 + " -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" 27 + " -v Print libbpf debug messages\n" 28 + " -h Display this help and exit\n"; 29 + 30 + static bool verbose; 31 + static volatile int exit_req; 32 + 33 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 34 + { 35 + if (level == LIBBPF_DEBUG && !verbose) 36 + return 0; 37 + return vfprintf(stderr, format, args); 38 + } 39 + 40 + static void sigint_handler(int dummy) 41 + { 42 + exit_req = 1; 43 + } 44 + 45 + int main(int argc, char **argv) 46 + { 47 + struct scx_pair *skel; 48 + struct bpf_link *link; 49 + __u64 seq = 0, ecode; 50 + __s32 stride, i, opt, outer_fd; 51 + 52 + libbpf_set_print(libbpf_print_fn); 53 + signal(SIGINT, sigint_handler); 54 + signal(SIGTERM, sigint_handler); 55 + restart: 56 + skel = SCX_OPS_OPEN(pair_ops, scx_pair); 57 + 58 + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); 59 + assert(skel->rodata->nr_cpu_ids > 0); 60 + skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 61 + 62 + /* pair up the earlier half to the latter by default, override with -s */ 63 + stride = skel->rodata->nr_cpu_ids / 2; 64 + 65 + while ((opt = getopt(argc, argv, "S:vh")) != -1) { 66 + switch (opt) { 67 + case 'S': 68 + stride = strtoul(optarg, NULL, 0); 69 + break; 70 + case 'v': 71 + verbose = true; 72 + break; 73 + default: 74 + fprintf(stderr, help_fmt, basename(argv[0])); 75 + return opt != 'h'; 76 + } 77 + } 78 + 79 + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); 80 + 81 + /* Resize arrays so their element count is equal to cpu count. 
*/ 82 + RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids); 83 + RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids); 84 + RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids); 85 + 86 + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) 87 + skel->rodata_pair_cpu->pair_cpu[i] = -1; 88 + 89 + printf("Pairs: "); 90 + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { 91 + int j = (i + stride) % skel->rodata->nr_cpu_ids; 92 + 93 + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) 94 + continue; 95 + 96 + SCX_BUG_ON(i == j, 97 + "Invalid stride %d - CPU%d wants to be its own pair", 98 + stride, i); 99 + 100 + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, 101 + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", 102 + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); 103 + 104 + skel->rodata_pair_cpu->pair_cpu[i] = j; 105 + skel->rodata_pair_cpu->pair_cpu[j] = i; 106 + skel->rodata_pair_id->pair_id[i] = i; 107 + skel->rodata_pair_id->pair_id[j] = i; 108 + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; 109 + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; 110 + 111 + printf("[%d, %d] ", i, j); 112 + } 113 + printf("\n"); 114 + 115 + SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei); 116 + 117 + /* 118 + * Populate the cgrp_q_arr map which is an array containing per-cgroup 119 + * queues. It'd probably be better to do this from BPF but there are too 120 + * many to initialize statically and there's no way to dynamically 121 + * populate from BPF. 122 + */ 123 + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); 124 + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); 125 + 126 + printf("Initializing"); 127 + for (i = 0; i < MAX_CGRPS; i++) { 128 + __s32 inner_fd; 129 + 130 + if (exit_req) 131 + break; 132 + 133 + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, 134 + sizeof(__u32), MAX_QUEUED, NULL); 135 + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", 136 + inner_fd); 137 + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), 138 + "Failed to set inner map"); 139 + close(inner_fd); 140 + 141 + if (!(i % 10)) 142 + printf("."); 143 + fflush(stdout); 144 + } 145 + printf("\n"); 146 + 147 + /* 148 + * Fully initialized, attach and run. 149 + */ 150 + link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair); 151 + 152 + while (!exit_req && !UEI_EXITED(skel, uei)) { 153 + printf("[SEQ %llu]\n", seq++); 154 + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", 155 + skel->bss->nr_total, 156 + skel->bss->nr_dispatched, 157 + skel->bss->nr_missing); 158 + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", 159 + skel->bss->nr_kicks, 160 + skel->bss->nr_preemptions); 161 + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", 162 + skel->bss->nr_exps, 163 + skel->bss->nr_exp_waits, 164 + skel->bss->nr_exp_empty); 165 + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", 166 + skel->bss->nr_cgrp_next, 167 + skel->bss->nr_cgrp_coll, 168 + skel->bss->nr_cgrp_empty); 169 + fflush(stdout); 170 + sleep(1); 171 + } 172 + 173 + bpf_link__destroy(link); 174 + ecode = UEI_REPORT(skel, uei); 175 + scx_pair__destroy(skel); 176 + 177 + if (UEI_ECODE_RESTART(ecode)) 178 + goto restart; 179 + return 0; 180 + }
+9
tools/sched_ext/scx_pair.h
···
+#ifndef __SCX_EXAMPLE_PAIR_H
+#define __SCX_EXAMPLE_PAIR_H
+
+enum {
+       MAX_QUEUED = 4096,
+       MAX_CGRPS = 4096,
+};
+
+#endif /* __SCX_EXAMPLE_PAIR_H */
+6 -2
tools/sched_ext/scx_qmap.bpf.c
···
        print_cpus();
 
        ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
-       if (ret)
+       if (ret) {
+               scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
                return ret;
+       }
 
        ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
-       if (ret)
+       if (ret) {
+               scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
                return ret;
+       }
 
        timer = bpf_map_lookup_elem(&monitor_timer, &key);
        if (!timer)
+716
tools/sched_ext/scx_sdt.bpf.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Arena-based task data scheduler. This is a variation of scx_simple 4 + * that uses a combined allocator and indexing structure to organize 5 + * task data. Task context allocation is done when a task enters the 6 + * scheduler, while freeing is done when it exits. Task contexts are 7 + * retrieved from task-local storage, pointing to the allocated memory. 8 + * 9 + * The main purpose of this scheduler is to demostrate arena memory 10 + * management. 11 + * 12 + * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. 13 + * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com> 14 + * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org> 15 + * 16 + */ 17 + #include <scx/common.bpf.h> 18 + #include <scx/bpf_arena_common.bpf.h> 19 + 20 + #include "scx_sdt.h" 21 + 22 + char _license[] SEC("license") = "GPL"; 23 + 24 + UEI_DEFINE(uei); 25 + 26 + struct { 27 + __uint(type, BPF_MAP_TYPE_ARENA); 28 + __uint(map_flags, BPF_F_MMAPABLE); 29 + #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) 30 + __uint(max_entries, 1 << 16); /* number of pages */ 31 + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ 32 + #else 33 + __uint(max_entries, 1 << 20); /* number of pages */ 34 + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ 35 + #endif 36 + } arena __weak SEC(".maps"); 37 + 38 + #define SHARED_DSQ 0 39 + 40 + #define DEFINE_SDT_STAT(metric) \ 41 + static inline void \ 42 + stat_inc_##metric(struct scx_stats __arena *stats) \ 43 + { \ 44 + cast_kern(stats); \ 45 + stats->metric += 1; \ 46 + } \ 47 + __u64 stat_##metric; \ 48 + 49 + DEFINE_SDT_STAT(enqueue); 50 + DEFINE_SDT_STAT(init); 51 + DEFINE_SDT_STAT(exit); 52 + DEFINE_SDT_STAT(select_idle_cpu); 53 + DEFINE_SDT_STAT(select_busy_cpu); 54 + 55 + /* 56 + * Necessary for cond_break/can_loop's semantics. According to kernel commit 57 + * 011832b, the loop counter variable must be seen as imprecise and bounded 58 + * by the verifier. Initializing it from a constant (e.g., i = 0;), then, 59 + * makes it precise and prevents may_goto from helping with converging the 60 + * loop. For these loops we must initialize the loop counter from a variable 61 + * whose value the verifier cannot reason about when checking the program, so 62 + * that the loop counter's value is imprecise. 63 + */ 64 + static __u64 zero = 0; 65 + 66 + /* 67 + * XXX Hack to get the verifier to find the arena for sdt_exit_task. 68 + * As of 6.12-rc5, The verifier associates arenas with programs by 69 + * checking LD.IMM instruction operands for an arena and populating 70 + * the program state with the first instance it finds. This requires 71 + * accessing our global arena variable, but scx methods do not necessarily 72 + * do so while still using pointers from that arena. Insert a bpf_printk 73 + * statement that triggers at most once to generate an LD.IMM instruction 74 + * to access the arena and help the verifier. 75 + */ 76 + static volatile bool scx_arena_verify_once; 77 + 78 + __hidden void scx_arena_subprog_init(void) 79 + { 80 + if (scx_arena_verify_once) 81 + return; 82 + 83 + bpf_printk("%s: arena pointer %p", __func__, &arena); 84 + scx_arena_verify_once = true; 85 + } 86 + 87 + 88 + private(LOCK) struct bpf_spin_lock alloc_lock; 89 + private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock; 90 + 91 + /* allocation pools */ 92 + struct sdt_pool desc_pool; 93 + struct sdt_pool chunk_pool; 94 + 95 + /* Protected by alloc_lock. 
*/ 96 + struct scx_alloc_stats alloc_stats; 97 + 98 + 99 + /* Allocate element from the pool. Must be called with a then pool lock held. */ 100 + static 101 + void __arena *scx_alloc_from_pool(struct sdt_pool *pool) 102 + { 103 + __u64 elem_size, max_elems; 104 + void __arena *slab; 105 + void __arena *ptr; 106 + 107 + elem_size = pool->elem_size; 108 + max_elems = pool->max_elems; 109 + 110 + /* If the chunk is spent, get a new one. */ 111 + if (pool->idx >= max_elems) { 112 + slab = bpf_arena_alloc_pages(&arena, NULL, 113 + div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0); 114 + if (!slab) 115 + return NULL; 116 + 117 + pool->slab = slab; 118 + pool->idx = 0; 119 + } 120 + 121 + ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx); 122 + pool->idx += 1; 123 + 124 + return ptr; 125 + } 126 + 127 + /* Alloc desc and associated chunk. Called with the allocator spinlock held. */ 128 + static sdt_desc_t *scx_alloc_chunk(void) 129 + { 130 + struct sdt_chunk __arena *chunk; 131 + sdt_desc_t *desc; 132 + sdt_desc_t *out; 133 + 134 + chunk = scx_alloc_from_pool(&chunk_pool); 135 + if (!chunk) 136 + return NULL; 137 + 138 + desc = scx_alloc_from_pool(&desc_pool); 139 + if (!desc) { 140 + /* 141 + * Effectively frees the previous chunk allocation. 142 + * Index cannot be 0, so decrementing is always 143 + * valid. 144 + */ 145 + chunk_pool.idx -= 1; 146 + return NULL; 147 + } 148 + 149 + out = desc; 150 + 151 + desc->nr_free = SDT_TASK_ENTS_PER_CHUNK; 152 + desc->chunk = chunk; 153 + 154 + alloc_stats.chunk_allocs += 1; 155 + 156 + return out; 157 + } 158 + 159 + static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages) 160 + { 161 + if (unlikely(data_size % 8)) 162 + return -EINVAL; 163 + 164 + if (unlikely(nr_pages == 0)) 165 + return -EINVAL; 166 + 167 + pool->elem_size = data_size; 168 + pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size; 169 + /* Populate the pool slab on the first allocation. */ 170 + pool->idx = pool->max_elems; 171 + 172 + return 0; 173 + } 174 + 175 + /* Initialize both the base pool allocators and the root chunk of the index. */ 176 + __hidden int 177 + scx_alloc_init(struct scx_allocator *alloc, __u64 data_size) 178 + { 179 + size_t min_chunk_size; 180 + int ret; 181 + 182 + _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE, 183 + "chunk size must fit into a page"); 184 + 185 + ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1); 186 + if (ret != 0) 187 + return ret; 188 + 189 + ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1); 190 + if (ret != 0) 191 + return ret; 192 + 193 + /* Wrap data into a descriptor and word align. */ 194 + data_size += sizeof(struct sdt_data); 195 + data_size = round_up(data_size, 8); 196 + 197 + /* 198 + * Ensure we allocate large enough chunks from the arena to avoid excessive 199 + * internal fragmentation when turning chunks it into structs. 
200 + */ 201 + min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE); 202 + ret = pool_set_size(&alloc->pool, data_size, min_chunk_size); 203 + if (ret != 0) 204 + return ret; 205 + 206 + bpf_spin_lock(&alloc_lock); 207 + alloc->root = scx_alloc_chunk(); 208 + bpf_spin_unlock(&alloc_lock); 209 + if (!alloc->root) 210 + return -ENOMEM; 211 + 212 + return 0; 213 + } 214 + 215 + static 216 + int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) 217 + { 218 + __u64 __arena *allocated = desc->allocated; 219 + __u64 bit; 220 + 221 + if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK)) 222 + return -EINVAL; 223 + 224 + bit = (__u64)1 << (pos % 64); 225 + 226 + if (state) 227 + allocated[pos / 64] |= bit; 228 + else 229 + allocated[pos / 64] &= ~bit; 230 + 231 + return 0; 232 + } 233 + 234 + static __noinline 235 + int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) 236 + { 237 + sdt_desc_t *desc; 238 + __u64 u, level; 239 + int ret; 240 + 241 + for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { 242 + level = SDT_TASK_LEVELS - 1 - u; 243 + 244 + /* Only propagate upwards if we are the parent's only free chunk. */ 245 + desc = lv_desc[level]; 246 + 247 + ret = set_idx_state(desc, lv_pos[level], false); 248 + if (unlikely(ret != 0)) 249 + return ret; 250 + 251 + desc->nr_free += 1; 252 + if (desc->nr_free > 1) 253 + return 0; 254 + } 255 + 256 + return 0; 257 + } 258 + 259 + /* 260 + * Free the allocated struct with the given index. Called with the 261 + * allocator lock taken. 262 + */ 263 + __hidden 264 + int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) 265 + { 266 + const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1; 267 + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; 268 + sdt_desc_t * __arena *desc_children; 269 + struct sdt_chunk __arena *chunk; 270 + sdt_desc_t *desc; 271 + struct sdt_data __arena *data; 272 + __u64 level, shift, pos; 273 + __u64 lv_pos[SDT_TASK_LEVELS]; 274 + int ret; 275 + int i; 276 + 277 + if (!alloc) 278 + return 0; 279 + 280 + desc = alloc->root; 281 + if (unlikely(!desc)) 282 + return -EINVAL; 283 + 284 + /* To appease the verifier. */ 285 + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 286 + lv_desc[level] = NULL; 287 + lv_pos[level] = 0; 288 + } 289 + 290 + /* Find the leaf node containing the index. */ 291 + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 292 + shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; 293 + pos = (idx >> shift) & mask; 294 + 295 + lv_desc[level] = desc; 296 + lv_pos[level] = pos; 297 + 298 + if (level == SDT_TASK_LEVELS - 1) 299 + break; 300 + 301 + chunk = desc->chunk; 302 + 303 + desc_children = (sdt_desc_t * __arena *)chunk->descs; 304 + desc = desc_children[pos]; 305 + 306 + if (unlikely(!desc)) 307 + return -EINVAL; 308 + } 309 + 310 + chunk = desc->chunk; 311 + 312 + pos = idx & mask; 313 + data = chunk->data[pos]; 314 + if (likely(data)) { 315 + *data = (struct sdt_data) { 316 + .tid.genn = data->tid.genn + 1, 317 + }; 318 + 319 + /* Zero out one word at a time. 
*/ 320 + for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { 321 + data->payload[i] = 0; 322 + } 323 + } 324 + 325 + ret = mark_nodes_avail(lv_desc, lv_pos); 326 + if (unlikely(ret != 0)) 327 + return ret; 328 + 329 + alloc_stats.active_allocs -= 1; 330 + alloc_stats.free_ops += 1; 331 + 332 + return 0; 333 + } 334 + 335 + static inline 336 + int ffs(__u64 word) 337 + { 338 + unsigned int num = 0; 339 + 340 + if ((word & 0xffffffff) == 0) { 341 + num += 32; 342 + word >>= 32; 343 + } 344 + 345 + if ((word & 0xffff) == 0) { 346 + num += 16; 347 + word >>= 16; 348 + } 349 + 350 + if ((word & 0xff) == 0) { 351 + num += 8; 352 + word >>= 8; 353 + } 354 + 355 + if ((word & 0xf) == 0) { 356 + num += 4; 357 + word >>= 4; 358 + } 359 + 360 + if ((word & 0x3) == 0) { 361 + num += 2; 362 + word >>= 2; 363 + } 364 + 365 + if ((word & 0x1) == 0) { 366 + num += 1; 367 + word >>= 1; 368 + } 369 + 370 + return num; 371 + } 372 + 373 + 374 + /* find the first empty slot */ 375 + __hidden 376 + __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc) 377 + { 378 + __u64 freeslots; 379 + __u64 i; 380 + 381 + for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) { 382 + freeslots = ~desc->allocated[i]; 383 + if (freeslots == (__u64)0) 384 + continue; 385 + 386 + return (i * 64) + ffs(freeslots); 387 + } 388 + 389 + return SDT_TASK_ENTS_PER_CHUNK; 390 + } 391 + 392 + /* 393 + * Find and return an available idx on the allocator. 394 + * Called with the task spinlock held. 395 + */ 396 + static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp) 397 + { 398 + sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; 399 + sdt_desc_t * __arena *desc_children; 400 + struct sdt_chunk __arena *chunk; 401 + sdt_desc_t *tmp; 402 + __u64 lv_pos[SDT_TASK_LEVELS]; 403 + __u64 u, pos, level; 404 + __u64 idx = 0; 405 + int ret; 406 + 407 + for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 408 + pos = chunk_find_empty(desc); 409 + 410 + /* If we error out, something has gone very wrong. */ 411 + if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK)) 412 + return NULL; 413 + 414 + if (pos == SDT_TASK_ENTS_PER_CHUNK) 415 + return NULL; 416 + 417 + idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT; 418 + idx |= pos; 419 + 420 + /* Log the levels to complete allocation. */ 421 + lv_desc[level] = desc; 422 + lv_pos[level] = pos; 423 + 424 + /* The rest of the loop is for internal node traversal. */ 425 + if (level == SDT_TASK_LEVELS - 1) 426 + break; 427 + 428 + /* Allocate an internal node if necessary. */ 429 + chunk = desc->chunk; 430 + desc_children = (sdt_desc_t * __arena *)chunk->descs; 431 + 432 + desc = desc_children[pos]; 433 + if (!desc) { 434 + desc = scx_alloc_chunk(); 435 + if (!desc) 436 + return NULL; 437 + 438 + desc_children[pos] = desc; 439 + } 440 + } 441 + 442 + /* 443 + * Finding the descriptor along with any internal node 444 + * allocations was successful. Update all levels with 445 + * the new allocation. 
446 + */ 447 + bpf_for(u, 0, SDT_TASK_LEVELS) { 448 + level = SDT_TASK_LEVELS - 1 - u; 449 + tmp = lv_desc[level]; 450 + 451 + ret = set_idx_state(tmp, lv_pos[level], true); 452 + if (ret != 0) 453 + break; 454 + 455 + tmp->nr_free -= 1; 456 + if (tmp->nr_free > 0) 457 + break; 458 + } 459 + 460 + *idxp = idx; 461 + 462 + return desc; 463 + } 464 + 465 + __hidden 466 + void __arena *scx_alloc(struct scx_allocator *alloc) 467 + { 468 + struct sdt_data __arena *data = NULL; 469 + struct sdt_chunk __arena *chunk; 470 + sdt_desc_t *desc; 471 + __u64 idx, pos; 472 + 473 + if (!alloc) 474 + return NULL; 475 + 476 + bpf_spin_lock(&alloc_lock); 477 + 478 + /* We unlock if we encounter an error in the function. */ 479 + desc = desc_find_empty(alloc->root, &idx); 480 + if (unlikely(desc == NULL)) { 481 + bpf_spin_unlock(&alloc_lock); 482 + return NULL; 483 + } 484 + 485 + chunk = desc->chunk; 486 + 487 + /* Populate the leaf node if necessary. */ 488 + pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1); 489 + data = chunk->data[pos]; 490 + if (!data) { 491 + data = scx_alloc_from_pool(&alloc->pool); 492 + if (!data) { 493 + scx_alloc_free_idx(alloc, idx); 494 + bpf_spin_unlock(&alloc_lock); 495 + return NULL; 496 + } 497 + } 498 + 499 + chunk->data[pos] = data; 500 + 501 + /* The data counts as a chunk */ 502 + alloc_stats.data_allocs += 1; 503 + alloc_stats.alloc_ops += 1; 504 + alloc_stats.active_allocs += 1; 505 + 506 + data->tid.idx = idx; 507 + 508 + bpf_spin_unlock(&alloc_lock); 509 + 510 + return data; 511 + } 512 + 513 + /* 514 + * Task BPF map entry recording the task's assigned ID and pointing to the data 515 + * area allocated in arena. 516 + */ 517 + struct scx_task_map_val { 518 + union sdt_id tid; 519 + __u64 tptr; 520 + struct sdt_data __arena *data; 521 + }; 522 + 523 + struct { 524 + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 525 + __uint(map_flags, BPF_F_NO_PREALLOC); 526 + __type(key, int); 527 + __type(value, struct scx_task_map_val); 528 + } scx_task_map SEC(".maps"); 529 + 530 + static struct scx_allocator scx_task_allocator; 531 + 532 + __hidden 533 + void __arena *scx_task_alloc(struct task_struct *p) 534 + { 535 + struct sdt_data __arena *data = NULL; 536 + struct scx_task_map_val *mval; 537 + 538 + mval = bpf_task_storage_get(&scx_task_map, p, 0, 539 + BPF_LOCAL_STORAGE_GET_F_CREATE); 540 + if (!mval) 541 + return NULL; 542 + 543 + data = scx_alloc(&scx_task_allocator); 544 + if (unlikely(!data)) 545 + return NULL; 546 + 547 + mval->tid = data->tid; 548 + mval->tptr = (__u64) p; 549 + mval->data = data; 550 + 551 + return (void __arena *)data->payload; 552 + } 553 + 554 + __hidden 555 + int scx_task_init(__u64 data_size) 556 + { 557 + return scx_alloc_init(&scx_task_allocator, data_size); 558 + } 559 + 560 + __hidden 561 + void __arena *scx_task_data(struct task_struct *p) 562 + { 563 + struct sdt_data __arena *data; 564 + struct scx_task_map_val *mval; 565 + 566 + scx_arena_subprog_init(); 567 + 568 + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); 569 + if (!mval) 570 + return NULL; 571 + 572 + data = mval->data; 573 + 574 + return (void __arena *)data->payload; 575 + } 576 + 577 + __hidden 578 + void scx_task_free(struct task_struct *p) 579 + { 580 + struct scx_task_map_val *mval; 581 + 582 + scx_arena_subprog_init(); 583 + 584 + mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); 585 + if (!mval) 586 + return; 587 + 588 + bpf_spin_lock(&alloc_lock); 589 + scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx); 590 + bpf_spin_unlock(&alloc_lock); 591 + 592 + 
bpf_task_storage_delete(&scx_task_map, p); 593 + } 594 + 595 + static inline void 596 + scx_stat_global_update(struct scx_stats __arena *stats) 597 + { 598 + cast_kern(stats); 599 + __sync_fetch_and_add(&stat_enqueue, stats->enqueue); 600 + __sync_fetch_and_add(&stat_init, stats->init); 601 + __sync_fetch_and_add(&stat_exit, stats->exit); 602 + __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu); 603 + __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu); 604 + } 605 + 606 + s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) 607 + { 608 + struct scx_stats __arena *stats; 609 + bool is_idle = false; 610 + s32 cpu; 611 + 612 + stats = scx_task_data(p); 613 + if (!stats) { 614 + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 615 + return 0; 616 + } 617 + 618 + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); 619 + if (is_idle) { 620 + stat_inc_select_idle_cpu(stats); 621 + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); 622 + } else { 623 + stat_inc_select_busy_cpu(stats); 624 + } 625 + 626 + return cpu; 627 + } 628 + 629 + void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags) 630 + { 631 + struct scx_stats __arena *stats; 632 + 633 + stats = scx_task_data(p); 634 + if (!stats) { 635 + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 636 + return; 637 + } 638 + 639 + stat_inc_enqueue(stats); 640 + 641 + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); 642 + } 643 + 644 + void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev) 645 + { 646 + scx_bpf_dsq_move_to_local(SHARED_DSQ); 647 + } 648 + 649 + s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p, 650 + struct scx_init_task_args *args) 651 + { 652 + struct scx_stats __arena *stats; 653 + 654 + stats = scx_task_alloc(p); 655 + if (!stats) { 656 + scx_bpf_error("arena allocator out of memory"); 657 + return -ENOMEM; 658 + } 659 + 660 + stats->pid = p->pid; 661 + 662 + stat_inc_init(stats); 663 + 664 + return 0; 665 + } 666 + 667 + void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p, 668 + struct scx_exit_task_args *args) 669 + { 670 + struct scx_stats __arena *stats; 671 + 672 + stats = scx_task_data(p); 673 + if (!stats) { 674 + scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 675 + return; 676 + } 677 + 678 + stat_inc_exit(stats); 679 + scx_stat_global_update(stats); 680 + 681 + scx_task_free(p); 682 + } 683 + 684 + s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) 685 + { 686 + int ret; 687 + 688 + ret = scx_task_init(sizeof(struct scx_stats)); 689 + if (ret < 0) { 690 + scx_bpf_error("%s: failed with %d", __func__, ret); 691 + return ret; 692 + } 693 + 694 + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 695 + if (ret) { 696 + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); 697 + return ret; 698 + } 699 + 700 + return 0; 701 + } 702 + 703 + void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) 704 + { 705 + UEI_RECORD(uei, ei); 706 + } 707 + 708 + SCX_OPS_DEFINE(sdt_ops, 709 + .select_cpu = (void *)sdt_select_cpu, 710 + .enqueue = (void *)sdt_enqueue, 711 + .dispatch = (void *)sdt_dispatch, 712 + .init_task = (void *)sdt_init_task, 713 + .exit_task = (void *)sdt_exit_task, 714 + .init = (void *)sdt_init, 715 + .exit = (void *)sdt_exit, 716 + .name = "sdt");
+101
tools/sched_ext/scx_sdt.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com> 5 + * Copyright (c) 2024 Tejun Heo <tj@kernel.org> 6 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 7 + */ 8 + #include <stdio.h> 9 + #include <unistd.h> 10 + #include <signal.h> 11 + #include <libgen.h> 12 + #include <bpf/bpf.h> 13 + #include <scx/common.h> 14 + 15 + #include "scx_sdt.h" 16 + #include "scx_sdt.bpf.skel.h" 17 + 18 + const char help_fmt[] = 19 + "A simple arena-based sched_ext scheduler.\n" 20 + "\n" 21 + "Modified version of scx_simple that demonstrates arena-based data structures.\n" 22 + "\n" 23 + "Usage: %s [-f] [-v]\n" 24 + "\n" 25 + " -v Print libbpf debug messages\n" 26 + " -h Display this help and exit\n"; 27 + 28 + static bool verbose; 29 + static volatile int exit_req; 30 + 31 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 32 + { 33 + if (level == LIBBPF_DEBUG && !verbose) 34 + return 0; 35 + return vfprintf(stderr, format, args); 36 + } 37 + 38 + static void sigint_handler(int sig) 39 + { 40 + exit_req = 1; 41 + } 42 + 43 + int main(int argc, char **argv) 44 + { 45 + struct scx_sdt *skel; 46 + struct bpf_link *link; 47 + __u32 opt; 48 + __u64 ecode; 49 + 50 + libbpf_set_print(libbpf_print_fn); 51 + signal(SIGINT, sigint_handler); 52 + signal(SIGTERM, sigint_handler); 53 + restart: 54 + skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); 55 + 56 + while ((opt = getopt(argc, argv, "fvh")) != -1) { 57 + switch (opt) { 58 + case 'v': 59 + verbose = true; 60 + break; 61 + default: 62 + fprintf(stderr, help_fmt, basename(argv[0])); 63 + return opt != 'h'; 64 + } 65 + } 66 + 67 + SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei); 68 + link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt); 69 + 70 + while (!exit_req && !UEI_EXITED(skel, uei)) { 71 + printf("====SCHEDULING STATS====\n"); 72 + printf("enqueues=%llu\t", skel->bss->stat_enqueue); 73 + printf("inits=%llu\t", skel->bss->stat_init); 74 + printf("exits=%llu\t", skel->bss->stat_exit); 75 + printf("\n"); 76 + 77 + printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu); 78 + printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu); 79 + printf("\n"); 80 + 81 + printf("====ALLOCATION STATS====\n"); 82 + printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs); 83 + printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs); 84 + printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops); 85 + printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops); 86 + printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs); 87 + printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used); 88 + printf("\n\n"); 89 + 90 + fflush(stdout); 91 + sleep(1); 92 + } 93 + 94 + bpf_link__destroy(link); 95 + ecode = UEI_REPORT(skel, uei); 96 + scx_sdt__destroy(skel); 97 + 98 + if (UEI_ECODE_RESTART(ecode)) 99 + goto restart; 100 + return 0; 101 + }
+113
tools/sched_ext/scx_sdt.h
··· 1 + /* 2 + * SPDX-License-Identifier: GPL-2.0 3 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com> 6 + */ 7 + #pragma once 8 + 9 + #ifndef __BPF__ 10 + #define __arena 11 + #endif /* __BPF__ */ 12 + 13 + struct scx_alloc_stats { 14 + __u64 chunk_allocs; 15 + __u64 data_allocs; 16 + __u64 alloc_ops; 17 + __u64 free_ops; 18 + __u64 active_allocs; 19 + __u64 arena_pages_used; 20 + }; 21 + 22 + struct sdt_pool { 23 + void __arena *slab; 24 + __u64 elem_size; 25 + __u64 max_elems; 26 + __u64 idx; 27 + }; 28 + 29 + #ifndef div_round_up 30 + #define div_round_up(a, b) (((a) + (b) - 1) / (b)) 31 + #endif 32 + 33 + #ifndef round_up 34 + #define round_up(a, b) (div_round_up((a), (b)) * (b)) 35 + #endif 36 + 37 + typedef struct sdt_desc __arena sdt_desc_t; 38 + 39 + enum sdt_consts { 40 + SDT_TASK_ENTS_PER_PAGE_SHIFT = 9, 41 + SDT_TASK_LEVELS = 3, 42 + SDT_TASK_ENTS_PER_CHUNK = 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT, 43 + SDT_TASK_CHUNK_BITMAP_U64S = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64), 44 + SDT_TASK_MIN_ELEM_PER_ALLOC = 8, 45 + }; 46 + 47 + union sdt_id { 48 + __s64 val; 49 + struct { 50 + __s32 idx; /* index in the radix tree */ 51 + __s32 genn; /* ++'d on recycle so that it forms unique'ish 64bit ID */ 52 + }; 53 + }; 54 + 55 + struct sdt_chunk; 56 + 57 + /* 58 + * Each index page is described by the following descriptor which carries the 59 + * bitmap. This way the actual index can host power-of-two numbers of entries 60 + * which makes indexing cheaper. 61 + */ 62 + struct sdt_desc { 63 + __u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S]; 64 + __u64 nr_free; 65 + struct sdt_chunk __arena *chunk; 66 + }; 67 + 68 + /* 69 + * Leaf node containing per-task data. 70 + */ 71 + struct sdt_data { 72 + union sdt_id tid; 73 + __u64 payload[]; 74 + }; 75 + 76 + /* 77 + * Intermediate node pointing to another intermediate node or leaf node. 78 + */ 79 + struct sdt_chunk { 80 + union { 81 + sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK]; 82 + struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK]; 83 + }; 84 + }; 85 + 86 + struct scx_allocator { 87 + struct sdt_pool pool; 88 + sdt_desc_t *root; 89 + }; 90 + 91 + struct scx_stats { 92 + int seq; 93 + pid_t pid; 94 + __u64 enqueue; 95 + __u64 exit; 96 + __u64 init; 97 + __u64 select_busy_cpu; 98 + __u64 select_idle_cpu; 99 + }; 100 + 101 + #ifdef __BPF__ 102 + 103 + void __arena *scx_task_data(struct task_struct *p); 104 + int scx_task_init(__u64 data_size); 105 + void __arena *scx_task_alloc(struct task_struct *p); 106 + void scx_task_free(struct task_struct *p); 107 + void scx_arena_subprog_init(void); 108 + 109 + int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size); 110 + u64 scx_alloc_internal(struct scx_allocator *alloc); 111 + int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx); 112 + 113 + #endif /* __BPF__ */
+9 -1
tools/sched_ext/scx_simple.bpf.c
··· 131 131 132 132 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) 133 133 { 134 - return scx_bpf_create_dsq(SHARED_DSQ, -1); 134 + int ret; 135 + 136 + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 137 + if (ret) { 138 + scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); 139 + return ret; 140 + } 141 + 142 + return 0; 135 143 } 136 144 137 145 void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+344
tools/sched_ext/scx_userland.bpf.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * A minimal userland scheduler. 4 + * 5 + * In terms of scheduling, this provides two different types of behaviors: 6 + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. 7 + * All such tasks are direct-dispatched from the kernel, and are never 8 + * enqueued in user space. 9 + * 2. A primitive vruntime scheduler that is implemented in user space, for all 10 + * other tasks. 11 + * 12 + * Some parts of this example user space scheduler could be implemented more 13 + * efficiently using more complex and sophisticated data structures. For 14 + * example, rather than using BPF_MAP_TYPE_QUEUE's, 15 + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between 16 + * user space and kernel space. Similarly, we use a simple vruntime-sorted list 17 + * in user space, but an rbtree could be used instead. 18 + * 19 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 20 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 21 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 22 + */ 23 + #include <scx/common.bpf.h> 24 + #include "scx_userland.h" 25 + 26 + /* 27 + * Maximum number of tasks enqueued/dispatched between kernel and user-space. 28 + */ 29 + #define MAX_ENQUEUED_TASKS 4096 30 + 31 + char _license[] SEC("license") = "GPL"; 32 + 33 + const volatile s32 usersched_pid; 34 + 35 + /* !0 for veristat, set during init */ 36 + const volatile u32 num_possible_cpus = 64; 37 + 38 + /* Stats that are printed by user space. */ 39 + u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; 40 + 41 + /* 42 + * Number of tasks that are queued for scheduling. 43 + * 44 + * This number is incremented by the BPF component when a task is queued to the 45 + * user-space scheduler and it must be decremented by the user-space scheduler 46 + * when a task is consumed. 47 + */ 48 + volatile u64 nr_queued; 49 + 50 + /* 51 + * Number of tasks that are waiting for scheduling. 52 + * 53 + * This number must be updated by the user-space scheduler to keep track of 54 + * whether there is still some scheduling work to do. 55 + */ 56 + volatile u64 nr_scheduled; 57 + 58 + UEI_DEFINE(uei); 59 + 60 + /* 61 + * The map containing tasks that are enqueued in user space from the kernel. 62 + * 63 + * This map is drained by the user space scheduler. 64 + */ 65 + struct { 66 + __uint(type, BPF_MAP_TYPE_QUEUE); 67 + __uint(max_entries, MAX_ENQUEUED_TASKS); 68 + __type(value, struct scx_userland_enqueued_task); 69 + } enqueued SEC(".maps"); 70 + 71 + /* 72 + * The map containing tasks that are dispatched to the kernel from user space. 73 + * 74 + * Drained by the kernel in userland_dispatch(). 75 + */ 76 + struct { 77 + __uint(type, BPF_MAP_TYPE_QUEUE); 78 + __uint(max_entries, MAX_ENQUEUED_TASKS); 79 + __type(value, s32); 80 + } dispatched SEC(".maps"); 81 + 82 + /* Per-task scheduling context */ 83 + struct task_ctx { 84 + bool force_local; /* Dispatch directly to local DSQ */ 85 + }; 86 + 87 + /* Map that contains task-local storage. */ 88 + struct { 89 + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 90 + __uint(map_flags, BPF_F_NO_PREALLOC); 91 + __type(key, int); 92 + __type(value, struct task_ctx); 93 + } task_ctx_stor SEC(".maps"); 94 + 95 + /* 96 + * Flag used to wake up the user-space scheduler. 97 + */ 98 + static volatile u32 usersched_needed; 99 + 100 + /* 101 + * Set user-space scheduler wake-up flag (equivalent to an atomic release 102 + * operation). 
103 + */ 104 + static void set_usersched_needed(void) 105 + { 106 + __sync_fetch_and_or(&usersched_needed, 1); 107 + } 108 + 109 + /* 110 + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic 111 + * acquire operation). 112 + */ 113 + static bool test_and_clear_usersched_needed(void) 114 + { 115 + return __sync_fetch_and_and(&usersched_needed, 0) == 1; 116 + } 117 + 118 + static bool is_usersched_task(const struct task_struct *p) 119 + { 120 + return p->pid == usersched_pid; 121 + } 122 + 123 + static bool keep_in_kernel(const struct task_struct *p) 124 + { 125 + return p->nr_cpus_allowed < num_possible_cpus; 126 + } 127 + 128 + static struct task_struct *usersched_task(void) 129 + { 130 + struct task_struct *p; 131 + 132 + p = bpf_task_from_pid(usersched_pid); 133 + /* 134 + * Should never happen -- the usersched task should always be managed 135 + * by sched_ext. 136 + */ 137 + if (!p) 138 + scx_bpf_error("Failed to find usersched task %d", usersched_pid); 139 + 140 + return p; 141 + } 142 + 143 + s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, 144 + s32 prev_cpu, u64 wake_flags) 145 + { 146 + if (keep_in_kernel(p)) { 147 + s32 cpu; 148 + struct task_ctx *tctx; 149 + 150 + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 151 + if (!tctx) { 152 + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); 153 + return -ESRCH; 154 + } 155 + 156 + if (p->nr_cpus_allowed == 1 || 157 + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { 158 + tctx->force_local = true; 159 + return prev_cpu; 160 + } 161 + 162 + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); 163 + if (cpu >= 0) { 164 + tctx->force_local = true; 165 + return cpu; 166 + } 167 + } 168 + 169 + return prev_cpu; 170 + } 171 + 172 + static void dispatch_user_scheduler(void) 173 + { 174 + struct task_struct *p; 175 + 176 + p = usersched_task(); 177 + if (p) { 178 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 179 + bpf_task_release(p); 180 + } 181 + } 182 + 183 + static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) 184 + { 185 + struct scx_userland_enqueued_task task = {}; 186 + 187 + task.pid = p->pid; 188 + task.sum_exec_runtime = p->se.sum_exec_runtime; 189 + task.weight = p->scx.weight; 190 + 191 + if (bpf_map_push_elem(&enqueued, &task, 0)) { 192 + /* 193 + * If we fail to enqueue the task in user space, put it 194 + * directly on the global DSQ. 
195 + */ 196 + __sync_fetch_and_add(&nr_failed_enqueues, 1); 197 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 198 + } else { 199 + __sync_fetch_and_add(&nr_user_enqueues, 1); 200 + set_usersched_needed(); 201 + } 202 + } 203 + 204 + void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) 205 + { 206 + if (keep_in_kernel(p)) { 207 + u64 dsq_id = SCX_DSQ_GLOBAL; 208 + struct task_ctx *tctx; 209 + 210 + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); 211 + if (!tctx) { 212 + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); 213 + return; 214 + } 215 + 216 + if (tctx->force_local) 217 + dsq_id = SCX_DSQ_LOCAL; 218 + tctx->force_local = false; 219 + scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags); 220 + __sync_fetch_and_add(&nr_kernel_enqueues, 1); 221 + return; 222 + } else if (!is_usersched_task(p)) { 223 + enqueue_task_in_user_space(p, enq_flags); 224 + } 225 + } 226 + 227 + void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) 228 + { 229 + if (test_and_clear_usersched_needed()) 230 + dispatch_user_scheduler(); 231 + 232 + bpf_repeat(MAX_ENQUEUED_TASKS) { 233 + s32 pid; 234 + struct task_struct *p; 235 + 236 + if (bpf_map_pop_elem(&dispatched, &pid)) 237 + break; 238 + 239 + /* 240 + * The task could have exited by the time we get around to 241 + * dispatching it. Treat this as a normal occurrence, and simply 242 + * move onto the next iteration. 243 + */ 244 + p = bpf_task_from_pid(pid); 245 + if (!p) 246 + continue; 247 + 248 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 249 + bpf_task_release(p); 250 + } 251 + } 252 + 253 + /* 254 + * A CPU is about to change its idle state. If the CPU is going idle, ensure 255 + * that the user-space scheduler has a chance to run if there is any remaining 256 + * work to do. 257 + */ 258 + void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) 259 + { 260 + /* 261 + * Don't do anything if we exit from an idle state; a CPU owner will 262 + * be assigned in .running(). 263 + */ 264 + if (!idle) 265 + return; 266 + /* 267 + * A CPU is now available; notify the user-space scheduler that tasks 268 + * can be dispatched, if there is at least one task waiting to be 269 + * scheduled, either queued (accounted in nr_queued) or scheduled 270 + * (accounted in nr_scheduled). 271 + * 272 + * NOTE: nr_queued is incremented by the BPF component, more exactly in 273 + * enqueue(), when a task is sent to the user-space scheduler, then 274 + * the scheduler drains the queued tasks (updating nr_queued) and adds 275 + * them to its internal data structures / state; at this point tasks 276 + * become "scheduled" and the user-space scheduler will take care of 277 + * updating nr_scheduled accordingly; lastly tasks will be dispatched 278 + * and the user-space scheduler will update nr_scheduled again. 279 + * 280 + * Checking both counters allows us to determine if there is still some 281 + * pending work to do for the scheduler: new tasks have been queued 282 + * since the last check, or there are still tasks "queued" or "scheduled" 283 + * since the previous user-space scheduler run. If the counters are 284 + * both zero it is pointless to wake up the scheduler (even if a CPU 285 + * becomes idle), because there is nothing to do. 
286 + * 287 + * Keep in mind that update_idle() doesn't run concurrently with the 288 + * user-space scheduler (that is single-threaded): this function is 289 + * naturally serialized with the user-space scheduler code, therefore 290 + * this check here is also safe from a concurrency perspective. 291 + */ 292 + if (nr_queued || nr_scheduled) { 293 + /* 294 + * Kick the CPU to make it immediately ready to accept 295 + * dispatched tasks. 296 + */ 297 + set_usersched_needed(); 298 + scx_bpf_kick_cpu(cpu, 0); 299 + } 300 + } 301 + 302 + s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, 303 + struct scx_init_task_args *args) 304 + { 305 + if (bpf_task_storage_get(&task_ctx_stor, p, 0, 306 + BPF_LOCAL_STORAGE_GET_F_CREATE)) 307 + return 0; 308 + else 309 + return -ENOMEM; 310 + } 311 + 312 + s32 BPF_STRUCT_OPS(userland_init) 313 + { 314 + if (num_possible_cpus == 0) { 315 + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", 316 + num_possible_cpus); 317 + return -EINVAL; 318 + } 319 + 320 + if (usersched_pid <= 0) { 321 + scx_bpf_error("User scheduler pid uninitialized (%d)", 322 + usersched_pid); 323 + return -EINVAL; 324 + } 325 + 326 + return 0; 327 + } 328 + 329 + void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) 330 + { 331 + UEI_RECORD(uei, ei); 332 + } 333 + 334 + SCX_OPS_DEFINE(userland_ops, 335 + .select_cpu = (void *)userland_select_cpu, 336 + .enqueue = (void *)userland_enqueue, 337 + .dispatch = (void *)userland_dispatch, 338 + .update_idle = (void *)userland_update_idle, 339 + .init_task = (void *)userland_init_task, 340 + .init = (void *)userland_init, 341 + .exit = (void *)userland_exit, 342 + .flags = SCX_OPS_ENQ_LAST | 343 + SCX_OPS_KEEP_BUILTIN_IDLE, 344 + .name = "userland");
+437
tools/sched_ext/scx_userland.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * A demo sched_ext user space scheduler which provides vruntime semantics 4 + * using a simple ordered-list implementation. 5 + * 6 + * Each CPU in the system resides in a single, global domain. This precludes 7 + * the need to do any load balancing between domains. The scheduler could 8 + * easily be extended to support multiple domains, with load balancing 9 + * happening in user space. 10 + * 11 + * Any task which has any CPU affinity is scheduled entirely in BPF. This 12 + * program only schedules tasks which may run on any CPU. 13 + * 14 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 15 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 16 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 17 + */ 18 + #include <stdio.h> 19 + #include <unistd.h> 20 + #include <sched.h> 21 + #include <signal.h> 22 + #include <assert.h> 23 + #include <libgen.h> 24 + #include <pthread.h> 25 + #include <bpf/bpf.h> 26 + #include <sys/mman.h> 27 + #include <sys/queue.h> 28 + #include <sys/syscall.h> 29 + 30 + #include <scx/common.h> 31 + #include "scx_userland.h" 32 + #include "scx_userland.bpf.skel.h" 33 + 34 + const char help_fmt[] = 35 + "A minimal userland sched_ext scheduler.\n" 36 + "\n" 37 + "See the top-level comment in .bpf.c for more details.\n" 38 + "\n" 39 + "Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" 40 + "\n" 41 + "Usage: %s [-b BATCH]\n" 42 + "\n" 43 + " -b BATCH The number of tasks to batch when dispatching (default: 8)\n" 44 + " -v Print libbpf debug messages\n" 45 + " -h Display this help and exit\n"; 46 + 47 + /* Defined in UAPI */ 48 + #define SCHED_EXT 7 49 + 50 + /* Number of tasks to batch when dispatching to user space. */ 51 + static __u32 batch_size = 8; 52 + 53 + static bool verbose; 54 + static volatile int exit_req; 55 + static int enqueued_fd, dispatched_fd; 56 + 57 + static struct scx_userland *skel; 58 + static struct bpf_link *ops_link; 59 + 60 + /* Stats collected in user space. */ 61 + static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; 62 + 63 + /* Number of tasks currently enqueued. */ 64 + static __u64 nr_curr_enqueued; 65 + 66 + /* The data structure containing tasks that are enqueued in user space. */ 67 + struct enqueued_task { 68 + LIST_ENTRY(enqueued_task) entries; 69 + __u64 sum_exec_runtime; 70 + double vruntime; 71 + }; 72 + 73 + /* 74 + * Use a vruntime-sorted list to store tasks. This could easily be extended to 75 + * a more optimal data structure, such as an rbtree as is done in CFS. We 76 + * currently elect to use a sorted list to simplify the example for 77 + * illustrative purposes. 78 + */ 79 + LIST_HEAD(listhead, enqueued_task); 80 + 81 + /* 82 + * A vruntime-sorted list of tasks. The head of the list contains the task with 83 + * the lowest vruntime. That is, the task that has the "highest" claim to be 84 + * scheduled. 85 + */ 86 + static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); 87 + 88 + /* 89 + * The main array of tasks. The array is allocated all at once during 90 + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to 91 + * dynamically allocate memory on the enqueue path, which could cause a 92 + * deadlock. A more substantive user space scheduler could e.g. provide a hook 93 + * for newly enabled tasks that are passed to the scheduler from the 94 + * .prep_enable() callback to allow the scheduler to allocate on safe paths. 
95 + */ 96 + struct enqueued_task *tasks; 97 + static int pid_max; 98 + 99 + static double min_vruntime; 100 + 101 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 102 + { 103 + if (level == LIBBPF_DEBUG && !verbose) 104 + return 0; 105 + return vfprintf(stderr, format, args); 106 + } 107 + 108 + static void sigint_handler(int userland) 109 + { 110 + exit_req = 1; 111 + } 112 + 113 + static int get_pid_max(void) 114 + { 115 + FILE *fp; 116 + int pid_max; 117 + 118 + fp = fopen("/proc/sys/kernel/pid_max", "r"); 119 + if (fp == NULL) { 120 + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); 121 + return -1; 122 + } 123 + if (fscanf(fp, "%d", &pid_max) != 1) { 124 + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); 125 + fclose(fp); 126 + return -1; 127 + } 128 + fclose(fp); 129 + 130 + return pid_max; 131 + } 132 + 133 + static int init_tasks(void) 134 + { 135 + pid_max = get_pid_max(); 136 + if (pid_max < 0) 137 + return pid_max; 138 + 139 + tasks = calloc(pid_max, sizeof(*tasks)); 140 + if (!tasks) { 141 + fprintf(stderr, "Error allocating tasks array\n"); 142 + return -ENOMEM; 143 + } 144 + 145 + return 0; 146 + } 147 + 148 + static __u32 task_pid(const struct enqueued_task *task) 149 + { 150 + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); 151 + } 152 + 153 + static int dispatch_task(__s32 pid) 154 + { 155 + int err; 156 + 157 + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); 158 + if (err) { 159 + nr_vruntime_failed++; 160 + } else { 161 + nr_vruntime_dispatches++; 162 + } 163 + 164 + return err; 165 + } 166 + 167 + static struct enqueued_task *get_enqueued_task(__s32 pid) 168 + { 169 + if (pid >= pid_max) 170 + return NULL; 171 + 172 + return &tasks[pid]; 173 + } 174 + 175 + static double calc_vruntime_delta(__u64 weight, __u64 delta) 176 + { 177 + double weight_f = (double)weight / 100.0; 178 + double delta_f = (double)delta; 179 + 180 + return delta_f / weight_f; 181 + } 182 + 183 + static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) 184 + { 185 + __u64 delta; 186 + 187 + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; 188 + 189 + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); 190 + if (min_vruntime > enqueued->vruntime) 191 + enqueued->vruntime = min_vruntime; 192 + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; 193 + } 194 + 195 + static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) 196 + { 197 + struct enqueued_task *curr, *enqueued, *prev; 198 + 199 + curr = get_enqueued_task(bpf_task->pid); 200 + if (!curr) 201 + return ENOENT; 202 + 203 + update_enqueued(curr, bpf_task); 204 + nr_vruntime_enqueues++; 205 + nr_curr_enqueued++; 206 + 207 + /* 208 + * Enqueue the task in a vruntime-sorted list. A more optimal data 209 + * structure such as an rbtree could easily be used as well. We elect 210 + * to use a list here simply because it's less code, and thus the 211 + * example is less convoluted and better serves to illustrate what a 212 + * user space scheduler could look like. 
213 + */ 214 + 215 + if (LIST_EMPTY(&vruntime_head)) { 216 + LIST_INSERT_HEAD(&vruntime_head, curr, entries); 217 + return 0; 218 + } 219 + 220 + LIST_FOREACH(enqueued, &vruntime_head, entries) { 221 + if (curr->vruntime <= enqueued->vruntime) { 222 + LIST_INSERT_BEFORE(enqueued, curr, entries); 223 + return 0; 224 + } 225 + prev = enqueued; 226 + } 227 + 228 + LIST_INSERT_AFTER(prev, curr, entries); 229 + 230 + return 0; 231 + } 232 + 233 + static void drain_enqueued_map(void) 234 + { 235 + while (1) { 236 + struct scx_userland_enqueued_task task; 237 + int err; 238 + 239 + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { 240 + skel->bss->nr_queued = 0; 241 + skel->bss->nr_scheduled = nr_curr_enqueued; 242 + return; 243 + } 244 + 245 + err = vruntime_enqueue(&task); 246 + if (err) { 247 + fprintf(stderr, "Failed to enqueue task %d: %s\n", 248 + task.pid, strerror(err)); 249 + exit_req = 1; 250 + return; 251 + } 252 + } 253 + } 254 + 255 + static void dispatch_batch(void) 256 + { 257 + __u32 i; 258 + 259 + for (i = 0; i < batch_size; i++) { 260 + struct enqueued_task *task; 261 + int err; 262 + __s32 pid; 263 + 264 + task = LIST_FIRST(&vruntime_head); 265 + if (!task) 266 + break; 267 + 268 + min_vruntime = task->vruntime; 269 + pid = task_pid(task); 270 + LIST_REMOVE(task, entries); 271 + err = dispatch_task(pid); 272 + if (err) { 273 + /* 274 + * If we fail to dispatch, put the task back to the 275 + * vruntime_head list and stop dispatching additional 276 + * tasks in this batch. 277 + */ 278 + LIST_INSERT_HEAD(&vruntime_head, task, entries); 279 + break; 280 + } 281 + nr_curr_enqueued--; 282 + } 283 + skel->bss->nr_scheduled = nr_curr_enqueued; 284 + } 285 + 286 + static void *run_stats_printer(void *arg) 287 + { 288 + while (!exit_req) { 289 + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; 290 + 291 + nr_failed_enqueues = skel->bss->nr_failed_enqueues; 292 + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; 293 + nr_user_enqueues = skel->bss->nr_user_enqueues; 294 + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; 295 + 296 + printf("o-----------------------o\n"); 297 + printf("| BPF ENQUEUES |\n"); 298 + printf("|-----------------------|\n"); 299 + printf("| kern: %10llu |\n", nr_kernel_enqueues); 300 + printf("| user: %10llu |\n", nr_user_enqueues); 301 + printf("| failed: %10llu |\n", nr_failed_enqueues); 302 + printf("| -------------------- |\n"); 303 + printf("| total: %10llu |\n", total); 304 + printf("| |\n"); 305 + printf("|-----------------------|\n"); 306 + printf("| VRUNTIME / USER |\n"); 307 + printf("|-----------------------|\n"); 308 + printf("| enq: %10llu |\n", nr_vruntime_enqueues); 309 + printf("| disp: %10llu |\n", nr_vruntime_dispatches); 310 + printf("| failed: %10llu |\n", nr_vruntime_failed); 311 + printf("o-----------------------o\n"); 312 + printf("\n\n"); 313 + fflush(stdout); 314 + sleep(1); 315 + } 316 + 317 + return NULL; 318 + } 319 + 320 + static int spawn_stats_thread(void) 321 + { 322 + pthread_t stats_printer; 323 + 324 + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); 325 + } 326 + 327 + static void pre_bootstrap(int argc, char **argv) 328 + { 329 + int err; 330 + __u32 opt; 331 + struct sched_param sched_param = { 332 + .sched_priority = sched_get_priority_max(SCHED_EXT), 333 + }; 334 + 335 + err = init_tasks(); 336 + if (err) 337 + exit(err); 338 + 339 + libbpf_set_print(libbpf_print_fn); 340 + signal(SIGINT, sigint_handler); 341 + signal(SIGTERM, sigint_handler); 342 
+ 343 + /* 344 + * Enforce that the user scheduler task is managed by sched_ext. The 345 + * task eagerly drains the list of enqueued tasks in its main work 346 + * loop, and then yields the CPU. The BPF scheduler only schedules the 347 + * user space scheduler task when at least one other task in the system 348 + * needs to be scheduled. 349 + */ 350 + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); 351 + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); 352 + 353 + while ((opt = getopt(argc, argv, "b:vh")) != -1) { 354 + switch (opt) { 355 + case 'b': 356 + batch_size = strtoul(optarg, NULL, 0); 357 + break; 358 + case 'v': 359 + verbose = true; 360 + break; 361 + default: 362 + fprintf(stderr, help_fmt, basename(argv[0])); 363 + exit(opt != 'h'); 364 + } 365 + } 366 + 367 + /* 368 + * It's not always safe to allocate in a user space scheduler, as an 369 + * enqueued task could hold a lock that we require in order to be able 370 + * to allocate. 371 + */ 372 + err = mlockall(MCL_CURRENT | MCL_FUTURE); 373 + SCX_BUG_ON(err, "Failed to prefault and lock address space"); 374 + } 375 + 376 + static void bootstrap(char *comm) 377 + { 378 + skel = SCX_OPS_OPEN(userland_ops, scx_userland); 379 + 380 + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); 381 + assert(skel->rodata->num_possible_cpus > 0); 382 + skel->rodata->usersched_pid = getpid(); 383 + assert(skel->rodata->usersched_pid > 0); 384 + 385 + SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei); 386 + 387 + enqueued_fd = bpf_map__fd(skel->maps.enqueued); 388 + dispatched_fd = bpf_map__fd(skel->maps.dispatched); 389 + assert(enqueued_fd > 0); 390 + assert(dispatched_fd > 0); 391 + 392 + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); 393 + 394 + ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland); 395 + } 396 + 397 + static void sched_main_loop(void) 398 + { 399 + while (!exit_req) { 400 + /* 401 + * Perform the following work in the main user space scheduler 402 + * loop: 403 + * 404 + * 1. Drain all tasks from the enqueued map, and enqueue them 405 + * to the vruntime sorted list. 406 + * 407 + * 2. Dispatch a batch of tasks from the vruntime sorted list 408 + * down to the kernel. 409 + * 410 + * 3. Yield the CPU back to the system. The BPF scheduler will 411 + * reschedule the user space scheduler once another task has 412 + * been enqueued to user space. 413 + */ 414 + drain_enqueued_map(); 415 + dispatch_batch(); 416 + sched_yield(); 417 + } 418 + } 419 + 420 + int main(int argc, char **argv) 421 + { 422 + __u64 ecode; 423 + 424 + pre_bootstrap(argc, argv); 425 + restart: 426 + bootstrap(argv[0]); 427 + sched_main_loop(); 428 + 429 + exit_req = 1; 430 + bpf_link__destroy(ops_link); 431 + ecode = UEI_REPORT(skel, uei); 432 + scx_userland__destroy(skel); 433 + 434 + if (UEI_ECODE_RESTART(ecode)) 435 + goto restart; 436 + return 0; 437 + }
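The weighting in calc_vruntime_delta() above normalizes against the default sched_ext weight of 100: a default-weight task accrues vruntime at wall-clock rate, a weight-200 task at half rate (so it is picked more often), and a weight-50 task at double rate. The min_vruntime clamp in update_enqueued() keeps a task that slept for a long time from re-entering the list far below everyone else and hogging the CPU when it wakes. A tiny standalone check of the arithmetic, with the helper copied here purely for illustration:

#include <stdio.h>

/* Same arithmetic as calc_vruntime_delta() in scx_userland.c. */
static double calc_vruntime_delta(unsigned long long weight, unsigned long long delta)
{
	return (double)delta / ((double)weight / 100.0);
}

int main(void)
{
	unsigned long long delta = 10000000;	/* 10ms of runtime, in ns */

	printf("weight 100 -> +%.0f ns vruntime\n", calc_vruntime_delta(100, delta));	/* 10000000 */
	printf("weight 200 -> +%.0f ns vruntime\n", calc_vruntime_delta(200, delta));	/* 5000000 */
	printf("weight  50 -> +%.0f ns vruntime\n", calc_vruntime_delta(50, delta));	/* 20000000 */
	return 0;
}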
+17
tools/sched_ext/scx_userland.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2022 Meta, Inc */ 3 + 4 + #ifndef __SCX_USERLAND_COMMON_H 5 + #define __SCX_USERLAND_COMMON_H 6 + 7 + /* 8 + * An instance of a task that has been enqueued by the kernel for consumption 9 + * by a user space global scheduler thread. 10 + */ 11 + struct scx_userland_enqueued_task { 12 + __s32 pid; 13 + u64 sum_exec_runtime; 14 + u64 weight; 15 + }; 16 + 17 + #endif // __SCX_USERLAND_COMMON_H
+23 -11
tools/testing/selftests/sched_ext/init_enable_count.c
··· 4 4 * Copyright (c) 2023 David Vernet <dvernet@meta.com> 5 5 * Copyright (c) 2023 Tejun Heo <tj@kernel.org> 6 6 */ 7 + #include <signal.h> 7 8 #include <stdio.h> 8 9 #include <unistd.h> 9 10 #include <sched.h> ··· 24 23 int ret, i, status; 25 24 struct sched_param param = {}; 26 25 pid_t pids[num_pre_forks]; 26 + int pipe_fds[2]; 27 + 28 + SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe"); 27 29 28 30 skel = init_enable_count__open(); 29 31 SCX_FAIL_IF(!skel, "Failed to open"); ··· 42 38 * ensure (at least in practical terms) that there are more tasks that 43 39 * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that 44 40 * take the fork() path either below or in other processes. 41 + * 42 + * All children will block on read() on the pipe until the parent closes 43 + * the write end after attaching the scheduler, which signals all of 44 + * them to exit simultaneously. Auto-reap so we don't have to wait on 45 + * them. 45 46 */ 47 + signal(SIGCHLD, SIG_IGN); 46 48 for (i = 0; i < num_pre_forks; i++) { 47 - pids[i] = fork(); 48 - SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); 49 - if (pids[i] == 0) { 50 - sleep(1); 49 + pid_t pid = fork(); 50 + 51 + SCX_FAIL_IF(pid < 0, "Failed to fork child"); 52 + if (pid == 0) { 53 + char buf; 54 + 55 + close(pipe_fds[1]); 56 + read(pipe_fds[0], &buf, 1); 57 + close(pipe_fds[0]); 51 58 exit(0); 52 59 } 53 60 } 61 + close(pipe_fds[0]); 54 62 55 63 link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); 56 64 SCX_FAIL_IF(!link, "Failed to attach struct_ops"); 57 65 58 - for (i = 0; i < num_pre_forks; i++) { 59 - SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], 60 - "Failed to wait for pre-forked child\n"); 61 - 62 - SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, 63 - status); 64 - } 66 + /* Signal all pre-forked children to exit. */ 67 + close(pipe_fds[1]); 68 + signal(SIGCHLD, SIG_DFL); 65 69 66 70 bpf_link__destroy(link); 67 71 SCX_GE(skel->bss->init_task_cnt, num_pre_forks);
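The pipe-based synchronization in the fixed selftest is a general fork barrier: every pre-forked child blocks in read() on the read end, and once the parent closes the last write end all of the children see EOF and exit together, with no reliance on sleep() timing. Stripped of the selftest harness, the pattern looks roughly like this (the child count and surrounding structure are illustrative):

#include <signal.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	int i;

	if (pipe(fds) < 0)
		return 1;

	signal(SIGCHLD, SIG_IGN);	/* auto-reap children, as the test does */

	for (i = 0; i < 4; i++) {
		if (fork() == 0) {
			char buf;

			close(fds[1]);		/* child keeps only the read end */
			read(fds[0], &buf, 1);	/* blocks until the parent closes its writer */
			close(fds[0]);
			_exit(0);
		}
	}
	close(fds[0]);

	/* ...work the children must stay alive for; in the selftest, attaching the scheduler... */

	close(fds[1]);	/* EOF releases every child at once */
	return 0;
}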