tools/sched_ext: add scx_pair scheduler

+1 -1

tools/sched_ext/Makefile

··· 189 189 190 190 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) 191 191 192 - c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland 192 + c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair 193 193 194 194 $(addprefix $(BINDIR)/,$(c-sched-targets)): \ 195 195 $(BINDIR)/%: \

+610

tools/sched_ext/scx_pair.bpf.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * A demo sched_ext core-scheduler which always makes every sibling CPU pair 4 + * execute from the same CPU cgroup. 5 + * 6 + * This scheduler is a minimal implementation and would need some form of 7 + * priority handling both inside each cgroup and across the cgroups to be 8 + * practically useful. 9 + * 10 + * Each CPU in the system is paired with exactly one other CPU, according to a 11 + * "stride" value that can be specified when the BPF scheduler program is first 12 + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee 13 + * that they will only ever schedule tasks that belong to the same CPU cgroup. 14 + * 15 + * Scheduler Initialization 16 + * ------------------------ 17 + * 18 + * The scheduler BPF program is first initialized from user space, before it is 19 + * enabled. During this initialization process, each CPU on the system is 20 + * assigned several values that are constant throughout its runtime: 21 + * 22 + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling 23 + * decisions. Paired CPUs always schedule tasks from the same 24 + * CPU cgroup, and synchronize with each other to guarantee 25 + * that this constraint is not violated. 26 + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access 27 + * a struct pair_ctx object that is shared between the pair. 28 + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the 29 + * pair. Each struct pair_ctx has an active_mask field, 30 + * which is a bitmap used to indicate whether each core 31 + * in the pair currently has an actively running task. 32 + * This index specifies which entry in the bitmap corresponds 33 + * to each CPU in the pair. 34 + * 35 + * During this initialization, the CPUs are paired according to a "stride" that 36 + * may be specified when invoking the user space program that initializes and 37 + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. 38 + * 39 + * Tasks and cgroups 40 + * ----------------- 41 + * 42 + * Every cgroup in the system is registered with the scheduler using the 43 + * pair_cgroup_init() callback, and every task in the system is associated with 44 + * exactly one cgroup. At a high level, the idea with the pair scheduler is to 45 + * always schedule tasks from the same cgroup within a given CPU pair. When a 46 + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its 47 + * cgroup ID is read from its task struct, and then a corresponding queue map 48 + * is used to FIFO-enqueue the task for that cgroup. 49 + * 50 + * If you look through the implementation of the scheduler, you'll notice that 51 + * there is quite a bit of complexity involved with looking up the per-cgroup 52 + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash 53 + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's 54 + * allocated in the BPF program. This is done because we use separate maps to 55 + * store the FIFO queue of tasks, and the length of that map, per cgroup. This 56 + * complexity is only present because of current deficiencies in BPF that will 57 + * soon be addressed. The main point to keep in mind is that newly enqueued 58 + * tasks are added to their cgroup's FIFO queue. 59 + * 60 + * Dispatching tasks 61 + * ----------------- 62 + * 63 + * This section will describe how enqueued tasks are dispatched and scheduled. 64 + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is 65 + * as follows: 66 + * 67 + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is 68 + * the structure that's used to synchronize amongst the two pair CPUs in their 69 + * scheduling decisions. After any of the following events have occurred: 70 + * 71 + * - The cgroup's slice run has expired, or 72 + * - The cgroup becomes empty, or 73 + * - Either CPU in the pair is preempted by a higher priority scheduling class 74 + * 75 + * The cgroup transitions to the draining state and stops executing new tasks 76 + * from the cgroup. 77 + * 78 + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and 79 + * wait for the pair CPU to be preempted. 80 + * 81 + * 3. Otherwise, if the pair CPU is not running a task, we can move onto 82 + * scheduling new tasks. Pop the next cgroup id from the top_q queue. 83 + * 84 + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. 85 + * 86 + * Note again that this scheduling behavior is simple, but the implementation 87 + * is complex mostly because this it hits several BPF shortcomings and has to 88 + * work around in often awkward ways. Most of the shortcomings are expected to 89 + * be resolved in the near future which should allow greatly simplifying this 90 + * scheduler. 91 + * 92 + * Dealing with preemption 93 + * ----------------------- 94 + * 95 + * SCX is the lowest priority sched_class, and could be preempted by them at 96 + * any time. To address this, the scheduler implements pair_cpu_release() and 97 + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when 98 + * the scheduler loses and gains control of the CPU respectively. 99 + * 100 + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and 101 + * then invoke: 102 + * 103 + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); 104 + * 105 + * This preempts the pair CPU, and waits until it has re-entered the scheduler 106 + * before returning. This is necessary to ensure that the higher priority 107 + * sched_class that preempted our scheduler does not schedule a task 108 + * concurrently with our pair CPU. 109 + * 110 + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption 111 + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable 112 + * pair scheduling. 113 + * 114 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 115 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 116 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 117 + */ 118 + #include <scx/common.bpf.h> 119 + #include "scx_pair.h" 120 + 121 + char _license[] SEC("license") = "GPL"; 122 + 123 + /* !0 for veristat, set during init */ 124 + const volatile u32 nr_cpu_ids = 1; 125 + 126 + /* a pair of CPUs stay on a cgroup for this duration */ 127 + const volatile u32 pair_batch_dur_ns; 128 + 129 + /* cpu ID -> pair cpu ID */ 130 + const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); 131 + 132 + /* cpu ID -> pair_id */ 133 + const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); 134 + 135 + /* CPU ID -> CPU # in the pair (0 or 1) */ 136 + const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); 137 + 138 + struct pair_ctx { 139 + struct bpf_spin_lock lock; 140 + 141 + /* the cgroup the pair is currently executing */ 142 + u64 cgid; 143 + 144 + /* the pair started executing the current cgroup at */ 145 + u64 started_at; 146 + 147 + /* whether the current cgroup is draining */ 148 + bool draining; 149 + 150 + /* the CPUs that are currently active on the cgroup */ 151 + u32 active_mask; 152 + 153 + /* 154 + * the CPUs that are currently preempted and running tasks in a 155 + * different scheduler. 156 + */ 157 + u32 preempted_mask; 158 + }; 159 + 160 + struct { 161 + __uint(type, BPF_MAP_TYPE_ARRAY); 162 + __type(key, u32); 163 + __type(value, struct pair_ctx); 164 + } pair_ctx SEC(".maps"); 165 + 166 + /* queue of cgrp_q's possibly with tasks on them */ 167 + struct { 168 + __uint(type, BPF_MAP_TYPE_QUEUE); 169 + /* 170 + * Because it's difficult to build strong synchronization encompassing 171 + * multiple non-trivial operations in BPF, this queue is managed in an 172 + * opportunistic way so that we guarantee that a cgroup w/ active tasks 173 + * is always on it but possibly multiple times. Once we have more robust 174 + * synchronization constructs and e.g. linked list, we should be able to 175 + * do this in a prettier way but for now just size it big enough. 176 + */ 177 + __uint(max_entries, 4 * MAX_CGRPS); 178 + __type(value, u64); 179 + } top_q SEC(".maps"); 180 + 181 + /* per-cgroup q which FIFOs the tasks from the cgroup */ 182 + struct cgrp_q { 183 + __uint(type, BPF_MAP_TYPE_QUEUE); 184 + __uint(max_entries, MAX_QUEUED); 185 + __type(value, u32); 186 + }; 187 + 188 + /* 189 + * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local 190 + * storage; however, a cgroup local storage can only be accessed from the BPF 191 + * progs attached to the cgroup. For now, work around by allocating array of 192 + * cgrp_q's and then allocating per-cgroup indices. 193 + * 194 + * Another caveat: It's difficult to populate a large array of maps statically 195 + * or from BPF. Initialize it from userland. 196 + */ 197 + struct { 198 + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); 199 + __uint(max_entries, MAX_CGRPS); 200 + __type(key, s32); 201 + __array(values, struct cgrp_q); 202 + } cgrp_q_arr SEC(".maps"); 203 + 204 + static u64 cgrp_q_len[MAX_CGRPS]; 205 + 206 + /* 207 + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be 208 + * useful to have as a map type. 209 + */ 210 + static u32 cgrp_q_idx_cursor; 211 + static u64 cgrp_q_idx_busy[MAX_CGRPS]; 212 + 213 + /* 214 + * All added up, the following is what we do: 215 + * 216 + * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking 217 + * for a free ID. If not found, fail cgroup creation with -EBUSY. 218 + * 219 + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following 220 + * cgrp_q_idx_hash. 221 + * 222 + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from 223 + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. 224 + * 225 + * This is sadly complicated for something pretty simple. Hopefully, we should 226 + * be able to simplify in the future. 227 + */ 228 + struct { 229 + __uint(type, BPF_MAP_TYPE_HASH); 230 + __uint(max_entries, MAX_CGRPS); 231 + __uint(key_size, sizeof(u64)); /* cgrp ID */ 232 + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ 233 + } cgrp_q_idx_hash SEC(".maps"); 234 + 235 + /* statistics */ 236 + u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; 237 + u64 nr_exps, nr_exp_waits, nr_exp_empty; 238 + u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; 239 + 240 + UEI_DEFINE(uei); 241 + 242 + void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) 243 + { 244 + struct cgroup *cgrp; 245 + struct cgrp_q *cgq; 246 + s32 pid = p->pid; 247 + u64 cgid; 248 + u32 *q_idx; 249 + u64 *cgq_len; 250 + 251 + __sync_fetch_and_add(&nr_total, 1); 252 + 253 + cgrp = scx_bpf_task_cgroup(p); 254 + cgid = cgrp->kn->id; 255 + bpf_cgroup_release(cgrp); 256 + 257 + /* find the cgroup's q and push @p into it */ 258 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 259 + if (!q_idx) { 260 + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); 261 + return; 262 + } 263 + 264 + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); 265 + if (!cgq) { 266 + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", 267 + cgid, *q_idx); 268 + return; 269 + } 270 + 271 + if (bpf_map_push_elem(cgq, &pid, 0)) { 272 + scx_bpf_error("cgroup[%llu] queue overflow", cgid); 273 + return; 274 + } 275 + 276 + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ 277 + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); 278 + if (!cgq_len) { 279 + scx_bpf_error("MEMBER_VTPR malfunction"); 280 + return; 281 + } 282 + 283 + if (!__sync_fetch_and_add(cgq_len, 1) && 284 + bpf_map_push_elem(&top_q, &cgid, 0)) { 285 + scx_bpf_error("top_q overflow"); 286 + return; 287 + } 288 + } 289 + 290 + static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) 291 + { 292 + u32 *vptr; 293 + 294 + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); 295 + if (!vptr) 296 + return -EINVAL; 297 + 298 + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); 299 + if (!(*pairc)) 300 + return -EINVAL; 301 + 302 + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); 303 + if (!vptr) 304 + return -EINVAL; 305 + 306 + *mask = 1U << *vptr; 307 + 308 + return 0; 309 + } 310 + 311 + __attribute__((noinline)) 312 + static int try_dispatch(s32 cpu) 313 + { 314 + struct pair_ctx *pairc; 315 + struct bpf_map *cgq_map; 316 + struct task_struct *p; 317 + u64 now = scx_bpf_now(); 318 + bool kick_pair = false; 319 + bool expired, pair_preempted; 320 + u32 *vptr, in_pair_mask; 321 + s32 pid, q_idx; 322 + u64 cgid; 323 + int ret; 324 + 325 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 326 + if (ret) { 327 + scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", 328 + cpu); 329 + return -ENOENT; 330 + } 331 + 332 + bpf_spin_lock(&pairc->lock); 333 + pairc->active_mask &= ~in_pair_mask; 334 + 335 + expired = time_before(pairc->started_at + pair_batch_dur_ns, now); 336 + if (expired || pairc->draining) { 337 + u64 new_cgid = 0; 338 + 339 + __sync_fetch_and_add(&nr_exps, 1); 340 + 341 + /* 342 + * We're done with the current cgid. An obvious optimization 343 + * would be not draining if the next cgroup is the current one. 344 + * For now, be dumb and always expire. 345 + */ 346 + pairc->draining = true; 347 + 348 + pair_preempted = pairc->preempted_mask; 349 + if (pairc->active_mask || pair_preempted) { 350 + /* 351 + * The other CPU is still active, or is no longer under 352 + * our control due to e.g. being preempted by a higher 353 + * priority sched_class. We want to wait until this 354 + * cgroup expires, or until control of our pair CPU has 355 + * been returned to us. 356 + * 357 + * If the pair controls its CPU, and the time already 358 + * expired, kick. When the other CPU arrives at 359 + * dispatch and clears its active mask, it'll push the 360 + * pair to the next cgroup and kick this CPU. 361 + */ 362 + __sync_fetch_and_add(&nr_exp_waits, 1); 363 + bpf_spin_unlock(&pairc->lock); 364 + if (expired && !pair_preempted) 365 + kick_pair = true; 366 + goto out_maybe_kick; 367 + } 368 + 369 + bpf_spin_unlock(&pairc->lock); 370 + 371 + /* 372 + * Pick the next cgroup. It'd be easier / cleaner to not drop 373 + * pairc->lock and use stronger synchronization here especially 374 + * given that we'll be switching cgroups significantly less 375 + * frequently than tasks. Unfortunately, bpf_spin_lock can't 376 + * really protect anything non-trivial. Let's do opportunistic 377 + * operations instead. 378 + */ 379 + bpf_repeat(BPF_MAX_LOOPS) { 380 + u32 *q_idx; 381 + u64 *cgq_len; 382 + 383 + if (bpf_map_pop_elem(&top_q, &new_cgid)) { 384 + /* no active cgroup, go idle */ 385 + __sync_fetch_and_add(&nr_exp_empty, 1); 386 + return 0; 387 + } 388 + 389 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); 390 + if (!q_idx) 391 + continue; 392 + 393 + /* 394 + * This is the only place where empty cgroups are taken 395 + * off the top_q. 396 + */ 397 + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); 398 + if (!cgq_len || !*cgq_len) 399 + continue; 400 + 401 + /* 402 + * If it has any tasks, requeue as we may race and not 403 + * execute it. 404 + */ 405 + bpf_map_push_elem(&top_q, &new_cgid, 0); 406 + break; 407 + } 408 + 409 + bpf_spin_lock(&pairc->lock); 410 + 411 + /* 412 + * The other CPU may already have started on a new cgroup while 413 + * we dropped the lock. Make sure that we're still draining and 414 + * start on the new cgroup. 415 + */ 416 + if (pairc->draining && !pairc->active_mask) { 417 + __sync_fetch_and_add(&nr_cgrp_next, 1); 418 + pairc->cgid = new_cgid; 419 + pairc->started_at = now; 420 + pairc->draining = false; 421 + kick_pair = true; 422 + } else { 423 + __sync_fetch_and_add(&nr_cgrp_coll, 1); 424 + } 425 + } 426 + 427 + cgid = pairc->cgid; 428 + pairc->active_mask |= in_pair_mask; 429 + bpf_spin_unlock(&pairc->lock); 430 + 431 + /* again, it'd be better to do all these with the lock held, oh well */ 432 + vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 433 + if (!vptr) { 434 + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); 435 + return -ENOENT; 436 + } 437 + q_idx = *vptr; 438 + 439 + /* claim one task from cgrp_q w/ q_idx */ 440 + bpf_repeat(BPF_MAX_LOOPS) { 441 + u64 *cgq_len, len; 442 + 443 + cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); 444 + if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { 445 + /* the cgroup must be empty, expire and repeat */ 446 + __sync_fetch_and_add(&nr_cgrp_empty, 1); 447 + bpf_spin_lock(&pairc->lock); 448 + pairc->draining = true; 449 + pairc->active_mask &= ~in_pair_mask; 450 + bpf_spin_unlock(&pairc->lock); 451 + return -EAGAIN; 452 + } 453 + 454 + if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) 455 + continue; 456 + 457 + break; 458 + } 459 + 460 + cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); 461 + if (!cgq_map) { 462 + scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", 463 + cgid, q_idx); 464 + return -ENOENT; 465 + } 466 + 467 + if (bpf_map_pop_elem(cgq_map, &pid)) { 468 + scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", 469 + cgid, q_idx); 470 + return -ENOENT; 471 + } 472 + 473 + p = bpf_task_from_pid(pid); 474 + if (p) { 475 + __sync_fetch_and_add(&nr_dispatched, 1); 476 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); 477 + bpf_task_release(p); 478 + } else { 479 + /* we don't handle dequeues, retry on lost tasks */ 480 + __sync_fetch_and_add(&nr_missing, 1); 481 + return -EAGAIN; 482 + } 483 + 484 + out_maybe_kick: 485 + if (kick_pair) { 486 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 487 + if (pair) { 488 + __sync_fetch_and_add(&nr_kicks, 1); 489 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); 490 + } 491 + } 492 + return 0; 493 + } 494 + 495 + void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) 496 + { 497 + bpf_repeat(BPF_MAX_LOOPS) { 498 + if (try_dispatch(cpu) != -EAGAIN) 499 + break; 500 + } 501 + } 502 + 503 + void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) 504 + { 505 + int ret; 506 + u32 in_pair_mask; 507 + struct pair_ctx *pairc; 508 + bool kick_pair; 509 + 510 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 511 + if (ret) 512 + return; 513 + 514 + bpf_spin_lock(&pairc->lock); 515 + pairc->preempted_mask &= ~in_pair_mask; 516 + /* Kick the pair CPU, unless it was also preempted. */ 517 + kick_pair = !pairc->preempted_mask; 518 + bpf_spin_unlock(&pairc->lock); 519 + 520 + if (kick_pair) { 521 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 522 + 523 + if (pair) { 524 + __sync_fetch_and_add(&nr_kicks, 1); 525 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); 526 + } 527 + } 528 + } 529 + 530 + void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 531 + { 532 + int ret; 533 + u32 in_pair_mask; 534 + struct pair_ctx *pairc; 535 + bool kick_pair; 536 + 537 + ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); 538 + if (ret) 539 + return; 540 + 541 + bpf_spin_lock(&pairc->lock); 542 + pairc->preempted_mask |= in_pair_mask; 543 + pairc->active_mask &= ~in_pair_mask; 544 + /* Kick the pair CPU if it's still running. */ 545 + kick_pair = pairc->active_mask; 546 + pairc->draining = true; 547 + bpf_spin_unlock(&pairc->lock); 548 + 549 + if (kick_pair) { 550 + s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); 551 + 552 + if (pair) { 553 + __sync_fetch_and_add(&nr_kicks, 1); 554 + scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); 555 + } 556 + } 557 + __sync_fetch_and_add(&nr_preemptions, 1); 558 + } 559 + 560 + s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) 561 + { 562 + u64 cgid = cgrp->kn->id; 563 + s32 i, q_idx; 564 + 565 + bpf_for(i, 0, MAX_CGRPS) { 566 + q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; 567 + if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) 568 + break; 569 + } 570 + if (i == MAX_CGRPS) 571 + return -EBUSY; 572 + 573 + if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { 574 + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); 575 + if (busy) 576 + *busy = 0; 577 + return -EBUSY; 578 + } 579 + 580 + return 0; 581 + } 582 + 583 + void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) 584 + { 585 + u64 cgid = cgrp->kn->id; 586 + s32 *q_idx; 587 + 588 + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); 589 + if (q_idx) { 590 + u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); 591 + if (busy) 592 + *busy = 0; 593 + bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); 594 + } 595 + } 596 + 597 + void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) 598 + { 599 + UEI_RECORD(uei, ei); 600 + } 601 + 602 + SCX_OPS_DEFINE(pair_ops, 603 + .enqueue = (void *)pair_enqueue, 604 + .dispatch = (void *)pair_dispatch, 605 + .cpu_acquire = (void *)pair_cpu_acquire, 606 + .cpu_release = (void *)pair_cpu_release, 607 + .cgroup_init = (void *)pair_cgroup_init, 608 + .cgroup_exit = (void *)pair_cgroup_exit, 609 + .exit = (void *)pair_exit, 610 + .name = "pair");

+180

tools/sched_ext/scx_pair.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 + */ 7 + #include <stdio.h> 8 + #include <unistd.h> 9 + #include <inttypes.h> 10 + #include <signal.h> 11 + #include <assert.h> 12 + #include <libgen.h> 13 + #include <bpf/bpf.h> 14 + #include <scx/common.h> 15 + #include "scx_pair.h" 16 + #include "scx_pair.bpf.skel.h" 17 + 18 + const char help_fmt[] = 19 + "A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" 20 + "execute from the same CPU cgroup.\n" 21 + "\n" 22 + "See the top-level comment in .bpf.c for more details.\n" 23 + "\n" 24 + "Usage: %s [-S STRIDE]\n" 25 + "\n" 26 + " -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" 27 + " -v Print libbpf debug messages\n" 28 + " -h Display this help and exit\n"; 29 + 30 + static bool verbose; 31 + static volatile int exit_req; 32 + 33 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 34 + { 35 + if (level == LIBBPF_DEBUG && !verbose) 36 + return 0; 37 + return vfprintf(stderr, format, args); 38 + } 39 + 40 + static void sigint_handler(int dummy) 41 + { 42 + exit_req = 1; 43 + } 44 + 45 + int main(int argc, char **argv) 46 + { 47 + struct scx_pair *skel; 48 + struct bpf_link *link; 49 + __u64 seq = 0, ecode; 50 + __s32 stride, i, opt, outer_fd; 51 + 52 + libbpf_set_print(libbpf_print_fn); 53 + signal(SIGINT, sigint_handler); 54 + signal(SIGTERM, sigint_handler); 55 + restart: 56 + skel = SCX_OPS_OPEN(pair_ops, scx_pair); 57 + 58 + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); 59 + assert(skel->rodata->nr_cpu_ids > 0); 60 + skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 61 + 62 + /* pair up the earlier half to the latter by default, override with -s */ 63 + stride = skel->rodata->nr_cpu_ids / 2; 64 + 65 + while ((opt = getopt(argc, argv, "S:vh")) != -1) { 66 + switch (opt) { 67 + case 'S': 68 + stride = strtoul(optarg, NULL, 0); 69 + break; 70 + case 'v': 71 + verbose = true; 72 + break; 73 + default: 74 + fprintf(stderr, help_fmt, basename(argv[0])); 75 + return opt != 'h'; 76 + } 77 + } 78 + 79 + bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); 80 + 81 + /* Resize arrays so their element count is equal to cpu count. */ 82 + RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids); 83 + RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids); 84 + RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids); 85 + 86 + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) 87 + skel->rodata_pair_cpu->pair_cpu[i] = -1; 88 + 89 + printf("Pairs: "); 90 + for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { 91 + int j = (i + stride) % skel->rodata->nr_cpu_ids; 92 + 93 + if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) 94 + continue; 95 + 96 + SCX_BUG_ON(i == j, 97 + "Invalid stride %d - CPU%d wants to be its own pair", 98 + stride, i); 99 + 100 + SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, 101 + "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", 102 + stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); 103 + 104 + skel->rodata_pair_cpu->pair_cpu[i] = j; 105 + skel->rodata_pair_cpu->pair_cpu[j] = i; 106 + skel->rodata_pair_id->pair_id[i] = i; 107 + skel->rodata_pair_id->pair_id[j] = i; 108 + skel->rodata_in_pair_idx->in_pair_idx[i] = 0; 109 + skel->rodata_in_pair_idx->in_pair_idx[j] = 1; 110 + 111 + printf("[%d, %d] ", i, j); 112 + } 113 + printf("\n"); 114 + 115 + SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei); 116 + 117 + /* 118 + * Populate the cgrp_q_arr map which is an array containing per-cgroup 119 + * queues. It'd probably be better to do this from BPF but there are too 120 + * many to initialize statically and there's no way to dynamically 121 + * populate from BPF. 122 + */ 123 + outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); 124 + SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); 125 + 126 + printf("Initializing"); 127 + for (i = 0; i < MAX_CGRPS; i++) { 128 + __s32 inner_fd; 129 + 130 + if (exit_req) 131 + break; 132 + 133 + inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, 134 + sizeof(__u32), MAX_QUEUED, NULL); 135 + SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", 136 + inner_fd); 137 + SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), 138 + "Failed to set inner map"); 139 + close(inner_fd); 140 + 141 + if (!(i % 10)) 142 + printf("."); 143 + fflush(stdout); 144 + } 145 + printf("\n"); 146 + 147 + /* 148 + * Fully initialized, attach and run. 149 + */ 150 + link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair); 151 + 152 + while (!exit_req && !UEI_EXITED(skel, uei)) { 153 + printf("[SEQ %llu]\n", seq++); 154 + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", 155 + skel->bss->nr_total, 156 + skel->bss->nr_dispatched, 157 + skel->bss->nr_missing); 158 + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", 159 + skel->bss->nr_kicks, 160 + skel->bss->nr_preemptions); 161 + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", 162 + skel->bss->nr_exps, 163 + skel->bss->nr_exp_waits, 164 + skel->bss->nr_exp_empty); 165 + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", 166 + skel->bss->nr_cgrp_next, 167 + skel->bss->nr_cgrp_coll, 168 + skel->bss->nr_cgrp_empty); 169 + fflush(stdout); 170 + sleep(1); 171 + } 172 + 173 + bpf_link__destroy(link); 174 + ecode = UEI_REPORT(skel, uei); 175 + scx_pair__destroy(skel); 176 + 177 + if (UEI_ECODE_RESTART(ecode)) 178 + goto restart; 179 + return 0; 180 + }

+9

tools/sched_ext/scx_pair.h

··· 1 + #ifndef __SCX_EXAMPLE_PAIR_H 2 + #define __SCX_EXAMPLE_PAIR_H 3 + 4 + enum { 5 + MAX_QUEUED = 4096, 6 + MAX_CGRPS = 4096, 7 + }; 8 + 9 + #endif /* __SCX_EXAMPLE_PAIR_H */

Configure Feed

Configure Feed