Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

tools/sched_ext: add arena-based scheduler

Add a scheduler that uses BPF arenas to manage task context data.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Emil Tsalapatis and committed by Tejun Heo (36929ebd f0262b10)
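
Before the diffs, a rough sketch (not part of the patch) of the lifecycle the description refers to, using the helpers this series adds in scx_sdt.h. The struct my_task_ctx and the example_* callbacks are hypothetical placeholders, and error handling is trimmed:

/*
 * Hedged sketch only: illustrates the per-task arena data lifecycle the
 * commit describes. scx_task_alloc()/scx_task_data()/scx_task_free() are
 * the helpers added by this patch; my_task_ctx is a hypothetical payload.
 */
#include <scx/common.bpf.h>
#include "scx_sdt.h"

struct my_task_ctx { __u64 nr_enqueues; };

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init_task, struct task_struct *p,
                             struct scx_init_task_args *args)
{
        /* Allocate the task's context from the arena when it enters the scheduler. */
        struct my_task_ctx __arena *ctx = scx_task_alloc(p);

        return ctx ? 0 : -ENOMEM;
}

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
        /* Look the context back up through task-local storage in hot paths. */
        struct my_task_ctx __arena *ctx = scx_task_data(p);

        if (ctx)
                ctx->nr_enqueues++;
}

void BPF_STRUCT_OPS(example_exit_task, struct task_struct *p,
                    struct scx_exit_task_args *args)
{
        /* Release the context when the task leaves the scheduler. */
        scx_task_free(p);
}

The payload size would be registered once with scx_task_init(sizeof(struct my_task_ctx)) from the scheduler's init callback, as the sdt_init() callback in the diff below does.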

+925 -1

tools/sched_ext/Makefile (+1 -1)

 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \

tools/sched_ext/scx_sdt.bpf.c (new file, +710)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Arena-based task data scheduler. This is a variation of scx_simple
 * that uses a combined allocator and indexing structure to organize
 * task data. Task context allocation is done when a task enters the
 * scheduler, while freeing is done when it exits. Task contexts are
 * retrieved from task-local storage, pointing to the allocated memory.
 *
 * The main purpose of this scheduler is to demonstrate arena memory
 * management.
 *
 * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
 *
 */
#include <scx/common.bpf.h>
#include <scx/bpf_arena_common.bpf.h>

#include "scx_sdt.h"

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

struct {
        __uint(type, BPF_MAP_TYPE_ARENA);
        __uint(map_flags, BPF_F_MMAPABLE);
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
        __uint(max_entries, 1 << 16); /* number of pages */
        __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
#else
        __uint(max_entries, 1 << 20); /* number of pages */
        __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
#endif
} arena __weak SEC(".maps");

#define SHARED_DSQ 0

#define DEFINE_SDT_STAT(metric)                                 \
static inline void                                              \
stat_inc_##metric(struct scx_stats __arena *stats)              \
{                                                               \
        cast_kern(stats);                                       \
        stats->metric += 1;                                     \
}                                                               \
__u64 stat_##metric;                                            \

DEFINE_SDT_STAT(enqueue);
DEFINE_SDT_STAT(init);
DEFINE_SDT_STAT(exit);
DEFINE_SDT_STAT(select_idle_cpu);
DEFINE_SDT_STAT(select_busy_cpu);

/*
 * Necessary for cond_break/can_loop's semantics. According to kernel commit
 * 011832b, the loop counter variable must be seen as imprecise and bounded
 * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
 * makes it precise and prevents may_goto from helping with converging the
 * loop. For these loops we must initialize the loop counter from a variable
 * whose value the verifier cannot reason about when checking the program, so
 * that the loop counter's value is imprecise.
 */
static __u64 zero = 0;

/*
 * XXX Hack to get the verifier to find the arena for sdt_exit_task.
 * As of 6.12-rc5, the verifier associates arenas with programs by
 * checking LD.IMM instruction operands for an arena and populating
 * the program state with the first instance it finds. This requires
 * accessing our global arena variable, but scx methods do not necessarily
 * do so while still using pointers from that arena. Insert a bpf_printk
 * statement that triggers at most once to generate an LD.IMM instruction
 * to access the arena and help the verifier.
 */
static volatile bool scx_arena_verify_once;

__hidden void scx_arena_subprog_init(void)
{
        if (scx_arena_verify_once)
                return;

        bpf_printk("%s: arena pointer %p", __func__, &arena);
        scx_arena_verify_once = true;
}


private(LOCK) struct bpf_spin_lock alloc_lock;
private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;

/* allocation pools */
struct sdt_pool desc_pool;
struct sdt_pool chunk_pool;

/* Protected by alloc_lock. */
struct scx_alloc_stats alloc_stats;


/* Allocate element from the pool. Must be called with the pool lock held. */
static
void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
{
        __u64 elem_size, max_elems;
        void __arena *slab;
        void __arena *ptr;

        elem_size = pool->elem_size;
        max_elems = pool->max_elems;

        /* If the chunk is spent, get a new one. */
        if (pool->idx >= max_elems) {
                slab = bpf_arena_alloc_pages(&arena, NULL,
                        div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
                if (!slab)
                        return NULL;

                pool->slab = slab;
                pool->idx = 0;
        }

        ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
        pool->idx += 1;

        return ptr;
}

/* Alloc desc and associated chunk. Called with the allocator spinlock held. */
static sdt_desc_t *scx_alloc_chunk(void)
{
        struct sdt_chunk __arena *chunk;
        sdt_desc_t *desc;
        sdt_desc_t *out;

        chunk = scx_alloc_from_pool(&chunk_pool);
        if (!chunk)
                return NULL;

        desc = scx_alloc_from_pool(&desc_pool);
        if (!desc) {
                /*
                 * Effectively frees the previous chunk allocation.
                 * Index cannot be 0, so decrementing is always
                 * valid.
                 */
                chunk_pool.idx -= 1;
                return NULL;
        }

        out = desc;

        desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
        desc->chunk = chunk;

        alloc_stats.chunk_allocs += 1;

        return out;
}

static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
{
        if (unlikely(data_size % 8))
                return -EINVAL;

        if (unlikely(nr_pages == 0))
                return -EINVAL;

        pool->elem_size = data_size;
        pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
        /* Populate the pool slab on the first allocation. */
        pool->idx = pool->max_elems;

        return 0;
}

/* Initialize both the base pool allocators and the root chunk of the index. */
__hidden int
scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
{
        size_t min_chunk_size;
        int ret;

        _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
                       "chunk size must fit into a page");

        ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
        if (ret != 0)
                return ret;

        ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
        if (ret != 0)
                return ret;

        /* Wrap data into a descriptor and word align. */
        data_size += sizeof(struct sdt_data);
        data_size = round_up(data_size, 8);

        /*
         * Ensure we allocate large enough chunks from the arena to avoid excessive
         * internal fragmentation when turning chunks into structs.
         */
        min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
        ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
        if (ret != 0)
                return ret;

        bpf_spin_lock(&alloc_lock);
        alloc->root = scx_alloc_chunk();
        bpf_spin_unlock(&alloc_lock);
        if (!alloc->root)
                return -ENOMEM;

        return 0;
}

static
int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
{
        __u64 __arena *allocated = desc->allocated;
        __u64 bit;

        if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
                return -EINVAL;

        bit = (__u64)1 << (pos % 64);

        if (state)
                allocated[pos / 64] |= bit;
        else
                allocated[pos / 64] &= ~bit;

        return 0;
}

static __noinline
int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
{
        sdt_desc_t *desc;
        __u64 u, level;
        int ret;

        for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
                level = SDT_TASK_LEVELS - 1 - u;

                /* Only propagate upwards if we are the parent's only free chunk. */
                desc = lv_desc[level];

                ret = set_idx_state(desc, lv_pos[level], false);
                if (unlikely(ret != 0))
                        return ret;

                desc->nr_free += 1;
                if (desc->nr_free > 1)
                        return 0;
        }

        return 0;
}

/*
 * Free the allocated struct with the given index. Called with the
 * allocator lock taken.
 */
__hidden
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
{
        const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
        sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
        sdt_desc_t * __arena *desc_children;
        struct sdt_chunk __arena *chunk;
        sdt_desc_t *desc;
        struct sdt_data __arena *data;
        __u64 level, shift, pos;
        __u64 lv_pos[SDT_TASK_LEVELS];
        int ret;
        int i;

        if (!alloc)
                return 0;

        desc = alloc->root;
        if (unlikely(!desc))
                return -EINVAL;

        /* To appease the verifier. */
        for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
                lv_desc[level] = NULL;
                lv_pos[level] = 0;
        }

        /* Find the leaf node containing the index. */
        for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
                shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
                pos = (idx >> shift) & mask;

                lv_desc[level] = desc;
                lv_pos[level] = pos;

                if (level == SDT_TASK_LEVELS - 1)
                        break;

                chunk = desc->chunk;

                desc_children = (sdt_desc_t * __arena *)chunk->descs;
                desc = desc_children[pos];

                if (unlikely(!desc))
                        return -EINVAL;
        }

        chunk = desc->chunk;

        pos = idx & mask;
        data = chunk->data[pos];
        if (likely(data)) {
                data[pos] = (struct sdt_data) {
                        .tid.genn = data->tid.genn + 1,
                };

                /* Zero out one word at a time. */
                for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
                        data->payload[i] = 0;
                }
        }

        ret = mark_nodes_avail(lv_desc, lv_pos);
        if (unlikely(ret != 0))
                return ret;

        alloc_stats.active_allocs -= 1;
        alloc_stats.free_ops += 1;

        return 0;
}

static inline
int ffs(__u64 word)
{
        unsigned int num = 0;

        if ((word & 0xffffffff) == 0) {
                num += 32;
                word >>= 32;
        }

        if ((word & 0xffff) == 0) {
                num += 16;
                word >>= 16;
        }

        if ((word & 0xff) == 0) {
                num += 8;
                word >>= 8;
        }

        if ((word & 0xf) == 0) {
                num += 4;
                word >>= 4;
        }

        if ((word & 0x3) == 0) {
                num += 2;
                word >>= 2;
        }

        if ((word & 0x1) == 0) {
                num += 1;
                word >>= 1;
        }

        return num;
}


/* find the first empty slot */
__hidden
__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
{
        __u64 freeslots;
        __u64 i;

        for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
                freeslots = ~desc->allocated[i];
                if (freeslots == (__u64)0)
                        continue;

                return (i * 64) + ffs(freeslots);
        }

        return SDT_TASK_ENTS_PER_CHUNK;
}

/*
 * Find and return an available idx on the allocator.
 * Called with the task spinlock held.
 */
static sdt_desc_t *desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
{
        sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
        sdt_desc_t * __arena *desc_children;
        struct sdt_chunk __arena *chunk;
        sdt_desc_t *tmp;
        __u64 lv_pos[SDT_TASK_LEVELS];
        __u64 u, pos, level;
        __u64 idx = 0;
        int ret;

        for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
                pos = chunk_find_empty(desc);

                /* If we error out, something has gone very wrong. */
                if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
                        return NULL;

                if (pos == SDT_TASK_ENTS_PER_CHUNK)
                        return NULL;

                idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
                idx |= pos;

                /* Log the levels to complete allocation. */
                lv_desc[level] = desc;
                lv_pos[level] = pos;

                /* The rest of the loop is for internal node traversal. */
                if (level == SDT_TASK_LEVELS - 1)
                        break;

                /* Allocate an internal node if necessary. */
                chunk = desc->chunk;
                desc_children = (sdt_desc_t * __arena *)chunk->descs;

                desc = desc_children[pos];
                if (!desc) {
                        desc = scx_alloc_chunk();
                        if (!desc)
                                return NULL;

                        desc_children[pos] = desc;
                }
        }

        /*
         * Finding the descriptor along with any internal node
         * allocations was successful. Update all levels with
         * the new allocation.
         */
        bpf_for(u, 0, SDT_TASK_LEVELS) {
                level = SDT_TASK_LEVELS - 1 - u;
                tmp = lv_desc[level];

                ret = set_idx_state(tmp, lv_pos[level], true);
                if (ret != 0)
                        break;

                tmp->nr_free -= 1;
                if (tmp->nr_free > 0)
                        break;
        }

        *idxp = idx;

        return desc;
}

__hidden
void __arena *scx_alloc(struct scx_allocator *alloc)
{
        struct sdt_data __arena *data = NULL;
        struct sdt_chunk __arena *chunk;
        sdt_desc_t *desc;
        __u64 idx, pos;

        if (!alloc)
                return NULL;

        bpf_spin_lock(&alloc_lock);

        /* We unlock if we encounter an error in the function. */
        desc = desc_find_empty(alloc->root, &idx);
        if (unlikely(desc == NULL)) {
                bpf_spin_unlock(&alloc_lock);
                return NULL;
        }

        chunk = desc->chunk;

        /* Populate the leaf node if necessary. */
        pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
        data = chunk->data[pos];
        if (!data) {
                data = scx_alloc_from_pool(&alloc->pool);
                if (!data) {
                        scx_alloc_free_idx(alloc, idx);
                        bpf_spin_unlock(&alloc_lock);
                        return NULL;
                }
        }

        chunk->data[pos] = data;

        /* The data counts as a chunk */
        alloc_stats.data_allocs += 1;
        alloc_stats.alloc_ops += 1;
        alloc_stats.active_allocs += 1;

        data->tid.idx = idx;

        bpf_spin_unlock(&alloc_lock);

        return data;
}

/*
 * Task BPF map entry recording the task's assigned ID and pointing to the data
 * area allocated in arena.
 */
struct scx_task_map_val {
        union sdt_id tid;
        __u64 tptr;
        struct sdt_data __arena *data;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct scx_task_map_val);
} scx_task_map SEC(".maps");

static struct scx_allocator scx_task_allocator;

__hidden
void __arena *scx_task_alloc(struct task_struct *p)
{
        struct sdt_data __arena *data = NULL;
        struct scx_task_map_val *mval;

        mval = bpf_task_storage_get(&scx_task_map, p, 0,
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!mval)
                return NULL;

        data = scx_alloc(&scx_task_allocator);
        if (unlikely(!data))
                return NULL;

        mval->tid = data->tid;
        mval->tptr = (__u64) p;
        mval->data = data;

        return (void __arena *)data->payload;
}

__hidden
int scx_task_init(__u64 data_size)
{
        return scx_alloc_init(&scx_task_allocator, data_size);
}

__hidden
void __arena *scx_task_data(struct task_struct *p)
{
        struct sdt_data __arena *data;
        struct scx_task_map_val *mval;

        scx_arena_subprog_init();

        mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
        if (!mval)
                return NULL;

        data = mval->data;

        return (void __arena *)data->payload;
}

__hidden
void scx_task_free(struct task_struct *p)
{
        struct scx_task_map_val *mval;

        scx_arena_subprog_init();

        mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
        if (!mval)
                return;

        bpf_spin_lock(&alloc_lock);
        scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
        bpf_spin_unlock(&alloc_lock);

        bpf_task_storage_delete(&scx_task_map, p);
}

static inline void
scx_stat_global_update(struct scx_stats __arena *stats)
{
        cast_kern(stats);
        __sync_fetch_and_add(&stat_enqueue, stats->enqueue);
        __sync_fetch_and_add(&stat_init, stats->init);
        __sync_fetch_and_add(&stat_exit, stats->exit);
        __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
        __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
}

s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
        struct scx_stats __arena *stats;
        bool is_idle = false;
        s32 cpu;

        stats = scx_task_data(p);
        if (!stats) {
                scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
                return 0;
        }

        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
        if (is_idle) {
                stat_inc_select_idle_cpu(stats);
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
        } else {
                stat_inc_select_busy_cpu(stats);
        }

        return cpu;
}

void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
{
        struct scx_stats __arena *stats;

        stats = scx_task_data(p);
        if (!stats) {
                scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
                return;
        }

        stat_inc_enqueue(stats);

        scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
{
        scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
                             struct scx_init_task_args *args)
{
        struct scx_stats __arena *stats;

        stats = scx_task_alloc(p);
        if (!stats) {
                scx_bpf_error("arena allocator out of memory");
                return -ENOMEM;
        }

        stats->pid = p->pid;

        stat_inc_init(stats);

        return 0;
}

void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
                    struct scx_exit_task_args *args)
{
        struct scx_stats __arena *stats;

        stats = scx_task_data(p);
        if (!stats) {
                scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
                return;
        }

        stat_inc_exit(stats);
        scx_stat_global_update(stats);

        scx_task_free(p);
}

s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
{
        int ret;

        ret = scx_task_init(sizeof(struct scx_stats));
        if (ret < 0) {
                scx_bpf_error("%s: failed with %d", __func__, ret);
                return ret;
        }

        return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
{
        UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(sdt_ops,
               .select_cpu      = (void *)sdt_select_cpu,
               .enqueue         = (void *)sdt_enqueue,
               .dispatch        = (void *)sdt_dispatch,
               .init_task       = (void *)sdt_init_task,
               .exit_task       = (void *)sdt_exit_task,
               .init            = (void *)sdt_init,
               .exit            = (void *)sdt_exit,
               .name            = "sdt");
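
The allocator above indexes task contexts with a three-level radix tree: with SDT_TASK_ENTS_PER_PAGE_SHIFT = 9 and SDT_TASK_LEVELS = 3, an allocation index packs three 9-bit slot positions, so the tree can address 512^3 = 134,217,728 entries. The per-level decomposition used by scx_alloc_free_idx() can be reproduced in isolation; the stand-alone sketch below (not part of the patch) mirrors that arithmetic with constants copied from scx_sdt.h:

/* Stand-alone sketch of the idx -> per-level slot math used by
 * scx_alloc_free_idx(); constants mirror enum sdt_consts in scx_sdt.h. */
#include <stdio.h>
#include <stdint.h>

#define ENTS_PER_PAGE_SHIFT 9
#define LEVELS              3
#define MASK                ((1ull << ENTS_PER_PAGE_SHIFT) - 1)

int main(void)
{
        uint64_t idx = 0x12345;         /* arbitrary example index */

        for (int level = 0; level < LEVELS; level++) {
                int shift = (LEVELS - 1 - level) * ENTS_PER_PAGE_SHIFT;
                uint64_t pos = (idx >> shift) & MASK;

                printf("level %d -> slot %llu\n", level, (unsigned long long)pos);
        }
        return 0;
}
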
tools/sched_ext/scx_sdt.c (new file, +101)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
 * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>

#include "scx_sdt.h"
#include "scx_sdt.bpf.skel.h"

const char help_fmt[] =
"A simple arena-based sched_ext scheduler.\n"
"\n"
"Modified version of scx_simple that demonstrates arena-based data structures.\n"
"\n"
"Usage: %s [-f] [-v]\n"
"\n"
"  -v  Print libbpf debug messages\n"
"  -h  Display this help and exit\n";

static bool verbose;
static volatile int exit_req;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
        if (level == LIBBPF_DEBUG && !verbose)
                return 0;
        return vfprintf(stderr, format, args);
}

static void sigint_handler(int sig)
{
        exit_req = 1;
}

int main(int argc, char **argv)
{
        struct scx_sdt *skel;
        struct bpf_link *link;
        __u32 opt;
        __u64 ecode;

        libbpf_set_print(libbpf_print_fn);
        signal(SIGINT, sigint_handler);
        signal(SIGTERM, sigint_handler);
restart:
        skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);

        while ((opt = getopt(argc, argv, "fvh")) != -1) {
                switch (opt) {
                case 'v':
                        verbose = true;
                        break;
                default:
                        fprintf(stderr, help_fmt, basename(argv[0]));
                        return opt != 'h';
                }
        }

        SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
        link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);

        while (!exit_req && !UEI_EXITED(skel, uei)) {
                printf("====SCHEDULING STATS====\n");
                printf("enqueues=%llu\t", skel->bss->stat_enqueue);
                printf("inits=%llu\t", skel->bss->stat_init);
                printf("exits=%llu\t", skel->bss->stat_exit);
                printf("\n");

                printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
                printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
                printf("\n");

                printf("====ALLOCATION STATS====\n");
                printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs);
                printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs);
                printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops);
                printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops);
                printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs);
                printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used);
                printf("\n\n");

                fflush(stdout);
                sleep(1);
        }

        bpf_link__destroy(link);
        ecode = UEI_REPORT(skel, uei);
        scx_sdt__destroy(skel);

        if (UEI_ECODE_RESTART(ecode))
                goto restart;
        return 0;
}

tools/sched_ext/scx_sdt.h (new file, +113)
/*
 * SPDX-License-Identifier: GPL-2.0
 * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com>
 */
#pragma once

#ifndef __BPF__
#define __arena
#endif /* __BPF__ */

struct scx_alloc_stats {
        __u64 chunk_allocs;
        __u64 data_allocs;
        __u64 alloc_ops;
        __u64 free_ops;
        __u64 active_allocs;
        __u64 arena_pages_used;
};

struct sdt_pool {
        void __arena *slab;
        __u64 elem_size;
        __u64 max_elems;
        __u64 idx;
};

#ifndef div_round_up
#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#endif

#ifndef round_up
#define round_up(a, b) (div_round_up((a), (b)) * (b))
#endif

typedef struct sdt_desc __arena sdt_desc_t;

enum sdt_consts {
        SDT_TASK_ENTS_PER_PAGE_SHIFT    = 9,
        SDT_TASK_LEVELS                 = 3,
        SDT_TASK_ENTS_PER_CHUNK         = 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT,
        SDT_TASK_CHUNK_BITMAP_U64S      = div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
        SDT_TASK_MIN_ELEM_PER_ALLOC     = 8,
};

union sdt_id {
        __s64 val;
        struct {
                __s32 idx;      /* index in the radix tree */
                __s32 genn;     /* ++'d on recycle so that it forms unique'ish 64bit ID */
        };
};

struct sdt_chunk;

/*
 * Each index page is described by the following descriptor which carries the
 * bitmap. This way the actual index can host power-of-two numbers of entries
 * which makes indexing cheaper.
 */
struct sdt_desc {
        __u64 allocated[SDT_TASK_CHUNK_BITMAP_U64S];
        __u64 nr_free;
        struct sdt_chunk __arena *chunk;
};

/*
 * Leaf node containing per-task data.
 */
struct sdt_data {
        union sdt_id tid;
        __u64 payload[];
};

/*
 * Intermediate node pointing to another intermediate node or leaf node.
 */
struct sdt_chunk {
        union {
                sdt_desc_t *descs[SDT_TASK_ENTS_PER_CHUNK];
                struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
        };
};

struct scx_allocator {
        struct sdt_pool pool;
        sdt_desc_t *root;
};

struct scx_stats {
        int seq;
        pid_t pid;
        __u64 enqueue;
        __u64 exit;
        __u64 init;
        __u64 select_busy_cpu;
        __u64 select_idle_cpu;
};

#ifdef __BPF__

void __arena *scx_task_data(struct task_struct *p);
int scx_task_init(__u64 data_size);
void __arena *scx_task_alloc(struct task_struct *p);
void scx_task_free(struct task_struct *p);
void scx_arena_subprog_init(void);

int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size);
u64 scx_alloc_internal(struct scx_allocator *alloc);
int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx);

#endif /* __BPF__ */
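
One worked detail on the sizing arithmetic: scx_alloc_init() wraps the caller's payload in struct sdt_data, rounds it up to 8 bytes, and sizes the data pool so that at least SDT_TASK_MIN_ELEM_PER_ALLOC elements fit per arena allocation. The stand-alone sketch below (not part of the patch; the 48-byte payload and 4 KiB page size are illustrative assumptions) replays that arithmetic with the same macros as this header:

/* Sketch of the pool-sizing arithmetic from scx_alloc_init()/pool_set_size(),
 * duplicated in plain C so it can be run in isolation. */
#include <stdio.h>
#include <stdint.h>

#define div_round_up(a, b) (((a) + (b) - 1) / (b))
#define round_up(a, b) (div_round_up((a), (b)) * (b))

#define PAGE_SIZE                       4096ull
#define SDT_TASK_MIN_ELEM_PER_ALLOC     8ull

int main(void)
{
        uint64_t payload   = 48;                        /* illustrative per-task payload size */
        uint64_t data_size = round_up(payload + 8, 8);  /* + sizeof(struct sdt_data) header (8 bytes) */
        uint64_t nr_pages  = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
        uint64_t max_elems = (PAGE_SIZE * nr_pages) / data_size;

        printf("elem_size=%llu nr_pages=%llu elems_per_slab=%llu\n",
               (unsigned long long)data_size, (unsigned long long)nr_pages,
               (unsigned long long)max_elems);
        return 0;
}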