Merge tag 'sched_ext-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+5 -1

include/linux/sched/ext.h

··· 108 108 SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ 109 109 /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ 110 110 SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ 111 - /* ops.dequeue (in REST) may be nested inside DISPATCH */ 111 + /* 112 + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and 113 + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be 114 + * nested inside DISPATCH. 115 + */ 112 116 SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ 113 117 SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ 114 118 SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */

+1

kernel/sched/build_policy.c

··· 58 58 #include "deadline.c" 59 59 60 60 #ifdef CONFIG_SCHED_CLASS_EXT 61 + # include "ext_internal.h" 61 62 # include "ext.c" 62 63 # include "ext_idle.c" 63 64 #endif

-2

kernel/sched/core.c

··· 9362 9362 9363 9363 cgroup_taskset_for_each(task, css, tset) 9364 9364 sched_move_task(task, false); 9365 - 9366 - scx_cgroup_finish_attach(); 9367 9365 } 9368 9366 9369 9367 static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)

+313 -1243

kernel/sched/ext.c

··· 9 9 #include <linux/btf_ids.h> 10 10 #include "ext_idle.h" 11 11 12 - #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 13 - 14 - enum scx_consts { 15 - SCX_DSP_DFL_MAX_BATCH = 32, 16 - SCX_DSP_MAX_LOOPS = 32, 17 - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, 18 - 19 - SCX_EXIT_BT_LEN = 64, 20 - SCX_EXIT_MSG_LEN = 1024, 21 - SCX_EXIT_DUMP_DFL_LEN = 32768, 22 - 23 - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, 24 - 25 - /* 26 - * Iterating all tasks may take a while. Periodically drop 27 - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 28 - */ 29 - SCX_TASK_ITER_BATCH = 32, 30 - }; 31 - 32 - enum scx_exit_kind { 33 - SCX_EXIT_NONE, 34 - SCX_EXIT_DONE, 35 - 36 - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ 37 - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ 38 - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ 39 - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ 40 - 41 - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ 42 - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ 43 - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ 44 - }; 45 - 46 - /* 47 - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), 48 - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes 49 - * are 64bit of the format: 50 - * 51 - * Bits: [63 .. 48 47 .. 32 31 .. 0] 52 - * [ SYS ACT ] [ SYS RSN ] [ USR ] 53 - * 54 - * SYS ACT: System-defined exit actions 55 - * SYS RSN: System-defined exit reasons 56 - * USR : User-defined exit codes and reasons 57 - * 58 - * Using the above, users may communicate intention and context by ORing system 59 - * actions and/or system reasons with a user-defined exit code. 
60 - */ 61 - enum scx_exit_code { 62 - /* Reasons */ 63 - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, 64 - 65 - /* Actions */ 66 - SCX_ECODE_ACT_RESTART = 1LLU << 48, 67 - }; 68 - 69 - /* 70 - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is 71 - * being disabled. 72 - */ 73 - struct scx_exit_info { 74 - /* %SCX_EXIT_* - broad category of the exit reason */ 75 - enum scx_exit_kind kind; 76 - 77 - /* exit code if gracefully exiting */ 78 - s64 exit_code; 79 - 80 - /* textual representation of the above */ 81 - const char *reason; 82 - 83 - /* backtrace if exiting due to an error */ 84 - unsigned long *bt; 85 - u32 bt_len; 86 - 87 - /* informational message */ 88 - char *msg; 89 - 90 - /* debug dump */ 91 - char *dump; 92 - }; 93 - 94 - /* sched_ext_ops.flags */ 95 - enum scx_ops_flags { 96 - /* 97 - * Keep built-in idle tracking even if ops.update_idle() is implemented. 98 - */ 99 - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, 100 - 101 - /* 102 - * By default, if there are no other task to run on the CPU, ext core 103 - * keeps running the current task even after its slice expires. If this 104 - * flag is specified, such tasks are passed to ops.enqueue() with 105 - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. 106 - */ 107 - SCX_OPS_ENQ_LAST = 1LLU << 1, 108 - 109 - /* 110 - * An exiting task may schedule after PF_EXITING is set. In such cases, 111 - * bpf_task_from_pid() may not be able to find the task and if the BPF 112 - * scheduler depends on pid lookup for dispatching, the task will be 113 - * lost leading to various issues including RCU grace period stalls. 114 - * 115 - * To mask this problem, by default, unhashed tasks are automatically 116 - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't 117 - * depend on pid lookups and wants to handle these tasks directly, the 118 - * following flag can be used. 
119 - */ 120 - SCX_OPS_ENQ_EXITING = 1LLU << 2, 121 - 122 - /* 123 - * If set, only tasks with policy set to SCHED_EXT are attached to 124 - * sched_ext. If clear, SCHED_NORMAL tasks are also included. 125 - */ 126 - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 127 - 128 - /* 129 - * A migration disabled task can only execute on its current CPU. By 130 - * default, such tasks are automatically put on the CPU's local DSQ with 131 - * the default slice on enqueue. If this ops flag is set, they also go 132 - * through ops.enqueue(). 133 - * 134 - * A migration disabled task never invokes ops.select_cpu() as it can 135 - * only select the current CPU. Also, p->cpus_ptr will only contain its 136 - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr 137 - * and thus may disagree with cpumask_weight(p->cpus_ptr). 138 - */ 139 - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, 140 - 141 - /* 142 - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes 143 - * ops.enqueue() on the ops.select_cpu() selected or the wakee's 144 - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline 145 - * transfers. When this optimization is enabled, ops.select_cpu() is 146 - * skipped in some cases (when racing against the wakee switching out). 147 - * As the BPF scheduler may depend on ops.select_cpu() being invoked 148 - * during wakeups, queued wakeup is disabled by default. 149 - * 150 - * If this ops flag is set, queued wakeup optimization is enabled and 151 - * the BPF scheduler must be able to handle ops.enqueue() invoked on the 152 - * wakee's CPU without preceding ops.select_cpu() even for tasks which 153 - * may be executed on multiple CPUs. 154 - */ 155 - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, 156 - 157 - /* 158 - * If set, enable per-node idle cpumasks. If clear, use a single global 159 - * flat idle cpumask. 
160 - */ 161 - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, 162 - 163 - /* 164 - * CPU cgroup support flags 165 - */ 166 - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ 167 - 168 - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | 169 - SCX_OPS_ENQ_LAST | 170 - SCX_OPS_ENQ_EXITING | 171 - SCX_OPS_ENQ_MIGRATION_DISABLED | 172 - SCX_OPS_ALLOW_QUEUED_WAKEUP | 173 - SCX_OPS_SWITCH_PARTIAL | 174 - SCX_OPS_BUILTIN_IDLE_PER_NODE | 175 - SCX_OPS_HAS_CGROUP_WEIGHT, 176 - 177 - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ 178 - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, 179 - 180 - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, 181 - }; 182 - 183 - /* argument container for ops.init_task() */ 184 - struct scx_init_task_args { 185 - /* 186 - * Set if ops.init_task() is being invoked on the fork path, as opposed 187 - * to the scheduler transition path. 188 - */ 189 - bool fork; 190 - #ifdef CONFIG_EXT_GROUP_SCHED 191 - /* the cgroup the task is joining */ 192 - struct cgroup *cgroup; 193 - #endif 194 - }; 195 - 196 - /* argument container for ops.exit_task() */ 197 - struct scx_exit_task_args { 198 - /* Whether the task exited before running on sched_ext. 
*/ 199 - bool cancelled; 200 - }; 201 - 202 - /* argument container for ops->cgroup_init() */ 203 - struct scx_cgroup_init_args { 204 - /* the weight of the cgroup [1..10000] */ 205 - u32 weight; 206 - 207 - /* bandwidth control parameters from cpu.max and cpu.max.burst */ 208 - u64 bw_period_us; 209 - u64 bw_quota_us; 210 - u64 bw_burst_us; 211 - }; 212 - 213 - enum scx_cpu_preempt_reason { 214 - /* next task is being scheduled by &sched_class_rt */ 215 - SCX_CPU_PREEMPT_RT, 216 - /* next task is being scheduled by &sched_class_dl */ 217 - SCX_CPU_PREEMPT_DL, 218 - /* next task is being scheduled by &sched_class_stop */ 219 - SCX_CPU_PREEMPT_STOP, 220 - /* unknown reason for SCX being preempted */ 221 - SCX_CPU_PREEMPT_UNKNOWN, 222 - }; 223 - 224 - /* 225 - * Argument container for ops->cpu_acquire(). Currently empty, but may be 226 - * expanded in the future. 227 - */ 228 - struct scx_cpu_acquire_args {}; 229 - 230 - /* argument container for ops->cpu_release() */ 231 - struct scx_cpu_release_args { 232 - /* the reason the CPU was preempted */ 233 - enum scx_cpu_preempt_reason reason; 234 - 235 - /* the task that's going to be scheduled on the CPU */ 236 - struct task_struct *task; 237 - }; 238 - 239 - /* 240 - * Informational context provided to dump operations. 241 - */ 242 - struct scx_dump_ctx { 243 - enum scx_exit_kind kind; 244 - s64 exit_code; 245 - const char *reason; 246 - u64 at_ns; 247 - u64 at_jiffies; 248 - }; 249 - 250 - /** 251 - * struct sched_ext_ops - Operation table for BPF scheduler implementation 252 - * 253 - * A BPF scheduler can implement an arbitrary scheduling policy by 254 - * implementing and loading operations in this table. Note that a userland 255 - * scheduling policy can also be implemented using the BPF scheduler 256 - * as a shim layer. 
257 - */ 258 - struct sched_ext_ops { 259 - /** 260 - * @select_cpu: Pick the target CPU for a task which is being woken up 261 - * @p: task being woken up 262 - * @prev_cpu: the cpu @p was on before sleeping 263 - * @wake_flags: SCX_WAKE_* 264 - * 265 - * Decision made here isn't final. @p may be moved to any CPU while it 266 - * is getting dispatched for execution later. However, as @p is not on 267 - * the rq at this point, getting the eventual execution CPU right here 268 - * saves a small bit of overhead down the line. 269 - * 270 - * If an idle CPU is returned, the CPU is kicked and will try to 271 - * dispatch. While an explicit custom mechanism can be added, 272 - * select_cpu() serves as the default way to wake up idle CPUs. 273 - * 274 - * @p may be inserted into a DSQ directly by calling 275 - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. 276 - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ 277 - * of the CPU returned by this operation. 278 - * 279 - * Note that select_cpu() is never called for tasks that can only run 280 - * on a single CPU or tasks with migration disabled, as they don't have 281 - * the option to select a different CPU. See select_task_rq() for 282 - * details. 283 - */ 284 - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); 285 - 286 - /** 287 - * @enqueue: Enqueue a task on the BPF scheduler 288 - * @p: task being enqueued 289 - * @enq_flags: %SCX_ENQ_* 290 - * 291 - * @p is ready to run. Insert directly into a DSQ by calling 292 - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly 293 - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, 294 - * the task will stall. 295 - * 296 - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is 297 - * skipped. 
298 - */ 299 - void (*enqueue)(struct task_struct *p, u64 enq_flags); 300 - 301 - /** 302 - * @dequeue: Remove a task from the BPF scheduler 303 - * @p: task being dequeued 304 - * @deq_flags: %SCX_DEQ_* 305 - * 306 - * Remove @p from the BPF scheduler. This is usually called to isolate 307 - * the task while updating its scheduling properties (e.g. priority). 308 - * 309 - * The ext core keeps track of whether the BPF side owns a given task or 310 - * not and can gracefully ignore spurious dispatches from BPF side, 311 - * which makes it safe to not implement this method. However, depending 312 - * on the scheduling logic, this can lead to confusing behaviors - e.g. 313 - * scheduling position not being updated across a priority change. 314 - */ 315 - void (*dequeue)(struct task_struct *p, u64 deq_flags); 316 - 317 - /** 318 - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs 319 - * @cpu: CPU to dispatch tasks for 320 - * @prev: previous task being switched out 321 - * 322 - * Called when a CPU's local dsq is empty. The operation should dispatch 323 - * one or more tasks from the BPF scheduler into the DSQs using 324 - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ 325 - * using scx_bpf_dsq_move_to_local(). 326 - * 327 - * The maximum number of times scx_bpf_dsq_insert() can be called 328 - * without an intervening scx_bpf_dsq_move_to_local() is specified by 329 - * ops.dispatch_max_batch. See the comments on top of the two functions 330 - * for more details. 331 - * 332 - * When not %NULL, @prev is an SCX task with its slice depleted. If 333 - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in 334 - * @prev->scx.flags, it is not enqueued yet and will be enqueued after 335 - * ops.dispatch() returns. To keep executing @prev, return without 336 - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. 
337 - */ 338 - void (*dispatch)(s32 cpu, struct task_struct *prev); 339 - 340 - /** 341 - * @tick: Periodic tick 342 - * @p: task running currently 343 - * 344 - * This operation is called every 1/HZ seconds on CPUs which are 345 - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an 346 - * immediate dispatch cycle on the CPU. 347 - */ 348 - void (*tick)(struct task_struct *p); 349 - 350 - /** 351 - * @runnable: A task is becoming runnable on its associated CPU 352 - * @p: task becoming runnable 353 - * @enq_flags: %SCX_ENQ_* 354 - * 355 - * This and the following three functions can be used to track a task's 356 - * execution state transitions. A task becomes ->runnable() on a CPU, 357 - * and then goes through one or more ->running() and ->stopping() pairs 358 - * as it runs on the CPU, and eventually becomes ->quiescent() when it's 359 - * done running on the CPU. 360 - * 361 - * @p is becoming runnable on the CPU because it's 362 - * 363 - * - waking up (%SCX_ENQ_WAKEUP) 364 - * - being moved from another CPU 365 - * - being restored after temporarily taken off the queue for an 366 - * attribute change. 367 - * 368 - * This and ->enqueue() are related but not coupled. This operation 369 - * notifies @p's state transition and may not be followed by ->enqueue() 370 - * e.g. when @p is being dispatched to a remote CPU, or when @p is 371 - * being enqueued on a CPU experiencing a hotplug event. Likewise, a 372 - * task may be ->enqueue()'d without being preceded by this operation 373 - * e.g. after exhausting its slice. 374 - */ 375 - void (*runnable)(struct task_struct *p, u64 enq_flags); 376 - 377 - /** 378 - * @running: A task is starting to run on its associated CPU 379 - * @p: task starting to run 380 - * 381 - * Note that this callback may be called from a CPU other than the 382 - * one the task is going to run on. 
This can happen when a task 383 - * property is changed (i.e., affinity), since scx_next_task_scx(), 384 - * which triggers this callback, may run on a CPU different from 385 - * the task's assigned CPU. 386 - * 387 - * Therefore, always use scx_bpf_task_cpu(@p) to determine the 388 - * target CPU the task is going to use. 389 - * 390 - * See ->runnable() for explanation on the task state notifiers. 391 - */ 392 - void (*running)(struct task_struct *p); 393 - 394 - /** 395 - * @stopping: A task is stopping execution 396 - * @p: task stopping to run 397 - * @runnable: is task @p still runnable? 398 - * 399 - * Note that this callback may be called from a CPU other than the 400 - * one the task was running on. This can happen when a task 401 - * property is changed (i.e., affinity), since dequeue_task_scx(), 402 - * which triggers this callback, may run on a CPU different from 403 - * the task's assigned CPU. 404 - * 405 - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU 406 - * the task was running on. 407 - * 408 - * See ->runnable() for explanation on the task state notifiers. If 409 - * !@runnable, ->quiescent() will be invoked after this operation 410 - * returns. 411 - */ 412 - void (*stopping)(struct task_struct *p, bool runnable); 413 - 414 - /** 415 - * @quiescent: A task is becoming not runnable on its associated CPU 416 - * @p: task becoming not runnable 417 - * @deq_flags: %SCX_DEQ_* 418 - * 419 - * See ->runnable() for explanation on the task state notifiers. 420 - * 421 - * @p is becoming quiescent on the CPU because it's 422 - * 423 - * - sleeping (%SCX_DEQ_SLEEP) 424 - * - being moved to another CPU 425 - * - being temporarily taken off the queue for an attribute change 426 - * (%SCX_DEQ_SAVE) 427 - * 428 - * This and ->dequeue() are related but not coupled. This operation 429 - * notifies @p's state transition and may not be preceded by ->dequeue() 430 - * e.g. when @p is being dispatched to a remote CPU. 
431 - */ 432 - void (*quiescent)(struct task_struct *p, u64 deq_flags); 433 - 434 - /** 435 - * @yield: Yield CPU 436 - * @from: yielding task 437 - * @to: optional yield target task 438 - * 439 - * If @to is NULL, @from is yielding the CPU to other runnable tasks. 440 - * The BPF scheduler should ensure that other available tasks are 441 - * dispatched before the yielding task. Return value is ignored in this 442 - * case. 443 - * 444 - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf 445 - * scheduler can implement the request, return %true; otherwise, %false. 446 - */ 447 - bool (*yield)(struct task_struct *from, struct task_struct *to); 448 - 449 - /** 450 - * @core_sched_before: Task ordering for core-sched 451 - * @a: task A 452 - * @b: task B 453 - * 454 - * Used by core-sched to determine the ordering between two tasks. See 455 - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on 456 - * core-sched. 457 - * 458 - * Both @a and @b are runnable and may or may not currently be queued on 459 - * the BPF scheduler. Should return %true if @a should run before @b. 460 - * %false if there's no required ordering or @b should run before @a. 461 - * 462 - * If not specified, the default is ordering them according to when they 463 - * became runnable. 464 - */ 465 - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); 466 - 467 - /** 468 - * @set_weight: Set task weight 469 - * @p: task to set weight for 470 - * @weight: new weight [1..10000] 471 - * 472 - * Update @p's weight to @weight. 473 - */ 474 - void (*set_weight)(struct task_struct *p, u32 weight); 475 - 476 - /** 477 - * @set_cpumask: Set CPU affinity 478 - * @p: task to set CPU affinity for 479 - * @cpumask: cpumask of cpus that @p can run on 480 - * 481 - * Update @p's CPU affinity to @cpumask. 
482 - */ 483 - void (*set_cpumask)(struct task_struct *p, 484 - const struct cpumask *cpumask); 485 - 486 - /** 487 - * @update_idle: Update the idle state of a CPU 488 - * @cpu: CPU to update the idle state for 489 - * @idle: whether entering or exiting the idle state 490 - * 491 - * This operation is called when @rq's CPU goes or leaves the idle 492 - * state. By default, implementing this operation disables the built-in 493 - * idle CPU tracking and the following helpers become unavailable: 494 - * 495 - * - scx_bpf_select_cpu_dfl() 496 - * - scx_bpf_select_cpu_and() 497 - * - scx_bpf_test_and_clear_cpu_idle() 498 - * - scx_bpf_pick_idle_cpu() 499 - * 500 - * The user also must implement ops.select_cpu() as the default 501 - * implementation relies on scx_bpf_select_cpu_dfl(). 502 - * 503 - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle 504 - * tracking. 505 - */ 506 - void (*update_idle)(s32 cpu, bool idle); 507 - 508 - /** 509 - * @cpu_acquire: A CPU is becoming available to the BPF scheduler 510 - * @cpu: The CPU being acquired by the BPF scheduler. 511 - * @args: Acquire arguments, see the struct definition. 512 - * 513 - * A CPU that was previously released from the BPF scheduler is now once 514 - * again under its control. 515 - */ 516 - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); 517 - 518 - /** 519 - * @cpu_release: A CPU is taken away from the BPF scheduler 520 - * @cpu: The CPU being released by the BPF scheduler. 521 - * @args: Release arguments, see the struct definition. 522 - * 523 - * The specified CPU is no longer under the control of the BPF 524 - * scheduler. This could be because it was preempted by a higher 525 - * priority sched_class, though there may be other reasons as well. The 526 - * caller should consult @args->reason to determine the cause. 
527 - */ 528 - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); 529 - 530 - /** 531 - * @init_task: Initialize a task to run in a BPF scheduler 532 - * @p: task to initialize for BPF scheduling 533 - * @args: init arguments, see the struct definition 534 - * 535 - * Either we're loading a BPF scheduler or a new task is being forked. 536 - * Initialize @p for BPF scheduling. This operation may block and can 537 - * be used for allocations, and is called exactly once for a task. 538 - * 539 - * Return 0 for success, -errno for failure. An error return while 540 - * loading will abort loading of the BPF scheduler. During a fork, it 541 - * will abort that specific fork. 542 - */ 543 - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); 544 - 545 - /** 546 - * @exit_task: Exit a previously-running task from the system 547 - * @p: task to exit 548 - * @args: exit arguments, see the struct definition 549 - * 550 - * @p is exiting or the BPF scheduler is being unloaded. Perform any 551 - * necessary cleanup for @p. 552 - */ 553 - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); 554 - 555 - /** 556 - * @enable: Enable BPF scheduling for a task 557 - * @p: task to enable BPF scheduling for 558 - * 559 - * Enable @p for BPF scheduling. enable() is called on @p any time it 560 - * enters SCX, and is always paired with a matching disable(). 561 - */ 562 - void (*enable)(struct task_struct *p); 563 - 564 - /** 565 - * @disable: Disable BPF scheduling for a task 566 - * @p: task to disable BPF scheduling for 567 - * 568 - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 569 - * Disable BPF scheduling for @p. A disable() call is always matched 570 - * with a prior enable() call. 
571 - */ 572 - void (*disable)(struct task_struct *p); 573 - 574 - /** 575 - * @dump: Dump BPF scheduler state on error 576 - * @ctx: debug dump context 577 - * 578 - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 579 - */ 580 - void (*dump)(struct scx_dump_ctx *ctx); 581 - 582 - /** 583 - * @dump_cpu: Dump BPF scheduler state for a CPU on error 584 - * @ctx: debug dump context 585 - * @cpu: CPU to generate debug dump for 586 - * @idle: @cpu is currently idle without any runnable tasks 587 - * 588 - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 589 - * @cpu. If @idle is %true and this operation doesn't produce any 590 - * output, @cpu is skipped for dump. 591 - */ 592 - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); 593 - 594 - /** 595 - * @dump_task: Dump BPF scheduler state for a runnable task on error 596 - * @ctx: debug dump context 597 - * @p: runnable task to generate debug dump for 598 - * 599 - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 600 - * @p. 601 - */ 602 - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 603 - 604 - #ifdef CONFIG_EXT_GROUP_SCHED 605 - /** 606 - * @cgroup_init: Initialize a cgroup 607 - * @cgrp: cgroup being initialized 608 - * @args: init arguments, see the struct definition 609 - * 610 - * Either the BPF scheduler is being loaded or @cgrp created, initialize 611 - * @cgrp for sched_ext. This operation may block. 612 - * 613 - * Return 0 for success, -errno for failure. An error return while 614 - * loading will abort loading of the BPF scheduler. During cgroup 615 - * creation, it will abort the specific cgroup creation. 616 - */ 617 - s32 (*cgroup_init)(struct cgroup *cgrp, 618 - struct scx_cgroup_init_args *args); 619 - 620 - /** 621 - * @cgroup_exit: Exit a cgroup 622 - * @cgrp: cgroup being exited 623 - * 624 - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit 625 - * @cgrp for sched_ext. 
This operation may block. 626 - */ 627 - void (*cgroup_exit)(struct cgroup *cgrp); 628 - 629 - /** 630 - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup 631 - * @p: task being moved 632 - * @from: cgroup @p is being moved from 633 - * @to: cgroup @p is being moved to 634 - * 635 - * Prepare @p for move from cgroup @from to @to. This operation may 636 - * block and can be used for allocations. 637 - * 638 - * Return 0 for success, -errno for failure. An error return aborts the 639 - * migration. 640 - */ 641 - s32 (*cgroup_prep_move)(struct task_struct *p, 642 - struct cgroup *from, struct cgroup *to); 643 - 644 - /** 645 - * @cgroup_move: Commit cgroup move 646 - * @p: task being moved 647 - * @from: cgroup @p is being moved from 648 - * @to: cgroup @p is being moved to 649 - * 650 - * Commit the move. @p is dequeued during this operation. 651 - */ 652 - void (*cgroup_move)(struct task_struct *p, 653 - struct cgroup *from, struct cgroup *to); 654 - 655 - /** 656 - * @cgroup_cancel_move: Cancel cgroup move 657 - * @p: task whose cgroup move is being canceled 658 - * @from: cgroup @p was being moved from 659 - * @to: cgroup @p was being moved to 660 - * 661 - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). 662 - * Undo the preparation. 663 - */ 664 - void (*cgroup_cancel_move)(struct task_struct *p, 665 - struct cgroup *from, struct cgroup *to); 666 - 667 - /** 668 - * @cgroup_set_weight: A cgroup's weight is being changed 669 - * @cgrp: cgroup whose weight is being updated 670 - * @weight: new weight [1..10000] 671 - * 672 - * Update @cgrp's weight to @weight. 
673 - */ 674 - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); 675 - 676 - /** 677 - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed 678 - * @cgrp: cgroup whose bandwidth is being updated 679 - * @period_us: bandwidth control period 680 - * @quota_us: bandwidth control quota 681 - * @burst_us: bandwidth control burst 682 - * 683 - * Update @cgrp's bandwidth control parameters. This is from the cpu.max 684 - * cgroup interface. 685 - * 686 - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled 687 - * to. For example, if @period_us is 1_000_000 and @quota_us is 688 - * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be 689 - * interpreted in the same fashion and specifies how much @cgrp can 690 - * burst temporarily. The specific control mechanism and thus the 691 - * interpretation of @period_us and burstiness is up to the BPF 692 - * scheduler. 693 - */ 694 - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, 695 - u64 period_us, u64 quota_us, u64 burst_us); 696 - 697 - #endif /* CONFIG_EXT_GROUP_SCHED */ 698 - 699 - /* 700 - * All online ops must come before ops.cpu_online(). 701 - */ 702 - 703 - /** 704 - * @cpu_online: A CPU became online 705 - * @cpu: CPU which just came up 706 - * 707 - * @cpu just came online. @cpu will not call ops.enqueue() or 708 - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. 709 - */ 710 - void (*cpu_online)(s32 cpu); 711 - 712 - /** 713 - * @cpu_offline: A CPU is going offline 714 - * @cpu: CPU which is going offline 715 - * 716 - * @cpu is going offline. @cpu will not call ops.enqueue() or 717 - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. 718 - */ 719 - void (*cpu_offline)(s32 cpu); 720 - 721 - /* 722 - * All CPU hotplug ops must come before ops.init(). 
723 - */ 724 - 725 - /** 726 - * @init: Initialize the BPF scheduler 727 - */ 728 - s32 (*init)(void); 729 - 730 - /** 731 - * @exit: Clean up after the BPF scheduler 732 - * @info: Exit info 733 - * 734 - * ops.exit() is also called on ops.init() failure, which is a bit 735 - * unusual. This is to allow rich reporting through @info on how 736 - * ops.init() failed. 737 - */ 738 - void (*exit)(struct scx_exit_info *info); 739 - 740 - /** 741 - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch 742 - */ 743 - u32 dispatch_max_batch; 744 - 745 - /** 746 - * @flags: %SCX_OPS_* flags 747 - */ 748 - u64 flags; 749 - 750 - /** 751 - * @timeout_ms: The maximum amount of time, in milliseconds, that a 752 - * runnable task should be able to wait before being scheduled. The 753 - * maximum timeout may not exceed the default timeout of 30 seconds. 754 - * 755 - * Defaults to the maximum allowed timeout value of 30 seconds. 756 - */ 757 - u32 timeout_ms; 758 - 759 - /** 760 - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default 761 - * value of 32768 is used. 762 - */ 763 - u32 exit_dump_len; 764 - 765 - /** 766 - * @hotplug_seq: A sequence number that may be set by the scheduler to 767 - * detect when a hotplug event has occurred during the loading process. 768 - * If 0, no detection occurs. Otherwise, the scheduler will fail to 769 - * load if the sequence number does not match @scx_hotplug_seq on the 770 - * enable path. 771 - */ 772 - u64 hotplug_seq; 773 - 774 - /** 775 - * @name: BPF scheduler's name 776 - * 777 - * Must be a non-zero valid BPF object name including only isalnum(), 778 - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 779 - * BPF scheduler is enabled. 
780 - */ 781 - char name[SCX_OPS_NAME_LEN]; 782 - 783 - /* internal use only, must be NULL */ 784 - void *priv; 785 - }; 786 - 787 - enum scx_opi { 788 - SCX_OPI_BEGIN = 0, 789 - SCX_OPI_NORMAL_BEGIN = 0, 790 - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), 791 - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), 792 - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), 793 - SCX_OPI_END = SCX_OP_IDX(init), 794 - }; 795 - 796 - /* 797 - * Collection of event counters. Event types are placed in descending order. 798 - */ 799 - struct scx_event_stats { 800 - /* 801 - * If ops.select_cpu() returns a CPU which can't be used by the task, 802 - * the core scheduler code silently picks a fallback CPU. 803 - */ 804 - s64 SCX_EV_SELECT_CPU_FALLBACK; 805 - 806 - /* 807 - * When dispatching to a local DSQ, the CPU may have gone offline in 808 - * the meantime. In this case, the task is bounced to the global DSQ. 809 - */ 810 - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; 811 - 812 - /* 813 - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task 814 - * continued to run because there were no other tasks on the CPU. 815 - */ 816 - s64 SCX_EV_DISPATCH_KEEP_LAST; 817 - 818 - /* 819 - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task 820 - * is dispatched to a local DSQ when exiting. 821 - */ 822 - s64 SCX_EV_ENQ_SKIP_EXITING; 823 - 824 - /* 825 - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a 826 - * migration disabled task skips ops.enqueue() and is dispatched to its 827 - * local DSQ. 828 - */ 829 - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; 830 - 831 - /* 832 - * Total number of times a task's time slice was refilled with the 833 - * default value (SCX_SLICE_DFL). 834 - */ 835 - s64 SCX_EV_REFILL_SLICE_DFL; 836 - 837 - /* 838 - * The total duration of bypass modes in nanoseconds. 839 - */ 840 - s64 SCX_EV_BYPASS_DURATION; 841 - 842 - /* 843 - * The number of tasks dispatched in the bypassing mode. 
844 - */ 845 - s64 SCX_EV_BYPASS_DISPATCH; 846 - 847 - /* 848 - * The number of times the bypassing mode has been activated. 849 - */ 850 - s64 SCX_EV_BYPASS_ACTIVATE; 851 - }; 852 - 853 - struct scx_sched { 854 - struct sched_ext_ops ops; 855 - DECLARE_BITMAP(has_op, SCX_OPI_END); 856 - 857 - /* 858 - * Dispatch queues. 859 - * 860 - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. 861 - * This is to avoid live-locking in bypass mode where all tasks are 862 - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If 863 - * per-node split isn't sufficient, it can be further split. 864 - */ 865 - struct rhashtable dsq_hash; 866 - struct scx_dispatch_q **global_dsqs; 867 - 868 - /* 869 - * The event counters are in a per-CPU variable to minimize the 870 - * accounting overhead. A system-wide view on the event counter is 871 - * constructed when requested by scx_bpf_events(). 872 - */ 873 - struct scx_event_stats __percpu *event_stats_cpu; 874 - 875 - bool warned_zero_slice; 876 - 877 - atomic_t exit_kind; 878 - struct scx_exit_info *exit_info; 879 - 880 - struct kobject kobj; 881 - 882 - struct kthread_worker *helper; 883 - struct irq_work error_irq_work; 884 - struct kthread_work disable_work; 885 - struct rcu_work rcu_work; 886 - }; 887 - 888 - enum scx_wake_flags { 889 - /* expose select WF_* flags as enums */ 890 - SCX_WAKE_FORK = WF_FORK, 891 - SCX_WAKE_TTWU = WF_TTWU, 892 - SCX_WAKE_SYNC = WF_SYNC, 893 - }; 894 - 895 - enum scx_enq_flags { 896 - /* expose select ENQUEUE_* flags as enums */ 897 - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 898 - SCX_ENQ_HEAD = ENQUEUE_HEAD, 899 - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, 900 - 901 - /* high 32bits are SCX specific */ 902 - 903 - /* 904 - * Set the following to trigger preemption when calling 905 - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the 906 - * current task is cleared to zero and the CPU is kicked into the 907 - * scheduling path. Implies %SCX_ENQ_HEAD. 
908 - */ 909 - SCX_ENQ_PREEMPT = 1LLU << 32, 910 - 911 - /* 912 - * The task being enqueued was previously enqueued on the current CPU's 913 - * %SCX_DSQ_LOCAL, but was removed from it in a call to the 914 - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was 915 - * invoked in a ->cpu_release() callback, and the task is again 916 - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the 917 - * task will not be scheduled on the CPU until at least the next invocation 918 - * of the ->cpu_acquire() callback. 919 - */ 920 - SCX_ENQ_REENQ = 1LLU << 40, 921 - 922 - /* 923 - * The task being enqueued is the only task available for the cpu. By 924 - * default, ext core keeps executing such tasks but when 925 - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the 926 - * %SCX_ENQ_LAST flag set. 927 - * 928 - * The BPF scheduler is responsible for triggering a follow-up 929 - * scheduling event. Otherwise, Execution may stall. 930 - */ 931 - SCX_ENQ_LAST = 1LLU << 41, 932 - 933 - /* high 8 bits are internal */ 934 - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, 935 - 936 - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 937 - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 938 - }; 939 - 940 - enum scx_deq_flags { 941 - /* expose select DEQUEUE_* flags as enums */ 942 - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, 943 - 944 - /* high 32bits are SCX specific */ 945 - 946 - /* 947 - * The generic core-sched layer decided to execute the task even though 948 - * it hasn't been dispatched yet. Dequeue from the BPF side. 949 - */ 950 - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, 951 - }; 952 - 953 - enum scx_pick_idle_cpu_flags { 954 - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ 955 - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ 956 - }; 957 - 958 - enum scx_kick_flags { 959 - /* 960 - * Kick the target CPU if idle. Guarantees that the target CPU goes 961 - * through at least one full scheduling cycle before going idle. 
If the 962 - * target CPU can be determined to be currently not idle and going to go 963 - * through a scheduling cycle before going idle, noop. 964 - */ 965 - SCX_KICK_IDLE = 1LLU << 0, 966 - 967 - /* 968 - * Preempt the current task and execute the dispatch path. If the 969 - * current task of the target CPU is an SCX task, its ->scx.slice is 970 - * cleared to zero before the scheduling path is invoked so that the 971 - * task expires and the dispatch path is invoked. 972 - */ 973 - SCX_KICK_PREEMPT = 1LLU << 1, 974 - 975 - /* 976 - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will 977 - * return after the target CPU finishes picking the next task. 978 - */ 979 - SCX_KICK_WAIT = 1LLU << 2, 980 - }; 981 - 982 - enum scx_tg_flags { 983 - SCX_TG_ONLINE = 1U << 0, 984 - SCX_TG_INITED = 1U << 1, 985 - }; 986 - 987 - enum scx_enable_state { 988 - SCX_ENABLING, 989 - SCX_ENABLED, 990 - SCX_DISABLING, 991 - SCX_DISABLED, 992 - }; 993 - 994 - static const char *scx_enable_state_str[] = { 995 - [SCX_ENABLING] = "enabling", 996 - [SCX_ENABLED] = "enabled", 997 - [SCX_DISABLING] = "disabling", 998 - [SCX_DISABLED] = "disabled", 999 - }; 1000 - 1001 - /* 1002 - * sched_ext_entity->ops_state 1003 - * 1004 - * Used to track the task ownership between the SCX core and the BPF scheduler. 1005 - * State transitions look as follows: 1006 - * 1007 - * NONE -> QUEUEING -> QUEUED -> DISPATCHING 1008 - * ^ | | 1009 - * | v v 1010 - * \-------------------------------/ 1011 - * 1012 - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call 1013 - * sites for explanations on the conditions being waited upon and why they are 1014 - * safe. Transitions out of them into NONE or QUEUED must store_release and the 1015 - * waiters should load_acquire. 
1016 - * 1017 - * Tracking scx_ops_state enables sched_ext core to reliably determine whether 1018 - * any given task can be dispatched by the BPF scheduler at all times and thus 1019 - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler 1020 - * to try to dispatch any task anytime regardless of its state as the SCX core 1021 - * can safely reject invalid dispatches. 1022 - */ 1023 - enum scx_ops_state { 1024 - SCX_OPSS_NONE, /* owned by the SCX core */ 1025 - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ 1026 - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ 1027 - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ 1028 - 1029 - /* 1030 - * QSEQ brands each QUEUED instance so that, when dispatch races 1031 - * dequeue/requeue, the dispatcher can tell whether it still has a claim 1032 - * on the task being dispatched. 1033 - * 1034 - * As some 32bit archs can't do 64bit store_release/load_acquire, 1035 - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on 1036 - * 32bit machines. The dispatch race window QSEQ protects is very narrow 1037 - * and runs with IRQ disabled. 30 bits should be sufficient. 1038 - */ 1039 - SCX_OPSS_QSEQ_SHIFT = 2, 1040 - }; 1041 - 1042 - /* Use macros to ensure that the type is unsigned long for the masks */ 1043 - #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 1044 - #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 1045 - 1046 12 /* 1047 13 * NOTE: sched_ext is in the process of growing multiple scheduler support and 1048 14 * scx_root usage is in a transitional state. 
Naked dereferences are safe if the ··· 136 1170 #include <trace/events/sched_ext.h> 137 1171 138 1172 static void process_ddsp_deferred_locals(struct rq *rq); 139 - static void scx_bpf_kick_cpu(s32 cpu, u64 flags); 1173 + static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 140 1174 static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 141 1175 s64 exit_code, const char *fmt, va_list args); 142 1176 ··· 151 1185 va_end(args); 152 1186 } 153 1187 154 - static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, 155 - const char *fmt, ...) 156 - { 157 - struct scx_sched *sch; 158 - va_list args; 159 - 160 - rcu_read_lock(); 161 - sch = rcu_dereference(scx_root); 162 - if (sch) { 163 - va_start(args, fmt); 164 - scx_vexit(sch, kind, exit_code, fmt, args); 165 - va_end(args); 166 - } 167 - rcu_read_unlock(); 168 - } 169 - 170 1188 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 171 - #define scx_kf_error(fmt, args...) 
scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) 172 1189 173 1190 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 174 1191 ··· 181 1232 return (s32)(a - b) < 0; 182 1233 } 183 1234 184 - static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) 1235 + static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, 1236 + struct task_struct *p) 185 1237 { 186 - struct scx_sched *sch = scx_root; 187 - 188 1238 return sch->global_dsqs[cpu_to_node(task_cpu(p))]; 189 1239 } 190 1240 ··· 311 1363 }) 312 1364 313 1365 /* @mask is constant, always inline to cull unnecessary branches */ 314 - static __always_inline bool scx_kf_allowed(u32 mask) 1366 + static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) 315 1367 { 316 1368 if (unlikely(!(current->scx.kf_mask & mask))) { 317 - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", 318 - mask, current->scx.kf_mask); 1369 + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", 1370 + mask, current->scx.kf_mask); 319 1371 return false; 320 1372 } 321 1373 ··· 328 1380 */ 329 1381 if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && 330 1382 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { 331 - scx_kf_error("cpu_release kfunc called from a nested operation"); 1383 + scx_error(sch, "cpu_release kfunc called from a nested operation"); 332 1384 return false; 333 1385 } 334 1386 335 1387 if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && 336 1388 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { 337 - scx_kf_error("dispatch kfunc called from a nested operation"); 1389 + scx_error(sch, "dispatch kfunc called from a nested operation"); 338 1390 return false; 339 1391 } 340 1392 ··· 342 1394 } 343 1395 344 1396 /* see SCX_CALL_OP_TASK() */ 345 - static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, 1397 + static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, 1398 + 
u32 mask, 346 1399 struct task_struct *p) 347 1400 { 348 - if (!scx_kf_allowed(mask)) 1401 + if (!scx_kf_allowed(sch, mask)) 349 1402 return false; 350 1403 351 1404 if (unlikely((p != current->scx.kf_tasks[0] && 352 1405 p != current->scx.kf_tasks[1]))) { 353 - scx_kf_error("called on a task not being operated on"); 1406 + scx_error(sch, "called on a task not being operated on"); 354 1407 return false; 355 1408 } 356 1409 ··· 437 1488 */ 438 1489 struct scx_task_iter { 439 1490 struct sched_ext_entity cursor; 440 - struct task_struct *locked; 1491 + struct task_struct *locked_task; 441 1492 struct rq *rq; 442 1493 struct rq_flags rf; 443 1494 u32 cnt; 1495 + bool list_locked; 444 1496 }; 445 1497 446 1498 /** ··· 469 1519 470 1520 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 471 1521 list_add(&iter->cursor.tasks_node, &scx_tasks); 472 - iter->locked = NULL; 1522 + iter->locked_task = NULL; 473 1523 iter->cnt = 0; 1524 + iter->list_locked = true; 474 1525 } 475 1526 476 1527 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) 477 1528 { 478 - if (iter->locked) { 479 - task_rq_unlock(iter->rq, iter->locked, &iter->rf); 480 - iter->locked = NULL; 1529 + if (iter->locked_task) { 1530 + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); 1531 + iter->locked_task = NULL; 481 1532 } 482 1533 } 483 1534 ··· 488 1537 * 489 1538 * If @iter is in the middle of a locked iteration, it may be locking the rq of 490 1539 * the task currently being visited in addition to scx_tasks_lock. Unlock both. 491 - * This function can be safely called anytime during an iteration. 1540 + * This function can be safely called anytime during an iteration. The next 1541 + * iterator operation will automatically restore the necessary locking. 
492 1542 */ 493 1543 static void scx_task_iter_unlock(struct scx_task_iter *iter) 494 1544 { 495 1545 __scx_task_iter_rq_unlock(iter); 496 - spin_unlock_irq(&scx_tasks_lock); 1546 + if (iter->list_locked) { 1547 + iter->list_locked = false; 1548 + spin_unlock_irq(&scx_tasks_lock); 1549 + } 497 1550 } 498 1551 499 - /** 500 - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() 501 - * @iter: iterator to re-lock 502 - * 503 - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it 504 - * doesn't re-lock the rq lock. Must be called before other iterator operations. 505 - */ 506 - static void scx_task_iter_relock(struct scx_task_iter *iter) 1552 + static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) 507 1553 { 508 - spin_lock_irq(&scx_tasks_lock); 1554 + if (!iter->list_locked) { 1555 + spin_lock_irq(&scx_tasks_lock); 1556 + iter->list_locked = true; 1557 + } 509 1558 } 510 1559 511 1560 /** ··· 518 1567 */ 519 1568 static void scx_task_iter_stop(struct scx_task_iter *iter) 520 1569 { 1570 + __scx_task_iter_maybe_relock(iter); 521 1571 list_del_init(&iter->cursor.tasks_node); 522 1572 scx_task_iter_unlock(iter); 523 1573 } ··· 536 1584 struct list_head *cursor = &iter->cursor.tasks_node; 537 1585 struct sched_ext_entity *pos; 538 1586 1587 + __scx_task_iter_maybe_relock(iter); 1588 + 539 1589 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 540 1590 scx_task_iter_unlock(iter); 541 1591 cond_resched(); 542 - scx_task_iter_relock(iter); 1592 + __scx_task_iter_maybe_relock(iter); 543 1593 } 544 1594 545 1595 list_for_each_entry(pos, cursor, tasks_node) { ··· 604 1650 return NULL; 605 1651 606 1652 iter->rq = task_rq_lock(p, &iter->rf); 607 - iter->locked = p; 1653 + iter->locked_task = p; 608 1654 609 1655 return p; 610 1656 } ··· 618 1664 * This can be used when preemption is not disabled. 
619 1665 */ 620 1666 #define scx_add_event(sch, name, cnt) do { \ 621 - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ 1667 + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 622 1668 trace_sched_ext_event(#name, (cnt)); \ 623 1669 } while(0) 624 1670 ··· 631 1677 * This should be used only when preemption is disabled. 632 1678 */ 633 1679 #define __scx_add_event(sch, name, cnt) do { \ 634 - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ 1680 + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ 635 1681 trace_sched_ext_event(#name, cnt); \ 636 1682 } while(0) 637 1683 ··· 715 1761 return true; 716 1762 } else { 717 1763 scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); 718 - return false; 719 - } 720 - } 721 - 722 - /** 723 - * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args 724 - * @cpu: cpu number which came from a BPF ops 725 - * @where: extra information reported on error 726 - * 727 - * The same as ops_cpu_valid() but @sch is implicit. 728 - */ 729 - static bool kf_cpu_valid(u32 cpu, const char *where) 730 - { 731 - if (__cpu_valid(cpu)) { 732 - return true; 733 - } else { 734 - scx_kf_error("invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); 735 1764 return false; 736 1765 } 737 1766 } ··· 879 1942 WRITE_ONCE(dsq->nr, dsq->nr + delta); 880 1943 } 881 1944 882 - static void refill_task_slice_dfl(struct task_struct *p) 1945 + static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 883 1946 { 884 1947 p->scx.slice = SCX_SLICE_DFL; 885 - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); 1948 + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 886 1949 } 887 1950 888 1951 static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, ··· 900 1963 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 901 1964 /* fall back to the global dsq */ 902 1965 raw_spin_unlock(&dsq->lock); 903 - dsq = find_global_dsq(p); 1966 + dsq = find_global_dsq(sch, p); 904 1967 raw_spin_lock(&dsq->lock); 905 1968 } 906 1969 } ··· 1079 2142 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1080 2143 1081 2144 if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1082 - return find_global_dsq(p); 2145 + return find_global_dsq(sch, p); 1083 2146 1084 2147 return &cpu_rq(cpu)->scx.local_dsq; 1085 2148 } 1086 2149 1087 2150 if (dsq_id == SCX_DSQ_GLOBAL) 1088 - dsq = find_global_dsq(p); 2151 + dsq = find_global_dsq(sch, p); 1089 2152 else 1090 2153 dsq = find_user_dsq(sch, dsq_id); 1091 2154 1092 2155 if (unlikely(!dsq)) { 1093 2156 scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", 1094 2157 dsq_id, p->comm, p->pid); 1095 - return find_global_dsq(p); 2158 + return find_global_dsq(sch, p); 1096 2159 } 1097 2160 1098 2161 return dsq; 1099 2162 } 1100 2163 1101 - static void mark_direct_dispatch(struct task_struct *ddsp_task, 2164 + static void mark_direct_dispatch(struct scx_sched *sch, 2165 + struct task_struct *ddsp_task, 1102 2166 struct task_struct *p, u64 dsq_id, 1103 2167 u64 enq_flags) 1104 2168 { ··· 1113 2175 /* @p must match the task on the enqueue path */ 1114 2176 if (unlikely(p != ddsp_task)) { 1115 2177 if (IS_ERR(ddsp_task)) 1116 - 
scx_kf_error("%s[%d] already direct-dispatched", 2178 + scx_error(sch, "%s[%d] already direct-dispatched", 1117 2179 p->comm, p->pid); 1118 2180 else 1119 - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 2181 + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", 1120 2182 ddsp_task->comm, ddsp_task->pid, 1121 2183 p->comm, p->pid); 1122 2184 return; ··· 1271 2333 * higher priority it becomes from scx_prio_less()'s POV. 1272 2334 */ 1273 2335 touch_core_sched(rq, p); 1274 - refill_task_slice_dfl(p); 2336 + refill_task_slice_dfl(sch, p); 1275 2337 local_norefill: 1276 2338 dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); 1277 2339 return; 1278 2340 1279 2341 global: 1280 2342 touch_core_sched(rq, p); /* see the comment in local: */ 1281 - refill_task_slice_dfl(p); 1282 - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); 2343 + refill_task_slice_dfl(sch, p); 2344 + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); 1283 2345 } 1284 2346 1285 2347 static bool task_runnable(const struct task_struct *p) ··· 1589 2651 1590 2652 if (!scx_rq_online(rq)) { 1591 2653 if (enforce) 1592 - __scx_add_event(scx_root, 1593 - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2654 + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 1594 2655 return false; 1595 2656 } 1596 2657 ··· 1691 2754 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 1692 2755 if (src_rq != dst_rq && 1693 2756 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 1694 - dst_dsq = find_global_dsq(p); 2757 + dst_dsq = find_global_dsq(sch, p); 1695 2758 dst_rq = src_rq; 1696 2759 } 1697 2760 } else { ··· 1847 2910 1848 2911 if (src_rq != dst_rq && 1849 2912 unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { 1850 - dispatch_enqueue(sch, find_global_dsq(p), p, 2913 + dispatch_enqueue(sch, find_global_dsq(sch, p), p, 1851 2914 enq_flags | SCX_ENQ_CLEAR_OPSS); 1852 2915 return; 1853 2916 } ··· 2092 3155 * 
balance(), we want to complete this scheduling cycle and then 2093 3156 * start a new one. IOW, we want to call resched_curr() on the 2094 3157 * next, most likely idle, task, not the current one. Use 2095 - * scx_bpf_kick_cpu() for deferred kicking. 3158 + * scx_kick_cpu() for deferred kicking. 2096 3159 */ 2097 3160 if (unlikely(!--nr_loops)) { 2098 - scx_bpf_kick_cpu(cpu_of(rq), 0); 3161 + scx_kick_cpu(sch, cpu_of(rq), 0); 2099 3162 break; 2100 3163 } 2101 3164 } while (dspc->nr_tasks); ··· 2379 3442 if (keep_prev) { 2380 3443 p = prev; 2381 3444 if (!p->scx.slice) 2382 - refill_task_slice_dfl(p); 3445 + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); 2383 3446 } else { 2384 3447 p = first_local_task(rq); 2385 3448 if (!p) { 2386 3449 if (kick_idle) 2387 - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); 3450 + scx_kick_cpu(rcu_dereference_sched(scx_root), 3451 + cpu_of(rq), SCX_KICK_IDLE); 2388 3452 return NULL; 2389 3453 } 2390 3454 2391 3455 if (unlikely(!p->scx.slice)) { 2392 - struct scx_sched *sch = scx_root; 3456 + struct scx_sched *sch = rcu_dereference_sched(scx_root); 2393 3457 2394 3458 if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { 2395 3459 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", 2396 3460 p->comm, p->pid, __func__); 2397 3461 sch->warned_zero_slice = true; 2398 3462 } 2399 - refill_task_slice_dfl(p); 3463 + refill_task_slice_dfl(sch, p); 2400 3464 } 2401 3465 } 2402 3466 ··· 2486 3548 2487 3549 cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); 2488 3550 if (cpu >= 0) { 2489 - refill_task_slice_dfl(p); 3551 + refill_task_slice_dfl(sch, p); 2490 3552 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 2491 3553 } else { 2492 3554 cpu = prev_cpu; ··· 3022 4084 3023 4085 #ifdef CONFIG_EXT_GROUP_SCHED 3024 4086 3025 - DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); 4087 + DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); 3026 4088 static bool scx_cgroup_enabled; 3027 4089 3028 4090 void scx_tg_init(struct 
task_group *tg) ··· 3038 4100 int ret = 0; 3039 4101 3040 4102 WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 3041 - 3042 - percpu_down_read(&scx_cgroup_rwsem); 3043 4103 3044 4104 if (scx_cgroup_enabled) { 3045 4105 if (SCX_HAS_OP(sch, cgroup_init)) { ··· 3058 4122 tg->scx.flags |= SCX_TG_ONLINE; 3059 4123 } 3060 4124 3061 - percpu_up_read(&scx_cgroup_rwsem); 3062 4125 return ret; 3063 4126 } 3064 4127 ··· 3067 4132 3068 4133 WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); 3069 4134 3070 - percpu_down_read(&scx_cgroup_rwsem); 3071 - 3072 4135 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && 3073 4136 (tg->scx.flags & SCX_TG_INITED)) 3074 4137 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, 3075 4138 tg->css.cgroup); 3076 4139 tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 3077 - 3078 - percpu_up_read(&scx_cgroup_rwsem); 3079 4140 } 3080 4141 3081 4142 int scx_cgroup_can_attach(struct cgroup_taskset *tset) ··· 3080 4149 struct cgroup_subsys_state *css; 3081 4150 struct task_struct *p; 3082 4151 int ret; 3083 - 3084 - /* released in scx_finish/cancel_attach() */ 3085 - percpu_down_read(&scx_cgroup_rwsem); 3086 4152 3087 4153 if (!scx_cgroup_enabled) 3088 4154 return 0; ··· 3120 4192 p->scx.cgrp_moving_from = NULL; 3121 4193 } 3122 4194 3123 - percpu_up_read(&scx_cgroup_rwsem); 3124 4195 return ops_sanitize_err(sch, "cgroup_prep_move", ret); 3125 4196 } 3126 4197 ··· 3142 4215 p->scx.cgrp_moving_from = NULL; 3143 4216 } 3144 4217 3145 - void scx_cgroup_finish_attach(void) 3146 - { 3147 - percpu_up_read(&scx_cgroup_rwsem); 3148 - } 3149 - 3150 4218 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 3151 4219 { 3152 4220 struct scx_sched *sch = scx_root; ··· 3149 4227 struct task_struct *p; 3150 4228 3151 4229 if (!scx_cgroup_enabled) 3152 - goto out_unlock; 4230 + return; 3153 4231 3154 4232 cgroup_taskset_for_each(p, css, tset) { 3155 4233 if (SCX_HAS_OP(sch, cgroup_cancel_move) && ··· 3158 4236 p, p->scx.cgrp_moving_from, 
css->cgroup); 3159 4237 p->scx.cgrp_moving_from = NULL; 3160 4238 } 3161 - out_unlock: 3162 - percpu_up_read(&scx_cgroup_rwsem); 3163 4239 } 3164 4240 3165 4241 void scx_group_set_weight(struct task_group *tg, unsigned long weight) 3166 4242 { 3167 4243 struct scx_sched *sch = scx_root; 3168 4244 3169 - percpu_down_read(&scx_cgroup_rwsem); 4245 + percpu_down_read(&scx_cgroup_ops_rwsem); 3170 4246 3171 4247 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && 3172 4248 tg->scx.weight != weight) ··· 3173 4253 3174 4254 tg->scx.weight = weight; 3175 4255 3176 - percpu_up_read(&scx_cgroup_rwsem); 4256 + percpu_up_read(&scx_cgroup_ops_rwsem); 3177 4257 } 3178 4258 3179 4259 void scx_group_set_idle(struct task_group *tg, bool idle) ··· 3186 4266 { 3187 4267 struct scx_sched *sch = scx_root; 3188 4268 3189 - percpu_down_read(&scx_cgroup_rwsem); 4269 + percpu_down_read(&scx_cgroup_ops_rwsem); 3190 4270 3191 4271 if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && 3192 4272 (tg->scx.bw_period_us != period_us || ··· 3199 4279 tg->scx.bw_quota_us = quota_us; 3200 4280 tg->scx.bw_burst_us = burst_us; 3201 4281 3202 - percpu_up_read(&scx_cgroup_rwsem); 4282 + percpu_up_read(&scx_cgroup_ops_rwsem); 3203 4283 } 3204 4284 3205 4285 static void scx_cgroup_lock(void) 3206 4286 { 3207 - percpu_down_write(&scx_cgroup_rwsem); 4287 + percpu_down_write(&scx_cgroup_ops_rwsem); 4288 + cgroup_lock(); 3208 4289 } 3209 4290 3210 4291 static void scx_cgroup_unlock(void) 3211 4292 { 3212 - percpu_up_write(&scx_cgroup_rwsem); 4293 + cgroup_unlock(); 4294 + percpu_up_write(&scx_cgroup_ops_rwsem); 3213 4295 } 3214 4296 3215 4297 #else /* CONFIG_EXT_GROUP_SCHED */ 3216 4298 3217 - static inline void scx_cgroup_lock(void) {} 3218 - static inline void scx_cgroup_unlock(void) {} 4299 + static void scx_cgroup_lock(void) {} 4300 + static void scx_cgroup_unlock(void) {} 3219 4301 3220 4302 #endif /* CONFIG_EXT_GROUP_SCHED */ 3221 4303 ··· 3333 4411 { 3334 4412 struct 
cgroup_subsys_state *css; 3335 4413 3336 - percpu_rwsem_assert_held(&scx_cgroup_rwsem); 3337 - 3338 4414 scx_cgroup_enabled = false; 3339 4415 3340 4416 /* 3341 - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4417 + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 3342 4418 * cgroups and exit all the inited ones, all online cgroups are exited. 3343 4419 */ 3344 - rcu_read_lock(); 3345 4420 css_for_each_descendant_post(css, &root_task_group.css) { 3346 4421 struct task_group *tg = css_tg(css); 3347 4422 ··· 3349 4430 if (!sch->ops.cgroup_exit) 3350 4431 continue; 3351 4432 3352 - if (WARN_ON_ONCE(!css_tryget(css))) 3353 - continue; 3354 - rcu_read_unlock(); 3355 - 3356 4433 SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, 3357 4434 css->cgroup); 3358 - 3359 - rcu_read_lock(); 3360 - css_put(css); 3361 4435 } 3362 - rcu_read_unlock(); 3363 4436 } 3364 4437 3365 4438 static int scx_cgroup_init(struct scx_sched *sch) ··· 3359 4448 struct cgroup_subsys_state *css; 3360 4449 int ret; 3361 4450 3362 - percpu_rwsem_assert_held(&scx_cgroup_rwsem); 3363 - 3364 4451 /* 3365 - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4452 + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk 3366 4453 * cgroups and init, all online cgroups are initialized. 
3367 4454 */ 3368 - rcu_read_lock(); 3369 4455 css_for_each_descendant_pre(css, &root_task_group.css) { 3370 4456 struct task_group *tg = css_tg(css); 3371 4457 struct scx_cgroup_init_args args = { ··· 3381 4473 continue; 3382 4474 } 3383 4475 3384 - if (WARN_ON_ONCE(!css_tryget(css))) 3385 - continue; 3386 - rcu_read_unlock(); 3387 - 3388 4476 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, 3389 4477 css->cgroup, &args); 3390 4478 if (ret) { ··· 3389 4485 return ret; 3390 4486 } 3391 4487 tg->scx.flags |= SCX_TG_INITED; 3392 - 3393 - rcu_read_lock(); 3394 - css_put(css); 3395 4488 } 3396 - rcu_read_unlock(); 3397 4489 3398 4490 WARN_ON_ONCE(scx_cgroup_enabled); 3399 4491 scx_cgroup_enabled = true; ··· 3472 4572 int node; 3473 4573 3474 4574 kthread_stop(sch->helper->task); 3475 - free_percpu(sch->event_stats_cpu); 4575 + free_percpu(sch->pcpu); 3476 4576 3477 4577 for_each_node_state(node, N_POSSIBLE) 3478 4578 kfree(sch->global_dsqs[node]); ··· 3571 4671 3572 4672 bool scx_allow_ttwu_queue(const struct task_struct *p) 3573 4673 { 3574 - return !scx_enabled() || 3575 - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || 3576 - p->sched_class != &ext_sched_class; 4674 + struct scx_sched *sch; 4675 + 4676 + if (!scx_enabled()) 4677 + return true; 4678 + 4679 + sch = rcu_dereference_sched(scx_root); 4680 + if (unlikely(!sch)) 4681 + return true; 4682 + 4683 + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 4684 + return true; 4685 + 4686 + if (unlikely(p->sched_class != &ext_sched_class)) 4687 + return true; 4688 + 4689 + return false; 3577 4690 } 3578 4691 3579 4692 /** ··· 3702 4789 * 3703 4790 * - pick_next_task() suppresses zero slice warning. 3704 4791 * 3705 - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM 4792 + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM 3706 4793 * operations. 3707 4794 * 3708 4795 * - scx_prio_less() reverts to the default core_sched_at order. 
··· 4147 5234 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); 4148 5235 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", 4149 5236 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); 4150 - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); 5237 + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), 5238 + p->migration_disabled); 4151 5239 4152 5240 if (SCX_HAS_OP(sch, dump_task)) { 4153 5241 ops_dump_init(s, " "); ··· 4387 5473 sch->global_dsqs[node] = dsq; 4388 5474 } 4389 5475 4390 - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); 4391 - if (!sch->event_stats_cpu) 5476 + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); 5477 + if (!sch->pcpu) 4392 5478 goto err_free_gdsqs; 4393 5479 4394 5480 sch->helper = kthread_run_worker(0, "sched_ext_helper"); 4395 5481 if (!sch->helper) 4396 - goto err_free_event_stats; 5482 + goto err_free_pcpu; 4397 5483 sched_set_fifo(sch->helper->task); 4398 5484 4399 5485 atomic_set(&sch->exit_kind, SCX_EXIT_NONE); ··· 4411 5497 4412 5498 err_stop_helper: 4413 5499 kthread_stop(sch->helper->task); 4414 - err_free_event_stats: 4415 - free_percpu(sch->event_stats_cpu); 5500 + err_free_pcpu: 5501 + free_percpu(sch->pcpu); 4416 5502 err_free_gdsqs: 4417 5503 for_each_node_state(node, N_POSSIBLE) 4418 5504 kfree(sch->global_dsqs[node]); ··· 4535 5621 scx_error(sch, "ops.init() failed (%d)", ret); 4536 5622 goto err_disable; 4537 5623 } 5624 + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; 4538 5625 } 4539 5626 4540 5627 for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) ··· 4628 5713 ret = scx_init_task(p, task_group(p), false); 4629 5714 if (ret) { 4630 5715 put_task_struct(p); 4631 - scx_task_iter_relock(&sti); 4632 5716 scx_task_iter_stop(&sti); 4633 5717 scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", 4634 5718 ret, p->comm, p->pid); ··· 4637 5723 scx_set_task_state(p, SCX_TASK_READY); 4638 5724 4639 5725 put_task_struct(p); 4640 - scx_task_iter_relock(&sti); 4641 5726 } 
4642 5727 scx_task_iter_stop(&sti); 4643 5728 scx_cgroup_unlock(); ··· 4708 5795 err_disable_unlock_all: 4709 5796 scx_cgroup_unlock(); 4710 5797 percpu_up_write(&scx_fork_rwsem); 4711 - scx_bypass(false); 5798 + /* we'll soon enter disable path, keep bypass on */ 4712 5799 err_disable: 4713 5800 mutex_unlock(&scx_enable_mutex); 4714 5801 /* ··· 5241 6328 /******************************************************************************** 5242 6329 * Helpers that can be called from the BPF scheduler. 5243 6330 */ 5244 - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) 6331 + static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, 6332 + u64 enq_flags) 5245 6333 { 5246 - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 6334 + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) 5247 6335 return false; 5248 6336 5249 6337 lockdep_assert_irqs_disabled(); 5250 6338 5251 6339 if (unlikely(!p)) { 5252 - scx_kf_error("called with NULL task"); 6340 + scx_error(sch, "called with NULL task"); 5253 6341 return false; 5254 6342 } 5255 6343 5256 6344 if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { 5257 - scx_kf_error("invalid enq_flags 0x%llx", enq_flags); 6345 + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); 5258 6346 return false; 5259 6347 } 5260 6348 5261 6349 return true; 5262 6350 } 5263 6351 5264 - static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, 5265 - u64 enq_flags) 6352 + static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, 6353 + u64 dsq_id, u64 enq_flags) 5266 6354 { 5267 6355 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 5268 6356 struct task_struct *ddsp_task; 5269 6357 5270 6358 ddsp_task = __this_cpu_read(direct_dispatch_task); 5271 6359 if (ddsp_task) { 5272 - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); 6360 + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); 5273 6361 return; 5274 6362 } 5275 6363 5276 6364 if 
(unlikely(dspc->cursor >= scx_dsp_max_batch)) { 5277 - scx_kf_error("dispatch buffer overflow"); 6365 + scx_error(sch, "dispatch buffer overflow"); 5278 6366 return; 5279 6367 } 5280 6368 ··· 5327 6413 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, 5328 6414 u64 enq_flags) 5329 6415 { 5330 - if (!scx_dsq_insert_preamble(p, enq_flags)) 6416 + struct scx_sched *sch; 6417 + 6418 + guard(rcu)(); 6419 + sch = rcu_dereference(scx_root); 6420 + if (unlikely(!sch)) 6421 + return; 6422 + 6423 + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) 5331 6424 return; 5332 6425 5333 6426 if (slice) ··· 5342 6421 else 5343 6422 p->scx.slice = p->scx.slice ?: 1; 5344 6423 5345 - scx_dsq_insert_commit(p, dsq_id, enq_flags); 6424 + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 5346 6425 } 5347 6426 5348 6427 /** ··· 5369 6448 __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 5370 6449 u64 slice, u64 vtime, u64 enq_flags) 5371 6450 { 5372 - if (!scx_dsq_insert_preamble(p, enq_flags)) 6451 + struct scx_sched *sch; 6452 + 6453 + guard(rcu)(); 6454 + sch = rcu_dereference(scx_root); 6455 + if (unlikely(!sch)) 6456 + return; 6457 + 6458 + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) 5373 6459 return; 5374 6460 5375 6461 if (slice) ··· 5386 6458 5387 6459 p->scx.dsq_vtime = vtime; 5388 6460 5389 - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 6461 + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 5390 6462 } 5391 6463 5392 6464 __bpf_kfunc_end_defs(); ··· 5411 6483 bool in_balance; 5412 6484 unsigned long flags; 5413 6485 5414 - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) 6486 + if (!scx_kf_allowed_if_unlocked() && 6487 + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) 5415 6488 return false; 5416 6489 5417 6490 /* ··· 5497 6568 */ 5498 6569 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) 5499 6570 { 5500 - if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6571 + struct 
scx_sched *sch; 6572 + 6573 + guard(rcu)(); 6574 + 6575 + sch = rcu_dereference(scx_root); 6576 + if (unlikely(!sch)) 6577 + return 0; 6578 + 6579 + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 5501 6580 return 0; 5502 6581 5503 6582 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); ··· 5520 6583 __bpf_kfunc void scx_bpf_dispatch_cancel(void) 5521 6584 { 5522 6585 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 6586 + struct scx_sched *sch; 5523 6587 5524 - if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6588 + guard(rcu)(); 6589 + 6590 + sch = rcu_dereference(scx_root); 6591 + if (unlikely(!sch)) 6592 + return; 6593 + 6594 + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 5525 6595 return; 5526 6596 5527 6597 if (dspc->cursor > 0) 5528 6598 dspc->cursor--; 5529 6599 else 5530 - scx_kf_error("dispatch buffer underflow"); 6600 + scx_error(sch, "dispatch buffer underflow"); 5531 6601 } 5532 6602 5533 6603 /** ··· 5553 6609 */ 5554 6610 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) 5555 6611 { 5556 - struct scx_sched *sch = scx_root; 5557 6612 struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); 5558 6613 struct scx_dispatch_q *dsq; 6614 + struct scx_sched *sch; 5559 6615 5560 - if (!scx_kf_allowed(SCX_KF_DISPATCH)) 6616 + guard(rcu)(); 6617 + 6618 + sch = rcu_dereference(scx_root); 6619 + if (unlikely(!sch)) 6620 + return false; 6621 + 6622 + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) 5561 6623 return false; 5562 6624 5563 6625 flush_dispatch_buf(sch, dspc->rq); ··· 5710 6760 */ 5711 6761 __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 5712 6762 { 6763 + struct scx_sched *sch; 5713 6764 LIST_HEAD(tasks); 5714 6765 u32 nr_enqueued = 0; 5715 6766 struct rq *rq; 5716 6767 struct task_struct *p, *n; 5717 6768 5718 - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) 6769 + guard(rcu)(); 6770 + sch = rcu_dereference(scx_root); 6771 + if (unlikely(!sch)) 6772 + return 0; 6773 + 6774 + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) 5719 6775 return 0; 5720 6776 5721 6777 
rq = cpu_rq(smp_processor_id()); ··· 5833 6877 5834 6878 __bpf_kfunc_start_defs(); 5835 6879 5836 - /** 5837 - * scx_bpf_kick_cpu - Trigger reschedule on a CPU 5838 - * @cpu: cpu to kick 5839 - * @flags: %SCX_KICK_* flags 5840 - * 5841 - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 5842 - * trigger rescheduling on a busy CPU. This can be called from any online 5843 - * scx_ops operation and the actual kicking is performed asynchronously through 5844 - * an irq work. 5845 - */ 5846 - __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) 6880 + static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 5847 6881 { 5848 6882 struct rq *this_rq; 5849 6883 unsigned long irq_flags; 5850 6884 5851 - if (!kf_cpu_valid(cpu, NULL)) 6885 + if (!ops_cpu_valid(sch, cpu, NULL)) 5852 6886 return; 5853 6887 5854 6888 local_irq_save(irq_flags); ··· 5862 6916 struct rq *target_rq = cpu_rq(cpu); 5863 6917 5864 6918 if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) 5865 - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 6919 + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); 5866 6920 5867 6921 if (raw_spin_rq_trylock(target_rq)) { 5868 6922 if (can_skip_idle_kick(target_rq)) { ··· 5884 6938 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); 5885 6939 out: 5886 6940 local_irq_restore(irq_flags); 6941 + } 6942 + 6943 + /** 6944 + * scx_bpf_kick_cpu - Trigger reschedule on a CPU 6945 + * @cpu: cpu to kick 6946 + * @flags: %SCX_KICK_* flags 6947 + * 6948 + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or 6949 + * trigger rescheduling on a busy CPU. This can be called from any online 6950 + * scx_ops operation and the actual kicking is performed asynchronously through 6951 + * an irq work. 
6952 + */ 6953 + __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) 6954 + { 6955 + struct scx_sched *sch; 6956 + 6957 + guard(rcu)(); 6958 + sch = rcu_dereference(scx_root); 6959 + if (likely(sch)) 6960 + scx_kick_cpu(sch, cpu, flags); 5887 6961 } 5888 6962 5889 6963 /** ··· 6086 7120 6087 7121 __bpf_kfunc_end_defs(); 6088 7122 6089 - static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, 6090 - char *fmt, unsigned long long *data, u32 data__sz) 7123 + static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, 7124 + size_t line_size, char *fmt, unsigned long long *data, 7125 + u32 data__sz) 6091 7126 { 6092 7127 struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; 6093 7128 s32 ret; 6094 7129 6095 7130 if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || 6096 7131 (data__sz && !data)) { 6097 - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); 7132 + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); 6098 7133 return -EINVAL; 6099 7134 } 6100 7135 6101 7136 ret = copy_from_kernel_nofault(data_buf, data, data__sz); 6102 7137 if (ret < 0) { 6103 - scx_kf_error("failed to read data fields (%d)", ret); 7138 + scx_error(sch, "failed to read data fields (%d)", ret); 6104 7139 return ret; 6105 7140 } 6106 7141 6107 7142 ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, 6108 7143 &bprintf_data); 6109 7144 if (ret < 0) { 6110 - scx_kf_error("format preparation failed (%d)", ret); 7145 + scx_error(sch, "format preparation failed (%d)", ret); 6111 7146 return ret; 6112 7147 } 6113 7148 ··· 6116 7149 bprintf_data.bin_args); 6117 7150 bpf_bprintf_cleanup(&bprintf_data); 6118 7151 if (ret < 0) { 6119 - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 7152 + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); 6120 7153 return ret; 6121 7154 } 6122 7155 6123 7156 return ret; 6124 7157 } 6125 7158 6126 - static s32 
bstr_format(struct scx_bstr_buf *buf, 7159 + static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, 6127 7160 char *fmt, unsigned long long *data, u32 data__sz) 6128 7161 { 6129 - return __bstr_format(buf->data, buf->line, sizeof(buf->line), 7162 + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), 6130 7163 fmt, data, data__sz); 6131 7164 } 6132 7165 ··· 6145 7178 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, 6146 7179 unsigned long long *data, u32 data__sz) 6147 7180 { 7181 + struct scx_sched *sch; 6148 7182 unsigned long flags; 6149 7183 6150 7184 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 6151 - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 6152 - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 7185 + sch = rcu_dereference_bh(scx_root); 7186 + if (likely(sch) && 7187 + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 7188 + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); 6153 7189 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 6154 7190 } 6155 7191 ··· 6168 7198 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 6169 7199 u32 data__sz) 6170 7200 { 7201 + struct scx_sched *sch; 6171 7202 unsigned long flags; 6172 7203 6173 7204 raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); 6174 - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 6175 - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 7205 + sch = rcu_dereference_bh(scx_root); 7206 + if (likely(sch) && 7207 + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) 7208 + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); 6176 7209 raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); 6177 7210 } 6178 7211 ··· 6194 7221 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, 6195 7222 u32 data__sz) 6196 7223 { 7224 + struct scx_sched *sch; 6197 7225 
struct scx_dump_data *dd = &scx_dump_data; 6198 7226 struct scx_bstr_buf *buf = &dd->buf; 6199 7227 s32 ret; 6200 7228 7229 + guard(rcu)(); 7230 + 7231 + sch = rcu_dereference(scx_root); 7232 + if (unlikely(!sch)) 7233 + return; 7234 + 6201 7235 if (raw_smp_processor_id() != dd->cpu) { 6202 - scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); 7236 + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); 6203 7237 return; 6204 7238 } 6205 7239 6206 7240 /* append the formatted string to the line buf */ 6207 - ret = __bstr_format(buf->data, buf->line + dd->cursor, 7241 + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, 6208 7242 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); 6209 7243 if (ret < 0) { 6210 7244 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", ··· 6247 7267 */ 6248 7268 __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) 6249 7269 { 6250 - if (kf_cpu_valid(cpu, NULL)) 7270 + struct scx_sched *sch; 7271 + 7272 + guard(rcu)(); 7273 + 7274 + sch = rcu_dereference(scx_root); 7275 + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 6251 7276 return arch_scale_cpu_capacity(cpu); 6252 7277 else 6253 7278 return SCX_CPUPERF_ONE; ··· 6274 7289 */ 6275 7290 __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) 6276 7291 { 6277 - if (kf_cpu_valid(cpu, NULL)) 7292 + struct scx_sched *sch; 7293 + 7294 + guard(rcu)(); 7295 + 7296 + sch = rcu_dereference(scx_root); 7297 + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) 6278 7298 return arch_scale_freq_capacity(cpu); 6279 7299 else 6280 7300 return SCX_CPUPERF_ONE; ··· 6301 7311 */ 6302 7312 __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) 6303 7313 { 7314 + struct scx_sched *sch; 7315 + 7316 + guard(rcu)(); 7317 + 7318 + sch = rcu_dereference(sch); 7319 + if (unlikely(!sch)) 7320 + return; 7321 + 6304 7322 if (unlikely(perf > SCX_CPUPERF_ONE)) { 6305 - scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); 7323 + 
scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); 6306 7324 return; 6307 7325 } 6308 7326 6309 - if (kf_cpu_valid(cpu, NULL)) { 7327 + if (ops_cpu_valid(sch, cpu, NULL)) { 6310 7328 struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); 6311 7329 struct rq_flags rf; 6312 7330 ··· 6323 7325 * to the corresponding CPU to prevent ABBA deadlocks. 6324 7326 */ 6325 7327 if (locked_rq && rq != locked_rq) { 6326 - scx_kf_error("Invalid target CPU %d", cpu); 7328 + scx_error(sch, "Invalid target CPU %d", cpu); 6327 7329 return; 6328 7330 } 6329 7331 ··· 6418 7420 */ 6419 7421 __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) 6420 7422 { 6421 - if (!kf_cpu_valid(cpu, NULL)) 7423 + struct scx_sched *sch; 7424 + 7425 + guard(rcu)(); 7426 + 7427 + sch = rcu_dereference(scx_root); 7428 + if (unlikely(!sch)) 6422 7429 return NULL; 6423 7430 7431 + if (!ops_cpu_valid(sch, cpu, NULL)) 7432 + return NULL; 7433 + 7434 + if (!sch->warned_deprecated_rq) { 7435 + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " 7436 + "use scx_bpf_locked_rq() when holding rq lock " 7437 + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); 7438 + sch->warned_deprecated_rq = true; 7439 + } 7440 + 6424 7441 return cpu_rq(cpu); 7442 + } 7443 + 7444 + /** 7445 + * scx_bpf_locked_rq - Return the rq currently locked by SCX 7446 + * 7447 + * Returns the rq if a rq lock is currently held by SCX. 7448 + * Otherwise emits an error and returns NULL. 
7449 + */ 7450 + __bpf_kfunc struct rq *scx_bpf_locked_rq(void) 7451 + { 7452 + struct scx_sched *sch; 7453 + struct rq *rq; 7454 + 7455 + guard(preempt)(); 7456 + 7457 + sch = rcu_dereference_sched(scx_root); 7458 + if (unlikely(!sch)) 7459 + return NULL; 7460 + 7461 + rq = scx_locked_rq(); 7462 + if (!rq) { 7463 + scx_error(sch, "accessing rq without holding rq lock"); 7464 + return NULL; 7465 + } 7466 + 7467 + return rq; 7468 + } 7469 + 7470 + /** 7471 + * scx_bpf_cpu_curr - Return remote CPU's curr task 7472 + * @cpu: CPU of interest 7473 + * 7474 + * Callers must hold RCU read lock (KF_RCU). 7475 + */ 7476 + __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) 7477 + { 7478 + struct scx_sched *sch; 7479 + 7480 + guard(rcu)(); 7481 + 7482 + sch = rcu_dereference(scx_root); 7483 + if (unlikely(!sch)) 7484 + return NULL; 7485 + 7486 + if (!ops_cpu_valid(sch, cpu, NULL)) 7487 + return NULL; 7488 + 7489 + return rcu_dereference(cpu_rq(cpu)->curr); 6425 7490 } 6426 7491 6427 7492 /** ··· 6503 7442 { 6504 7443 struct task_group *tg = p->sched_task_group; 6505 7444 struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 7445 + struct scx_sched *sch; 6506 7446 6507 - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) 7447 + guard(rcu)(); 7448 + 7449 + sch = rcu_dereference(scx_root); 7450 + if (unlikely(!sch)) 7451 + goto out; 7452 + 7453 + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) 6508 7454 goto out; 6509 7455 6510 7456 cgrp = tg_cgrp(tg); ··· 6592 7524 /* Aggregate per-CPU event counters into @events. 
*/ 6593 7525 memset(events, 0, sizeof(*events)); 6594 7526 for_each_possible_cpu(cpu) { 6595 - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); 7527 + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; 6596 7528 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 6597 7529 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 6598 7530 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); ··· 6658 7590 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 6659 7591 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 6660 7592 BTF_ID_FLAGS(func, scx_bpf_cpu_rq) 7593 + BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) 7594 + BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) 6661 7595 #ifdef CONFIG_CGROUP_SCHED 6662 7596 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) 6663 7597 #endif

-25

kernel/sched/ext.h

··· 8 8 */ 9 9 #ifdef CONFIG_SCHED_CLASS_EXT 10 10 11 - static inline bool scx_kf_allowed_if_unlocked(void) 12 - { 13 - return !current->scx.kf_mask; 14 - } 15 - 16 - static inline bool scx_rq_bypassing(struct rq *rq) 17 - { 18 - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); 19 - } 20 - 21 - DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); 22 - 23 - DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); 24 - 25 - /* 26 - * Return the rq currently locked from an scx callback, or NULL if no rq is 27 - * locked. 28 - */ 29 - static inline struct rq *scx_locked_rq(void) 30 - { 31 - return __this_cpu_read(scx_locked_rq_state); 32 - } 33 - 34 11 void scx_tick(struct rq *rq); 35 12 void init_scx_entity(struct sched_ext_entity *scx); 36 13 void scx_pre_fork(struct task_struct *p); ··· 77 100 void scx_tg_offline(struct task_group *tg); 78 101 int scx_cgroup_can_attach(struct cgroup_taskset *tset); 79 102 void scx_cgroup_move_task(struct task_struct *p); 80 - void scx_cgroup_finish_attach(void); 81 103 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); 82 104 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); 83 105 void scx_group_set_idle(struct task_group *tg, bool idle); ··· 87 111 static inline void scx_tg_offline(struct task_group *tg) {} 88 112 static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } 89 113 static inline void scx_cgroup_move_task(struct task_struct *p) {} 90 - static inline void scx_cgroup_finish_attach(void) {} 91 114 static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} 92 115 static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} 93 116 static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}

+118 -28

kernel/sched/ext_idle.c

··· 819 819 * Helpers that can be called from the BPF scheduler. 820 820 */ 821 821 822 - static int validate_node(int node) 822 + static int validate_node(struct scx_sched *sch, int node) 823 823 { 824 824 if (!static_branch_likely(&scx_builtin_idle_per_node)) { 825 - scx_kf_error("per-node idle tracking is disabled"); 825 + scx_error(sch, "per-node idle tracking is disabled"); 826 826 return -EOPNOTSUPP; 827 827 } 828 828 ··· 832 832 833 833 /* Make sure node is in a valid range */ 834 834 if (node < 0 || node >= nr_node_ids) { 835 - scx_kf_error("invalid node %d", node); 835 + scx_error(sch, "invalid node %d", node); 836 836 return -EINVAL; 837 837 } 838 838 839 839 /* Make sure the node is part of the set of possible nodes */ 840 840 if (!node_possible(node)) { 841 - scx_kf_error("unavailable node %d", node); 841 + scx_error(sch, "unavailable node %d", node); 842 842 return -EINVAL; 843 843 } 844 844 ··· 847 847 848 848 __bpf_kfunc_start_defs(); 849 849 850 - static bool check_builtin_idle_enabled(void) 850 + static bool check_builtin_idle_enabled(struct scx_sched *sch) 851 851 { 852 852 if (static_branch_likely(&scx_builtin_idle_enabled)) 853 853 return true; 854 854 855 - scx_kf_error("built-in idle tracking is disabled"); 855 + scx_error(sch, "built-in idle tracking is disabled"); 856 856 return false; 857 857 } 858 858 ··· 882 882 return p->migration_disabled; 883 883 } 884 884 885 - static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 885 + static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, 886 + s32 prev_cpu, u64 wake_flags, 886 887 const struct cpumask *allowed, u64 flags) 887 888 { 888 889 struct rq *rq; 889 890 struct rq_flags rf; 890 891 s32 cpu; 891 892 892 - if (!kf_cpu_valid(prev_cpu, NULL)) 893 + if (!ops_cpu_valid(sch, prev_cpu, NULL)) 893 894 return -EINVAL; 894 895 895 - if (!check_builtin_idle_enabled()) 896 + if (!check_builtin_idle_enabled(sch)) 896 897 return -EBUSY; 897 898 898 
899 /* ··· 906 905 if (scx_kf_allowed_if_unlocked()) { 907 906 rq = task_rq_lock(p, &rf); 908 907 } else { 909 - if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) 908 + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) 910 909 return -EPERM; 911 910 rq = scx_locked_rq(); 912 911 } ··· 949 948 */ 950 949 __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) 951 950 { 952 - if (!kf_cpu_valid(cpu, NULL)) 953 - return NUMA_NO_NODE; 951 + struct scx_sched *sch; 954 952 953 + guard(rcu)(); 954 + 955 + sch = rcu_dereference(scx_root); 956 + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) 957 + return NUMA_NO_NODE; 955 958 return cpu_to_node(cpu); 956 959 } 957 960 ··· 977 972 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 978 973 u64 wake_flags, bool *is_idle) 979 974 { 975 + struct scx_sched *sch; 980 976 s32 cpu; 981 977 982 - cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); 978 + guard(rcu)(); 979 + 980 + sch = rcu_dereference(scx_root); 981 + if (unlikely(!sch)) 982 + return -ENODEV; 983 + 984 + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); 983 985 if (cpu >= 0) { 984 986 *is_idle = true; 985 987 return cpu; 986 988 } 987 989 *is_idle = false; 988 - 989 990 return prev_cpu; 990 991 } 991 992 ··· 1018 1007 __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 1019 1008 const struct cpumask *cpus_allowed, u64 flags) 1020 1009 { 1021 - return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); 1010 + struct scx_sched *sch; 1011 + 1012 + guard(rcu)(); 1013 + 1014 + sch = rcu_dereference(scx_root); 1015 + if (unlikely(!sch)) 1016 + return -ENODEV; 1017 + 1018 + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, 1019 + cpus_allowed, flags); 1022 1020 } 1023 1021 1024 1022 /** ··· 1041 1021 */ 1042 1022 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) 1043 1023 { 1044 - node = validate_node(node); 1024 + struct 
scx_sched *sch; 1025 + 1026 + guard(rcu)(); 1027 + 1028 + sch = rcu_dereference(scx_root); 1029 + if (unlikely(!sch)) 1030 + return cpu_none_mask; 1031 + 1032 + node = validate_node(sch, node); 1045 1033 if (node < 0) 1046 1034 return cpu_none_mask; 1047 1035 ··· 1065 1037 */ 1066 1038 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) 1067 1039 { 1040 + struct scx_sched *sch; 1041 + 1042 + guard(rcu)(); 1043 + 1044 + sch = rcu_dereference(scx_root); 1045 + if (unlikely(!sch)) 1046 + return cpu_none_mask; 1047 + 1068 1048 if (static_branch_unlikely(&scx_builtin_idle_per_node)) { 1069 - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 1049 + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 1070 1050 return cpu_none_mask; 1071 1051 } 1072 1052 1073 - if (!check_builtin_idle_enabled()) 1053 + if (!check_builtin_idle_enabled(sch)) 1074 1054 return cpu_none_mask; 1075 1055 1076 1056 return idle_cpumask(NUMA_NO_NODE)->cpu; ··· 1096 1060 */ 1097 1061 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) 1098 1062 { 1099 - node = validate_node(node); 1063 + struct scx_sched *sch; 1064 + 1065 + guard(rcu)(); 1066 + 1067 + sch = rcu_dereference(scx_root); 1068 + if (unlikely(!sch)) 1069 + return cpu_none_mask; 1070 + 1071 + node = validate_node(sch, node); 1100 1072 if (node < 0) 1101 1073 return cpu_none_mask; 1102 1074 ··· 1124 1080 */ 1125 1081 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) 1126 1082 { 1083 + struct scx_sched *sch; 1084 + 1085 + guard(rcu)(); 1086 + 1087 + sch = rcu_dereference(scx_root); 1088 + if (unlikely(!sch)) 1089 + return cpu_none_mask; 1090 + 1127 1091 if (static_branch_unlikely(&scx_builtin_idle_per_node)) { 1128 - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 1092 + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 1129 1093 return cpu_none_mask; 1130 1094 } 1131 1095 1132 - if (!check_builtin_idle_enabled()) 1096 + if (!check_builtin_idle_enabled(sch)) 1133 1097 
return cpu_none_mask; 1134 1098 1135 1099 if (sched_smt_active()) ··· 1173 1121 */ 1174 1122 __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) 1175 1123 { 1176 - if (!check_builtin_idle_enabled()) 1124 + struct scx_sched *sch; 1125 + 1126 + guard(rcu)(); 1127 + 1128 + sch = rcu_dereference(scx_root); 1129 + if (unlikely(!sch)) 1177 1130 return false; 1178 1131 1179 - if (!kf_cpu_valid(cpu, NULL)) 1132 + if (!check_builtin_idle_enabled(sch)) 1133 + return false; 1134 + 1135 + if (!ops_cpu_valid(sch, cpu, NULL)) 1180 1136 return false; 1181 1137 1182 1138 return scx_idle_test_and_clear_cpu(cpu); ··· 1212 1152 __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, 1213 1153 int node, u64 flags) 1214 1154 { 1215 - node = validate_node(node); 1155 + struct scx_sched *sch; 1156 + 1157 + guard(rcu)(); 1158 + 1159 + sch = rcu_dereference(scx_root); 1160 + if (unlikely(!sch)) 1161 + return -ENODEV; 1162 + 1163 + node = validate_node(sch, node); 1216 1164 if (node < 0) 1217 1165 return node; 1218 1166 ··· 1252 1184 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, 1253 1185 u64 flags) 1254 1186 { 1187 + struct scx_sched *sch; 1188 + 1189 + guard(rcu)(); 1190 + 1191 + sch = rcu_dereference(scx_root); 1192 + if (unlikely(!sch)) 1193 + return -ENODEV; 1194 + 1255 1195 if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { 1256 - scx_kf_error("per-node idle tracking is enabled"); 1196 + scx_error(sch, "per-node idle tracking is enabled"); 1257 1197 return -EBUSY; 1258 1198 } 1259 1199 1260 - if (!check_builtin_idle_enabled()) 1200 + if (!check_builtin_idle_enabled(sch)) 1261 1201 return -EBUSY; 1262 1202 1263 1203 return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); ··· 1295 1219 __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, 1296 1220 int node, u64 flags) 1297 1221 { 1222 + struct scx_sched *sch; 1298 1223 s32 cpu; 1299 1224 1300 - node = validate_node(node); 1225 + 
guard(rcu)(); 1226 + 1227 + sch = rcu_dereference(scx_root); 1228 + if (unlikely(!sch)) 1229 + return -ENODEV; 1230 + 1231 + node = validate_node(sch, node); 1301 1232 if (node < 0) 1302 1233 return node; 1303 1234 ··· 1342 1259 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, 1343 1260 u64 flags) 1344 1261 { 1262 + struct scx_sched *sch; 1345 1263 s32 cpu; 1346 1264 1265 + guard(rcu)(); 1266 + 1267 + sch = rcu_dereference(scx_root); 1268 + if (unlikely(!sch)) 1269 + return -ENODEV; 1270 + 1347 1271 if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { 1348 - scx_kf_error("per-node idle tracking is enabled"); 1272 + scx_error(sch, "per-node idle tracking is enabled"); 1349 1273 return -EBUSY; 1350 1274 } 1351 1275

+1078

kernel/sched/ext_internal.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 + * 5 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 6 + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> 7 + */ 8 + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 9 + 10 + enum scx_consts { 11 + SCX_DSP_DFL_MAX_BATCH = 32, 12 + SCX_DSP_MAX_LOOPS = 32, 13 + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, 14 + 15 + SCX_EXIT_BT_LEN = 64, 16 + SCX_EXIT_MSG_LEN = 1024, 17 + SCX_EXIT_DUMP_DFL_LEN = 32768, 18 + 19 + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, 20 + 21 + /* 22 + * Iterating all tasks may take a while. Periodically drop 23 + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 24 + */ 25 + SCX_TASK_ITER_BATCH = 32, 26 + }; 27 + 28 + enum scx_exit_kind { 29 + SCX_EXIT_NONE, 30 + SCX_EXIT_DONE, 31 + 32 + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ 33 + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ 34 + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ 35 + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ 36 + 37 + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ 38 + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ 39 + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ 40 + }; 41 + 42 + /* 43 + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), 44 + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes 45 + * are 64bit of the format: 46 + * 47 + * Bits: [63 .. 48 47 .. 32 31 .. 0] 48 + * [ SYS ACT ] [ SYS RSN ] [ USR ] 49 + * 50 + * SYS ACT: System-defined exit actions 51 + * SYS RSN: System-defined exit reasons 52 + * USR : User-defined exit codes and reasons 53 + * 54 + * Using the above, users may communicate intention and context by ORing system 55 + * actions and/or system reasons with a user-defined exit code. 
56 + */ 57 + enum scx_exit_code { 58 + /* Reasons */ 59 + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, 60 + 61 + /* Actions */ 62 + SCX_ECODE_ACT_RESTART = 1LLU << 48, 63 + }; 64 + 65 + enum scx_exit_flags { 66 + /* 67 + * ops.exit() may be called even if the loading failed before ops.init() 68 + * finishes successfully. This is because ops.exit() allows rich exit 69 + * info communication. The following flag indicates whether ops.init() 70 + * finished successfully. 71 + */ 72 + SCX_EFLAG_INITIALIZED, 73 + }; 74 + 75 + /* 76 + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is 77 + * being disabled. 78 + */ 79 + struct scx_exit_info { 80 + /* %SCX_EXIT_* - broad category of the exit reason */ 81 + enum scx_exit_kind kind; 82 + 83 + /* exit code if gracefully exiting */ 84 + s64 exit_code; 85 + 86 + /* %SCX_EFLAG_* */ 87 + u64 flags; 88 + 89 + /* textual representation of the above */ 90 + const char *reason; 91 + 92 + /* backtrace if exiting due to an error */ 93 + unsigned long *bt; 94 + u32 bt_len; 95 + 96 + /* informational message */ 97 + char *msg; 98 + 99 + /* debug dump */ 100 + char *dump; 101 + }; 102 + 103 + /* sched_ext_ops.flags */ 104 + enum scx_ops_flags { 105 + /* 106 + * Keep built-in idle tracking even if ops.update_idle() is implemented. 107 + */ 108 + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, 109 + 110 + /* 111 + * By default, if there are no other task to run on the CPU, ext core 112 + * keeps running the current task even after its slice expires. If this 113 + * flag is specified, such tasks are passed to ops.enqueue() with 114 + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. 115 + */ 116 + SCX_OPS_ENQ_LAST = 1LLU << 1, 117 + 118 + /* 119 + * An exiting task may schedule after PF_EXITING is set. 
In such cases, 120 + * bpf_task_from_pid() may not be able to find the task and if the BPF 121 + * scheduler depends on pid lookup for dispatching, the task will be 122 + * lost leading to various issues including RCU grace period stalls. 123 + * 124 + * To mask this problem, by default, unhashed tasks are automatically 125 + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't 126 + * depend on pid lookups and wants to handle these tasks directly, the 127 + * following flag can be used. 128 + */ 129 + SCX_OPS_ENQ_EXITING = 1LLU << 2, 130 + 131 + /* 132 + * If set, only tasks with policy set to SCHED_EXT are attached to 133 + * sched_ext. If clear, SCHED_NORMAL tasks are also included. 134 + */ 135 + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 136 + 137 + /* 138 + * A migration disabled task can only execute on its current CPU. By 139 + * default, such tasks are automatically put on the CPU's local DSQ with 140 + * the default slice on enqueue. If this ops flag is set, they also go 141 + * through ops.enqueue(). 142 + * 143 + * A migration disabled task never invokes ops.select_cpu() as it can 144 + * only select the current CPU. Also, p->cpus_ptr will only contain its 145 + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr 146 + * and thus may disagree with cpumask_weight(p->cpus_ptr). 147 + */ 148 + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, 149 + 150 + /* 151 + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes 152 + * ops.enqueue() on the ops.select_cpu() selected or the wakee's 153 + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline 154 + * transfers. When this optimization is enabled, ops.select_cpu() is 155 + * skipped in some cases (when racing against the wakee switching out). 156 + * As the BPF scheduler may depend on ops.select_cpu() being invoked 157 + * during wakeups, queued wakeup is disabled by default. 
158 + * 159 + * If this ops flag is set, queued wakeup optimization is enabled and 160 + * the BPF scheduler must be able to handle ops.enqueue() invoked on the 161 + * wakee's CPU without preceding ops.select_cpu() even for tasks which 162 + * may be executed on multiple CPUs. 163 + */ 164 + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, 165 + 166 + /* 167 + * If set, enable per-node idle cpumasks. If clear, use a single global 168 + * flat idle cpumask. 169 + */ 170 + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, 171 + 172 + /* 173 + * CPU cgroup support flags 174 + */ 175 + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ 176 + 177 + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | 178 + SCX_OPS_ENQ_LAST | 179 + SCX_OPS_ENQ_EXITING | 180 + SCX_OPS_ENQ_MIGRATION_DISABLED | 181 + SCX_OPS_ALLOW_QUEUED_WAKEUP | 182 + SCX_OPS_SWITCH_PARTIAL | 183 + SCX_OPS_BUILTIN_IDLE_PER_NODE | 184 + SCX_OPS_HAS_CGROUP_WEIGHT, 185 + 186 + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ 187 + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, 188 + 189 + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, 190 + }; 191 + 192 + /* argument container for ops.init_task() */ 193 + struct scx_init_task_args { 194 + /* 195 + * Set if ops.init_task() is being invoked on the fork path, as opposed 196 + * to the scheduler transition path. 197 + */ 198 + bool fork; 199 + #ifdef CONFIG_EXT_GROUP_SCHED 200 + /* the cgroup the task is joining */ 201 + struct cgroup *cgroup; 202 + #endif 203 + }; 204 + 205 + /* argument container for ops.exit_task() */ 206 + struct scx_exit_task_args { 207 + /* Whether the task exited before running on sched_ext. 
*/ 208 + bool cancelled; 209 + }; 210 + 211 + /* argument container for ops->cgroup_init() */ 212 + struct scx_cgroup_init_args { 213 + /* the weight of the cgroup [1..10000] */ 214 + u32 weight; 215 + 216 + /* bandwidth control parameters from cpu.max and cpu.max.burst */ 217 + u64 bw_period_us; 218 + u64 bw_quota_us; 219 + u64 bw_burst_us; 220 + }; 221 + 222 + enum scx_cpu_preempt_reason { 223 + /* next task is being scheduled by &sched_class_rt */ 224 + SCX_CPU_PREEMPT_RT, 225 + /* next task is being scheduled by &sched_class_dl */ 226 + SCX_CPU_PREEMPT_DL, 227 + /* next task is being scheduled by &sched_class_stop */ 228 + SCX_CPU_PREEMPT_STOP, 229 + /* unknown reason for SCX being preempted */ 230 + SCX_CPU_PREEMPT_UNKNOWN, 231 + }; 232 + 233 + /* 234 + * Argument container for ops->cpu_acquire(). Currently empty, but may be 235 + * expanded in the future. 236 + */ 237 + struct scx_cpu_acquire_args {}; 238 + 239 + /* argument container for ops->cpu_release() */ 240 + struct scx_cpu_release_args { 241 + /* the reason the CPU was preempted */ 242 + enum scx_cpu_preempt_reason reason; 243 + 244 + /* the task that's going to be scheduled on the CPU */ 245 + struct task_struct *task; 246 + }; 247 + 248 + /* 249 + * Informational context provided to dump operations. 250 + */ 251 + struct scx_dump_ctx { 252 + enum scx_exit_kind kind; 253 + s64 exit_code; 254 + const char *reason; 255 + u64 at_ns; 256 + u64 at_jiffies; 257 + }; 258 + 259 + /** 260 + * struct sched_ext_ops - Operation table for BPF scheduler implementation 261 + * 262 + * A BPF scheduler can implement an arbitrary scheduling policy by 263 + * implementing and loading operations in this table. Note that a userland 264 + * scheduling policy can also be implemented using the BPF scheduler 265 + * as a shim layer. 
266 + */ 267 + struct sched_ext_ops { 268 + /** 269 + * @select_cpu: Pick the target CPU for a task which is being woken up 270 + * @p: task being woken up 271 + * @prev_cpu: the cpu @p was on before sleeping 272 + * @wake_flags: SCX_WAKE_* 273 + * 274 + * Decision made here isn't final. @p may be moved to any CPU while it 275 + * is getting dispatched for execution later. However, as @p is not on 276 + * the rq at this point, getting the eventual execution CPU right here 277 + * saves a small bit of overhead down the line. 278 + * 279 + * If an idle CPU is returned, the CPU is kicked and will try to 280 + * dispatch. While an explicit custom mechanism can be added, 281 + * select_cpu() serves as the default way to wake up idle CPUs. 282 + * 283 + * @p may be inserted into a DSQ directly by calling 284 + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. 285 + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ 286 + * of the CPU returned by this operation. 287 + * 288 + * Note that select_cpu() is never called for tasks that can only run 289 + * on a single CPU or tasks with migration disabled, as they don't have 290 + * the option to select a different CPU. See select_task_rq() for 291 + * details. 292 + */ 293 + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); 294 + 295 + /** 296 + * @enqueue: Enqueue a task on the BPF scheduler 297 + * @p: task being enqueued 298 + * @enq_flags: %SCX_ENQ_* 299 + * 300 + * @p is ready to run. Insert directly into a DSQ by calling 301 + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly 302 + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, 303 + * the task will stall. 304 + * 305 + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is 306 + * skipped. 
307 + */ 308 + void (*enqueue)(struct task_struct *p, u64 enq_flags); 309 + 310 + /** 311 + * @dequeue: Remove a task from the BPF scheduler 312 + * @p: task being dequeued 313 + * @deq_flags: %SCX_DEQ_* 314 + * 315 + * Remove @p from the BPF scheduler. This is usually called to isolate 316 + * the task while updating its scheduling properties (e.g. priority). 317 + * 318 + * The ext core keeps track of whether the BPF side owns a given task or 319 + * not and can gracefully ignore spurious dispatches from BPF side, 320 + * which makes it safe to not implement this method. However, depending 321 + * on the scheduling logic, this can lead to confusing behaviors - e.g. 322 + * scheduling position not being updated across a priority change. 323 + */ 324 + void (*dequeue)(struct task_struct *p, u64 deq_flags); 325 + 326 + /** 327 + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs 328 + * @cpu: CPU to dispatch tasks for 329 + * @prev: previous task being switched out 330 + * 331 + * Called when a CPU's local dsq is empty. The operation should dispatch 332 + * one or more tasks from the BPF scheduler into the DSQs using 333 + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ 334 + * using scx_bpf_dsq_move_to_local(). 335 + * 336 + * The maximum number of times scx_bpf_dsq_insert() can be called 337 + * without an intervening scx_bpf_dsq_move_to_local() is specified by 338 + * ops.dispatch_max_batch. See the comments on top of the two functions 339 + * for more details. 340 + * 341 + * When not %NULL, @prev is an SCX task with its slice depleted. If 342 + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in 343 + * @prev->scx.flags, it is not enqueued yet and will be enqueued after 344 + * ops.dispatch() returns. To keep executing @prev, return without 345 + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. 
346 + */ 347 + void (*dispatch)(s32 cpu, struct task_struct *prev); 348 + 349 + /** 350 + * @tick: Periodic tick 351 + * @p: task running currently 352 + * 353 + * This operation is called every 1/HZ seconds on CPUs which are 354 + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an 355 + * immediate dispatch cycle on the CPU. 356 + */ 357 + void (*tick)(struct task_struct *p); 358 + 359 + /** 360 + * @runnable: A task is becoming runnable on its associated CPU 361 + * @p: task becoming runnable 362 + * @enq_flags: %SCX_ENQ_* 363 + * 364 + * This and the following three functions can be used to track a task's 365 + * execution state transitions. A task becomes ->runnable() on a CPU, 366 + * and then goes through one or more ->running() and ->stopping() pairs 367 + * as it runs on the CPU, and eventually becomes ->quiescent() when it's 368 + * done running on the CPU. 369 + * 370 + * @p is becoming runnable on the CPU because it's 371 + * 372 + * - waking up (%SCX_ENQ_WAKEUP) 373 + * - being moved from another CPU 374 + * - being restored after temporarily taken off the queue for an 375 + * attribute change. 376 + * 377 + * This and ->enqueue() are related but not coupled. This operation 378 + * notifies @p's state transition and may not be followed by ->enqueue() 379 + * e.g. when @p is being dispatched to a remote CPU, or when @p is 380 + * being enqueued on a CPU experiencing a hotplug event. Likewise, a 381 + * task may be ->enqueue()'d without being preceded by this operation 382 + * e.g. after exhausting its slice. 383 + */ 384 + void (*runnable)(struct task_struct *p, u64 enq_flags); 385 + 386 + /** 387 + * @running: A task is starting to run on its associated CPU 388 + * @p: task starting to run 389 + * 390 + * Note that this callback may be called from a CPU other than the 391 + * one the task is going to run on. 
This can happen when a task 392 + * property is changed (i.e., affinity), since scx_next_task_scx(), 393 + * which triggers this callback, may run on a CPU different from 394 + * the task's assigned CPU. 395 + * 396 + * Therefore, always use scx_bpf_task_cpu(@p) to determine the 397 + * target CPU the task is going to use. 398 + * 399 + * See ->runnable() for explanation on the task state notifiers. 400 + */ 401 + void (*running)(struct task_struct *p); 402 + 403 + /** 404 + * @stopping: A task is stopping execution 405 + * @p: task stopping to run 406 + * @runnable: is task @p still runnable? 407 + * 408 + * Note that this callback may be called from a CPU other than the 409 + * one the task was running on. This can happen when a task 410 + * property is changed (i.e., affinity), since dequeue_task_scx(), 411 + * which triggers this callback, may run on a CPU different from 412 + * the task's assigned CPU. 413 + * 414 + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU 415 + * the task was running on. 416 + * 417 + * See ->runnable() for explanation on the task state notifiers. If 418 + * !@runnable, ->quiescent() will be invoked after this operation 419 + * returns. 420 + */ 421 + void (*stopping)(struct task_struct *p, bool runnable); 422 + 423 + /** 424 + * @quiescent: A task is becoming not runnable on its associated CPU 425 + * @p: task becoming not runnable 426 + * @deq_flags: %SCX_DEQ_* 427 + * 428 + * See ->runnable() for explanation on the task state notifiers. 429 + * 430 + * @p is becoming quiescent on the CPU because it's 431 + * 432 + * - sleeping (%SCX_DEQ_SLEEP) 433 + * - being moved to another CPU 434 + * - being temporarily taken off the queue for an attribute change 435 + * (%SCX_DEQ_SAVE) 436 + * 437 + * This and ->dequeue() are related but not coupled. This operation 438 + * notifies @p's state transition and may not be preceded by ->dequeue() 439 + * e.g. when @p is being dispatched to a remote CPU. 
440 + */ 441 + void (*quiescent)(struct task_struct *p, u64 deq_flags); 442 + 443 + /** 444 + * @yield: Yield CPU 445 + * @from: yielding task 446 + * @to: optional yield target task 447 + * 448 + * If @to is NULL, @from is yielding the CPU to other runnable tasks. 449 + * The BPF scheduler should ensure that other available tasks are 450 + * dispatched before the yielding task. Return value is ignored in this 451 + * case. 452 + * 453 + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf 454 + * scheduler can implement the request, return %true; otherwise, %false. 455 + */ 456 + bool (*yield)(struct task_struct *from, struct task_struct *to); 457 + 458 + /** 459 + * @core_sched_before: Task ordering for core-sched 460 + * @a: task A 461 + * @b: task B 462 + * 463 + * Used by core-sched to determine the ordering between two tasks. See 464 + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on 465 + * core-sched. 466 + * 467 + * Both @a and @b are runnable and may or may not currently be queued on 468 + * the BPF scheduler. Should return %true if @a should run before @b. 469 + * %false if there's no required ordering or @b should run before @a. 470 + * 471 + * If not specified, the default is ordering them according to when they 472 + * became runnable. 473 + */ 474 + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); 475 + 476 + /** 477 + * @set_weight: Set task weight 478 + * @p: task to set weight for 479 + * @weight: new weight [1..10000] 480 + * 481 + * Update @p's weight to @weight. 482 + */ 483 + void (*set_weight)(struct task_struct *p, u32 weight); 484 + 485 + /** 486 + * @set_cpumask: Set CPU affinity 487 + * @p: task to set CPU affinity for 488 + * @cpumask: cpumask of cpus that @p can run on 489 + * 490 + * Update @p's CPU affinity to @cpumask. 
491 + */ 492 + void (*set_cpumask)(struct task_struct *p, 493 + const struct cpumask *cpumask); 494 + 495 + /** 496 + * @update_idle: Update the idle state of a CPU 497 + * @cpu: CPU to update the idle state for 498 + * @idle: whether entering or exiting the idle state 499 + * 500 + * This operation is called when @cpu enters or leaves the idle 501 + * state. By default, implementing this operation disables the built-in 502 + * idle CPU tracking and the following helpers become unavailable: 503 + * 504 + * - scx_bpf_select_cpu_dfl() 505 + * - scx_bpf_select_cpu_and() 506 + * - scx_bpf_test_and_clear_cpu_idle() 507 + * - scx_bpf_pick_idle_cpu() 508 + * 509 + * The user also must implement ops.select_cpu() as the default 510 + * implementation relies on scx_bpf_select_cpu_dfl(). 511 + * 512 + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle 513 + * tracking. 514 + */ 515 + void (*update_idle)(s32 cpu, bool idle); 516 + 517 + /** 518 + * @cpu_acquire: A CPU is becoming available to the BPF scheduler 519 + * @cpu: The CPU being acquired by the BPF scheduler. 520 + * @args: Acquire arguments, see the struct definition. 521 + * 522 + * A CPU that was previously released from the BPF scheduler is now once 523 + * again under its control. 524 + */ 525 + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); 526 + 527 + /** 528 + * @cpu_release: A CPU is taken away from the BPF scheduler 529 + * @cpu: The CPU being released by the BPF scheduler. 530 + * @args: Release arguments, see the struct definition. 531 + * 532 + * The specified CPU is no longer under the control of the BPF 533 + * scheduler. This could be because it was preempted by a higher 534 + * priority sched_class, though there may be other reasons as well. The 535 + * caller should consult @args->reason to determine the cause. 
536 + */ 537 + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); 538 + 539 + /** 540 + * @init_task: Initialize a task to run in a BPF scheduler 541 + * @p: task to initialize for BPF scheduling 542 + * @args: init arguments, see the struct definition 543 + * 544 + * Either we're loading a BPF scheduler or a new task is being forked. 545 + * Initialize @p for BPF scheduling. This operation may block and can 546 + * be used for allocations, and is called exactly once for a task. 547 + * 548 + * Return 0 for success, -errno for failure. An error return while 549 + * loading will abort loading of the BPF scheduler. During a fork, it 550 + * will abort that specific fork. 551 + */ 552 + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); 553 + 554 + /** 555 + * @exit_task: Exit a previously-running task from the system 556 + * @p: task to exit 557 + * @args: exit arguments, see the struct definition 558 + * 559 + * @p is exiting or the BPF scheduler is being unloaded. Perform any 560 + * necessary cleanup for @p. 561 + */ 562 + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); 563 + 564 + /** 565 + * @enable: Enable BPF scheduling for a task 566 + * @p: task to enable BPF scheduling for 567 + * 568 + * Enable @p for BPF scheduling. enable() is called on @p any time it 569 + * enters SCX, and is always paired with a matching disable(). 570 + */ 571 + void (*enable)(struct task_struct *p); 572 + 573 + /** 574 + * @disable: Disable BPF scheduling for a task 575 + * @p: task to disable BPF scheduling for 576 + * 577 + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. 578 + * Disable BPF scheduling for @p. A disable() call is always matched 579 + * with a prior enable() call. 
580 + */ 581 + void (*disable)(struct task_struct *p); 582 + 583 + /** 584 + * @dump: Dump BPF scheduler state on error 585 + * @ctx: debug dump context 586 + * 587 + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. 588 + */ 589 + void (*dump)(struct scx_dump_ctx *ctx); 590 + 591 + /** 592 + * @dump_cpu: Dump BPF scheduler state for a CPU on error 593 + * @ctx: debug dump context 594 + * @cpu: CPU to generate debug dump for 595 + * @idle: @cpu is currently idle without any runnable tasks 596 + * 597 + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 598 + * @cpu. If @idle is %true and this operation doesn't produce any 599 + * output, @cpu is skipped for dump. 600 + */ 601 + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); 602 + 603 + /** 604 + * @dump_task: Dump BPF scheduler state for a runnable task on error 605 + * @ctx: debug dump context 606 + * @p: runnable task to generate debug dump for 607 + * 608 + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for 609 + * @p. 610 + */ 611 + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 612 + 613 + #ifdef CONFIG_EXT_GROUP_SCHED 614 + /** 615 + * @cgroup_init: Initialize a cgroup 616 + * @cgrp: cgroup being initialized 617 + * @args: init arguments, see the struct definition 618 + * 619 + * Either the BPF scheduler is being loaded or @cgrp created, initialize 620 + * @cgrp for sched_ext. This operation may block. 621 + * 622 + * Return 0 for success, -errno for failure. An error return while 623 + * loading will abort loading of the BPF scheduler. During cgroup 624 + * creation, it will abort the specific cgroup creation. 625 + */ 626 + s32 (*cgroup_init)(struct cgroup *cgrp, 627 + struct scx_cgroup_init_args *args); 628 + 629 + /** 630 + * @cgroup_exit: Exit a cgroup 631 + * @cgrp: cgroup being exited 632 + * 633 + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit 634 + * @cgrp for sched_ext. 
This operation may block. 635 + */ 636 + void (*cgroup_exit)(struct cgroup *cgrp); 637 + 638 + /** 639 + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup 640 + * @p: task being moved 641 + * @from: cgroup @p is being moved from 642 + * @to: cgroup @p is being moved to 643 + * 644 + * Prepare @p for move from cgroup @from to @to. This operation may 645 + * block and can be used for allocations. 646 + * 647 + * Return 0 for success, -errno for failure. An error return aborts the 648 + * migration. 649 + */ 650 + s32 (*cgroup_prep_move)(struct task_struct *p, 651 + struct cgroup *from, struct cgroup *to); 652 + 653 + /** 654 + * @cgroup_move: Commit cgroup move 655 + * @p: task being moved 656 + * @from: cgroup @p is being moved from 657 + * @to: cgroup @p is being moved to 658 + * 659 + * Commit the move. @p is dequeued during this operation. 660 + */ 661 + void (*cgroup_move)(struct task_struct *p, 662 + struct cgroup *from, struct cgroup *to); 663 + 664 + /** 665 + * @cgroup_cancel_move: Cancel cgroup move 666 + * @p: task whose cgroup move is being canceled 667 + * @from: cgroup @p was being moved from 668 + * @to: cgroup @p was being moved to 669 + * 670 + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). 671 + * Undo the preparation. 672 + */ 673 + void (*cgroup_cancel_move)(struct task_struct *p, 674 + struct cgroup *from, struct cgroup *to); 675 + 676 + /** 677 + * @cgroup_set_weight: A cgroup's weight is being changed 678 + * @cgrp: cgroup whose weight is being updated 679 + * @weight: new weight [1..10000] 680 + * 681 + * Update @cgrp's weight to @weight. 
682 + */ 683 + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); 684 + 685 + /** 686 + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed 687 + * @cgrp: cgroup whose bandwidth is being updated 688 + * @period_us: bandwidth control period 689 + * @quota_us: bandwidth control quota 690 + * @burst_us: bandwidth control burst 691 + * 692 + * Update @cgrp's bandwidth control parameters. This is from the cpu.max 693 + * cgroup interface. 694 + * 695 + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled 696 + * to. For example, if @period_us is 1_000_000 and @quota_us is 697 + * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be 698 + * interpreted in the same fashion and specifies how much @cgrp can 699 + * burst temporarily. The specific control mechanism and thus the 700 + * interpretation of @period_us and burstiness is up to the BPF 701 + * scheduler. 702 + */ 703 + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, 704 + u64 period_us, u64 quota_us, u64 burst_us); 705 + 706 + #endif /* CONFIG_EXT_GROUP_SCHED */ 707 + 708 + /* 709 + * All online ops must come before ops.cpu_online(). 710 + */ 711 + 712 + /** 713 + * @cpu_online: A CPU became online 714 + * @cpu: CPU which just came up 715 + * 716 + * @cpu just came online. @cpu will not call ops.enqueue() or 717 + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. 718 + */ 719 + void (*cpu_online)(s32 cpu); 720 + 721 + /** 722 + * @cpu_offline: A CPU is going offline 723 + * @cpu: CPU which is going offline 724 + * 725 + * @cpu is going offline. @cpu will not call ops.enqueue() or 726 + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. 727 + */ 728 + void (*cpu_offline)(s32 cpu); 729 + 730 + /* 731 + * All CPU hotplug ops must come before ops.init(). 
732 + */ 733 + 734 + /** 735 + * @init: Initialize the BPF scheduler 736 + */ 737 + s32 (*init)(void); 738 + 739 + /** 740 + * @exit: Clean up after the BPF scheduler 741 + * @info: Exit info 742 + * 743 + * ops.exit() is also called on ops.init() failure, which is a bit 744 + * unusual. This is to allow rich reporting through @info on how 745 + * ops.init() failed. 746 + */ 747 + void (*exit)(struct scx_exit_info *info); 748 + 749 + /** 750 + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch 751 + */ 752 + u32 dispatch_max_batch; 753 + 754 + /** 755 + * @flags: %SCX_OPS_* flags 756 + */ 757 + u64 flags; 758 + 759 + /** 760 + * @timeout_ms: The maximum amount of time, in milliseconds, that a 761 + * runnable task should be able to wait before being scheduled. The 762 + * maximum timeout may not exceed the default timeout of 30 seconds. 763 + * 764 + * Defaults to the maximum allowed timeout value of 30 seconds. 765 + */ 766 + u32 timeout_ms; 767 + 768 + /** 769 + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default 770 + * value of 32768 is used. 771 + */ 772 + u32 exit_dump_len; 773 + 774 + /** 775 + * @hotplug_seq: A sequence number that may be set by the scheduler to 776 + * detect when a hotplug event has occurred during the loading process. 777 + * If 0, no detection occurs. Otherwise, the scheduler will fail to 778 + * load if the sequence number does not match @scx_hotplug_seq on the 779 + * enable path. 780 + */ 781 + u64 hotplug_seq; 782 + 783 + /** 784 + * @name: BPF scheduler's name 785 + * 786 + * Must be a non-zero valid BPF object name including only isalnum(), 787 + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the 788 + * BPF scheduler is enabled. 
789 + */ 790 + char name[SCX_OPS_NAME_LEN]; 791 + 792 + /* internal use only, must be NULL */ 793 + void *priv; 794 + }; 795 + 796 + enum scx_opi { 797 + SCX_OPI_BEGIN = 0, 798 + SCX_OPI_NORMAL_BEGIN = 0, 799 + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), 800 + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), 801 + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), 802 + SCX_OPI_END = SCX_OP_IDX(init), 803 + }; 804 + 805 + /* 806 + * Collection of event counters. Event types are placed in descending order. 807 + */ 808 + struct scx_event_stats { 809 + /* 810 + * If ops.select_cpu() returns a CPU which can't be used by the task, 811 + * the core scheduler code silently picks a fallback CPU. 812 + */ 813 + s64 SCX_EV_SELECT_CPU_FALLBACK; 814 + 815 + /* 816 + * When dispatching to a local DSQ, the CPU may have gone offline in 817 + * the meantime. In this case, the task is bounced to the global DSQ. 818 + */ 819 + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; 820 + 821 + /* 822 + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task 823 + * continued to run because there were no other tasks on the CPU. 824 + */ 825 + s64 SCX_EV_DISPATCH_KEEP_LAST; 826 + 827 + /* 828 + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task 829 + * is dispatched to a local DSQ when exiting. 830 + */ 831 + s64 SCX_EV_ENQ_SKIP_EXITING; 832 + 833 + /* 834 + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a 835 + * migration disabled task skips ops.enqueue() and is dispatched to its 836 + * local DSQ. 837 + */ 838 + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; 839 + 840 + /* 841 + * Total number of times a task's time slice was refilled with the 842 + * default value (SCX_SLICE_DFL). 843 + */ 844 + s64 SCX_EV_REFILL_SLICE_DFL; 845 + 846 + /* 847 + * The total duration of bypass modes in nanoseconds. 848 + */ 849 + s64 SCX_EV_BYPASS_DURATION; 850 + 851 + /* 852 + * The number of tasks dispatched in the bypassing mode. 
853 + */ 854 + s64 SCX_EV_BYPASS_DISPATCH; 855 + 856 + /* 857 + * The number of times the bypassing mode has been activated. 858 + */ 859 + s64 SCX_EV_BYPASS_ACTIVATE; 860 + }; 861 + 862 + struct scx_sched_pcpu { 863 + /* 864 + * The event counters are in a per-CPU variable to minimize the 865 + * accounting overhead. A system-wide view on the event counter is 866 + * constructed when requested by scx_bpf_events(). 867 + */ 868 + struct scx_event_stats event_stats; 869 + }; 870 + 871 + struct scx_sched { 872 + struct sched_ext_ops ops; 873 + DECLARE_BITMAP(has_op, SCX_OPI_END); 874 + 875 + /* 876 + * Dispatch queues. 877 + * 878 + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. 879 + * This is to avoid live-locking in bypass mode where all tasks are 880 + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If 881 + * per-node split isn't sufficient, it can be further split. 882 + */ 883 + struct rhashtable dsq_hash; 884 + struct scx_dispatch_q **global_dsqs; 885 + struct scx_sched_pcpu __percpu *pcpu; 886 + 887 + bool warned_zero_slice:1; 888 + bool warned_deprecated_rq:1; 889 + 890 + atomic_t exit_kind; 891 + struct scx_exit_info *exit_info; 892 + 893 + struct kobject kobj; 894 + 895 + struct kthread_worker *helper; 896 + struct irq_work error_irq_work; 897 + struct kthread_work disable_work; 898 + struct rcu_work rcu_work; 899 + }; 900 + 901 + enum scx_wake_flags { 902 + /* expose select WF_* flags as enums */ 903 + SCX_WAKE_FORK = WF_FORK, 904 + SCX_WAKE_TTWU = WF_TTWU, 905 + SCX_WAKE_SYNC = WF_SYNC, 906 + }; 907 + 908 + enum scx_enq_flags { 909 + /* expose select ENQUEUE_* flags as enums */ 910 + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, 911 + SCX_ENQ_HEAD = ENQUEUE_HEAD, 912 + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, 913 + 914 + /* high 32bits are SCX specific */ 915 + 916 + /* 917 + * Set the following to trigger preemption when calling 918 + * scx_bpf_dsq_insert() with a local dsq as the target. 
The slice of the 919 + * current task is cleared to zero and the CPU is kicked into the 920 + * scheduling path. Implies %SCX_ENQ_HEAD. 921 + */ 922 + SCX_ENQ_PREEMPT = 1LLU << 32, 923 + 924 + /* 925 + * The task being enqueued was previously enqueued on the current CPU's 926 + * %SCX_DSQ_LOCAL, but was removed from it in a call to the 927 + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was 928 + * invoked in a ->cpu_release() callback, and the task is again 929 + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the 930 + * task will not be scheduled on the CPU until at least the next invocation 931 + * of the ->cpu_acquire() callback. 932 + */ 933 + SCX_ENQ_REENQ = 1LLU << 40, 934 + 935 + /* 936 + * The task being enqueued is the only task available for the cpu. By 937 + * default, ext core keeps executing such tasks but when 938 + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the 939 + * %SCX_ENQ_LAST flag set. 940 + * 941 + * The BPF scheduler is responsible for triggering a follow-up 942 + * scheduling event. Otherwise, execution may stall. 943 + */ 944 + SCX_ENQ_LAST = 1LLU << 41, 945 + 946 + /* high 8 bits are internal */ 947 + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, 948 + 949 + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 950 + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 951 + }; 952 + 953 + enum scx_deq_flags { 954 + /* expose select DEQUEUE_* flags as enums */ 955 + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, 956 + 957 + /* high 32bits are SCX specific */ 958 + 959 + /* 960 + * The generic core-sched layer decided to execute the task even though 961 + * it hasn't been dispatched yet. Dequeue from the BPF side. 
962 + */ 963 + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, 964 + }; 965 + 966 + enum scx_pick_idle_cpu_flags { 967 + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ 968 + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ 969 + }; 970 + 971 + enum scx_kick_flags { 972 + /* 973 + * Kick the target CPU if idle. Guarantees that the target CPU goes 974 + * through at least one full scheduling cycle before going idle. If the 975 + * target CPU can be determined to be currently not idle and going to go 976 + * through a scheduling cycle before going idle, noop. 977 + */ 978 + SCX_KICK_IDLE = 1LLU << 0, 979 + 980 + /* 981 + * Preempt the current task and execute the dispatch path. If the 982 + * current task of the target CPU is an SCX task, its ->scx.slice is 983 + * cleared to zero before the scheduling path is invoked so that the 984 + * task expires and the dispatch path is invoked. 985 + */ 986 + SCX_KICK_PREEMPT = 1LLU << 1, 987 + 988 + /* 989 + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will 990 + * return after the target CPU finishes picking the next task. 991 + */ 992 + SCX_KICK_WAIT = 1LLU << 2, 993 + }; 994 + 995 + enum scx_tg_flags { 996 + SCX_TG_ONLINE = 1U << 0, 997 + SCX_TG_INITED = 1U << 1, 998 + }; 999 + 1000 + enum scx_enable_state { 1001 + SCX_ENABLING, 1002 + SCX_ENABLED, 1003 + SCX_DISABLING, 1004 + SCX_DISABLED, 1005 + }; 1006 + 1007 + static const char *scx_enable_state_str[] = { 1008 + [SCX_ENABLING] = "enabling", 1009 + [SCX_ENABLED] = "enabled", 1010 + [SCX_DISABLING] = "disabling", 1011 + [SCX_DISABLED] = "disabled", 1012 + }; 1013 + 1014 + /* 1015 + * sched_ext_entity->ops_state 1016 + * 1017 + * Used to track the task ownership between the SCX core and the BPF scheduler. 
1018 + * State transitions look as follows: 1019 + * 1020 + * NONE -> QUEUEING -> QUEUED -> DISPATCHING 1021 + * ^ | | 1022 + * | v v 1023 + * \-------------------------------/ 1024 + * 1025 + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call 1026 + * sites for explanations on the conditions being waited upon and why they are 1027 + * safe. Transitions out of them into NONE or QUEUED must store_release and the 1028 + * waiters should load_acquire. 1029 + * 1030 + * Tracking scx_ops_state enables sched_ext core to reliably determine whether 1031 + * any given task can be dispatched by the BPF scheduler at all times and thus 1032 + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler 1033 + * to try to dispatch any task anytime regardless of its state as the SCX core 1034 + * can safely reject invalid dispatches. 1035 + */ 1036 + enum scx_ops_state { 1037 + SCX_OPSS_NONE, /* owned by the SCX core */ 1038 + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ 1039 + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ 1040 + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ 1041 + 1042 + /* 1043 + * QSEQ brands each QUEUED instance so that, when dispatch races 1044 + * dequeue/requeue, the dispatcher can tell whether it still has a claim 1045 + * on the task being dispatched. 1046 + * 1047 + * As some 32bit archs can't do 64bit store_release/load_acquire, 1048 + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on 1049 + * 32bit machines. The dispatch race window QSEQ protects is very narrow 1050 + * and runs with IRQ disabled. 30 bits should be sufficient. 
1051 + */ 1052 + SCX_OPSS_QSEQ_SHIFT = 2, 1053 + }; 1054 + 1055 + /* Use macros to ensure that the type is unsigned long for the masks */ 1056 + #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) 1057 + #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) 1058 + 1059 + DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); 1060 + 1061 + /* 1062 + * Return the rq currently locked from an scx callback, or NULL if no rq is 1063 + * locked. 1064 + */ 1065 + static inline struct rq *scx_locked_rq(void) 1066 + { 1067 + return __this_cpu_read(scx_locked_rq_state); 1068 + } 1069 + 1070 + static inline bool scx_kf_allowed_if_unlocked(void) 1071 + { 1072 + return !current->scx.kf_mask; 1073 + } 1074 + 1075 + static inline bool scx_rq_bypassing(struct rq *rq) 1076 + { 1077 + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); 1078 + }

+175

tools/sched_ext/include/scx/bpf_arena_common.bpf.h

··· 1 + /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ 2 + /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ 3 + #pragma once 4 + 5 + #ifndef PAGE_SIZE 6 + #define PAGE_SIZE __PAGE_SIZE 7 + /* 8 + * for older kernels try sizeof(struct genradix_node) 9 + * or flexible: 10 + * static inline long __bpf_page_size(void) { 11 + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); 12 + * } 13 + * but generated code is not great. 14 + */ 15 + #endif 16 + 17 + #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) 18 + #define __arena __attribute__((address_space(1))) 19 + #define __arena_global __attribute__((address_space(1))) 20 + #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ 21 + #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ 22 + #else 23 + 24 + /* emit instruction: 25 + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as 26 + * 27 + * This is a workaround for LLVM compiler versions without 28 + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena 29 + * pointers and native kernel/userspace ones. In this case we explicitly do so 30 + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, 31 + * tools/testing/selftests/bpf includes tests that use these macros to implement 32 + * linked lists and hashtables backed by arena memory. In sched_ext, we use 33 + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. 
34 + */ 35 + #ifndef bpf_addr_space_cast 36 + #define bpf_addr_space_cast(var, dst_as, src_as)\ 37 + asm volatile(".byte 0xBF; \ 38 + .ifc %[reg], r0; \ 39 + .byte 0x00; \ 40 + .endif; \ 41 + .ifc %[reg], r1; \ 42 + .byte 0x11; \ 43 + .endif; \ 44 + .ifc %[reg], r2; \ 45 + .byte 0x22; \ 46 + .endif; \ 47 + .ifc %[reg], r3; \ 48 + .byte 0x33; \ 49 + .endif; \ 50 + .ifc %[reg], r4; \ 51 + .byte 0x44; \ 52 + .endif; \ 53 + .ifc %[reg], r5; \ 54 + .byte 0x55; \ 55 + .endif; \ 56 + .ifc %[reg], r6; \ 57 + .byte 0x66; \ 58 + .endif; \ 59 + .ifc %[reg], r7; \ 60 + .byte 0x77; \ 61 + .endif; \ 62 + .ifc %[reg], r8; \ 63 + .byte 0x88; \ 64 + .endif; \ 65 + .ifc %[reg], r9; \ 66 + .byte 0x99; \ 67 + .endif; \ 68 + .short %[off]; \ 69 + .long %[as]" \ 70 + : [reg]"+r"(var) \ 71 + : [off]"i"(BPF_ADDR_SPACE_CAST) \ 72 + , [as]"i"((dst_as << 16) | src_as)); 73 + #endif 74 + 75 + #define __arena 76 + #define __arena_global SEC(".addr_space.1") 77 + #define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) 78 + #define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) 79 + #endif 80 + 81 + void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, 82 + int node_id, __u64 flags) __ksym __weak; 83 + void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; 84 + 85 + /* 86 + * Note that cond_break can only be portably used in the body of a breakable 87 + * construct, whereas can_loop can be used anywhere. 
88 + */ 89 + #ifdef TEST 90 + #define can_loop true 91 + #define __cond_break(expr) expr 92 + #else 93 + #ifdef __BPF_FEATURE_MAY_GOTO 94 + #define can_loop \ 95 + ({ __label__ l_break, l_continue; \ 96 + bool ret = true; \ 97 + asm volatile goto("may_goto %l[l_break]" \ 98 + :::: l_break); \ 99 + goto l_continue; \ 100 + l_break: ret = false; \ 101 + l_continue:; \ 102 + ret; \ 103 + }) 104 + 105 + #define __cond_break(expr) \ 106 + ({ __label__ l_break, l_continue; \ 107 + asm volatile goto("may_goto %l[l_break]" \ 108 + :::: l_break); \ 109 + goto l_continue; \ 110 + l_break: expr; \ 111 + l_continue:; \ 112 + }) 113 + #else 114 + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 115 + #define can_loop \ 116 + ({ __label__ l_break, l_continue; \ 117 + bool ret = true; \ 118 + asm volatile goto("1:.byte 0xe5; \ 119 + .byte 0; \ 120 + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ 121 + .short 0" \ 122 + :::: l_break); \ 123 + goto l_continue; \ 124 + l_break: ret = false; \ 125 + l_continue:; \ 126 + ret; \ 127 + }) 128 + 129 + #define __cond_break(expr) \ 130 + ({ __label__ l_break, l_continue; \ 131 + asm volatile goto("1:.byte 0xe5; \ 132 + .byte 0; \ 133 + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ 134 + .short 0" \ 135 + :::: l_break); \ 136 + goto l_continue; \ 137 + l_break: expr; \ 138 + l_continue:; \ 139 + }) 140 + #else 141 + #define can_loop \ 142 + ({ __label__ l_break, l_continue; \ 143 + bool ret = true; \ 144 + asm volatile goto("1:.byte 0xe5; \ 145 + .byte 0; \ 146 + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ 147 + .short 0" \ 148 + :::: l_break); \ 149 + goto l_continue; \ 150 + l_break: ret = false; \ 151 + l_continue:; \ 152 + ret; \ 153 + }) 154 + 155 + #define __cond_break(expr) \ 156 + ({ __label__ l_break, l_continue; \ 157 + asm volatile goto("1:.byte 0xe5; \ 158 + .byte 0; \ 159 + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ 160 + .short 0" \ 161 + :::: l_break); \ 162 + goto l_continue; \ 163 + l_break: expr; \ 164 
+ l_continue:; \ 165 + }) 166 + #endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ 167 + #endif /* __BPF_FEATURE_MAY_GOTO */ 168 + #endif /* TEST */ 169 + 170 + #define cond_break __cond_break(break) 171 + #define cond_break_label(label) __cond_break(goto label) 172 + 173 + 174 + void bpf_preempt_disable(void) __weak __ksym; 175 + void bpf_preempt_enable(void) __weak __ksym;

+33

tools/sched_ext/include/scx/bpf_arena_common.h

··· 1 + /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ 2 + /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ 3 + #pragma once 4 + 5 + #ifndef arena_container_of 6 + #define arena_container_of(ptr, type, member) \ 7 + ({ \ 8 + void __arena *__mptr = (void __arena *)(ptr); \ 9 + ((type *)(__mptr - offsetof(type, member))); \ 10 + }) 11 + #endif 12 + 13 + /* Provide the definition of PAGE_SIZE. */ 14 + #include <sys/user.h> 15 + 16 + #define __arena 17 + #define __arg_arena 18 + #define cast_kern(ptr) /* nop for user space */ 19 + #define cast_user(ptr) /* nop for user space */ 20 + char __attribute__((weak)) arena[1]; 21 + 22 + #ifndef offsetof 23 + #define offsetof(type, member) ((unsigned long)&((type *)0)->member) 24 + #endif 25 + 26 + static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, 27 + int node_id, __u64 flags) 28 + { 29 + return NULL; 30 + } 31 + static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) 32 + { 33 + }

+95 -9

tools/sched_ext/include/scx/common.bpf.h

··· 24 24 #include <bpf/bpf_helpers.h> 25 25 #include <bpf/bpf_tracing.h> 26 26 #include <asm-generic/errno.h> 27 - #include "user_exit_info.h" 27 + #include "user_exit_info.bpf.h" 28 28 #include "enum_defs.autogen.h" 29 29 30 + #define PF_IDLE 0x00000002 /* I am an IDLE thread */ 31 + #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ 30 32 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 33 + #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ 34 + #define PF_KSWAPD 0x00020000 /* I am kswapd */ 31 35 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 32 36 #define PF_EXITING 0x00000004 33 37 #define CLOCK_MONOTONIC 1 38 + 39 + #ifndef NR_CPUS 40 + #define NR_CPUS 1024 41 + #endif 42 + 43 + #ifndef NUMA_NO_NODE 44 + #define NUMA_NO_NODE (-1) 45 + #endif 34 46 35 47 extern int LINUX_KERNEL_VERSION __kconfig; 36 48 extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; ··· 103 91 bool scx_bpf_task_running(const struct task_struct *p) __ksym; 104 92 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 105 93 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 94 + struct rq *scx_bpf_locked_rq(void) __ksym; 95 + struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; 106 96 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; 107 97 u64 scx_bpf_now(void) __ksym __weak; 108 98 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; ··· 120 106 121 107 static inline __attribute__((format(printf, 1, 2))) 122 108 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} 109 + 110 + #define SCX_STRINGIFY(x) #x 111 + #define SCX_TOSTRING(x) SCX_STRINGIFY(x) 123 112 124 113 /* 125 114 * Helper macro for initializing the fmt and variadic argument inputs to both ··· 158 141 * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments 159 142 * instead of an array of u64. 
Invoking this macro will cause the scheduler to 160 143 * exit in an erroneous state, with diagnostic information being passed to the 161 - * user. 144 + * user. It prepends the file and line number to the message to aid debugging. 162 145 */ 163 146 #define scx_bpf_error(fmt, args...) \ 164 147 ({ \ 165 - scx_bpf_bstr_preamble(fmt, args) \ 148 + scx_bpf_bstr_preamble( \ 149 + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ 166 150 scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ 167 - ___scx_bpf_bstr_format_checker(fmt, ##args); \ 151 + ___scx_bpf_bstr_format_checker( \ 152 + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ 168 153 }) 169 154 170 155 /* ··· 248 229 * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of 249 230 * `MEMBER_VPTR(ptr, ->member)`. 250 231 */ 232 + #ifndef MEMBER_VPTR 251 233 #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ 252 234 ({ \ 253 235 u64 __base = (u64)&(base); \ ··· 265 245 [max]"i"(sizeof(base) - sizeof((base) member))); \ 266 246 __addr; \ 267 247 }) 248 + #endif /* MEMBER_VPTR */ 268 249 269 250 /** 270 251 * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element ··· 281 260 * size of the array to compute the max, which will result in rejection by 282 261 * the verifier. 283 262 */ 263 + #ifndef ARRAY_ELEM_PTR 284 264 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ 285 265 ({ \ 286 266 u64 __base = (u64)arr; \ ··· 296 274 [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ 297 275 __addr; \ 298 276 }) 299 - 277 + #endif /* ARRAY_ELEM_PTR */ 300 278 301 279 /* 302 280 * BPF declarations and helpers ··· 460 438 */ 461 439 static inline bool is_migration_disabled(const struct task_struct *p) 462 440 { 463 - if (bpf_core_field_exists(p->migration_disabled)) 464 - return p->migration_disabled; 441 + /* 442 + * Testing p->migration_disabled in BPF code is tricky because the 443 + * migration is _always_ disabled while running the BPF code. 
444 + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF 445 + * code execution disable and re-enable the migration of the current 446 + * task, respectively. So, the _current_ task of the sched_ext ops is 447 + * always migration-disabled. Moreover, p->migration_disabled could be 448 + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is 449 + * executed in the middle of the other BPF code execution. 450 + * 451 + * Therefore, we should decide that the _current_ task is 452 + * migration-disabled only when its migration_disabled count is greater 453 + * than one. In other words, when p->migration_disabled == 1, there is 454 + * an ambiguity, so we should check if @p is the current task or not. 455 + */ 456 + if (bpf_core_field_exists(p->migration_disabled)) { 457 + if (p->migration_disabled == 1) 458 + return bpf_get_current_task_btf() != p; 459 + else 460 + return p->migration_disabled; 461 + } 465 462 return false; 466 463 } 467 464 ··· 517 476 */ 518 477 static inline bool time_after(u64 a, u64 b) 519 478 { 520 - return (s64)(b - a) < 0; 479 + return (s64)(b - a) < 0; 521 480 } 522 481 523 482 /** ··· 541 500 */ 542 501 static inline bool time_after_eq(u64 a, u64 b) 543 502 { 544 - return (s64)(a - b) >= 0; 503 + return (s64)(a - b) >= 0; 545 504 } 546 505 547 506 /** ··· 588 547 */ 589 548 590 549 /* useful compiler attributes */ 550 + #ifndef likely 591 551 #define likely(x) __builtin_expect(!!(x), 1) 552 + #endif 553 + #ifndef unlikely 592 554 #define unlikely(x) __builtin_expect(!!(x), 0) 555 + #endif 556 + #ifndef __maybe_unused 593 557 #define __maybe_unused __attribute__((__unused__)) 558 + #endif 594 559 595 560 /* 596 561 * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They ··· 680 633 }) 681 634 682 635 /* 636 + * __calc_avg - Calculate exponential weighted moving average (EWMA) with 637 + * @old and @new values. @decay represents how large the @old value remains. 
638 + * With a larger @decay value, the moving average changes slowly, exhibiting 639 + * fewer fluctuations. 640 + */ 641 + #define __calc_avg(old, new, decay) ({ \ 642 + typeof(decay) thr = 1 << (decay); \ 643 + typeof(old) ret; \ 644 + if (((old) < thr) || ((new) < thr)) { \ 645 + if (((old) == 1) && ((new) == 0)) \ 646 + ret = 0; \ 647 + else \ 648 + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ 649 + } else { \ 650 + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ 651 + } \ 652 + ret; \ 653 + }) 654 + 655 + /* 683 656 * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. 684 657 * @v: The value for which we're computing the base 2 logarithm. 685 658 */ ··· 727 660 return log2_u32(hi) + 32 + 1; 728 661 else 729 662 return log2_u32(v) + 1; 663 + } 664 + 665 + /* 666 + * __sqrt_u64 - Approximate the square root of value @x using up to 8 iterations of Newton's method. 667 + */ 668 + static inline u64 __sqrt_u64(u64 x) 669 + { 670 + if (x == 0 || x == 1) 671 + return x; 672 + 673 + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); 674 + 675 + for (int i = 0; i < 8; ++i) { 676 + u64 q = x / r; 677 + if (r <= q) 678 + break; 679 + r = (r + q) >> 1; 680 + } 681 + return r; 730 682 } 731 683 732 684 /*

+3 -2

tools/sched_ext/include/scx/common.h

··· 75 75 #include "enums.h" 76 76 77 77 /* not available when building kernel tools/sched_ext */ 78 - #if __has_include(<lib/sdt_task.h>) 79 - #include <lib/sdt_task.h> 78 + #if __has_include(<lib/sdt_task_defs.h>) 79 + #include "bpf_arena_common.h" 80 + #include <lib/sdt_task_defs.h> 80 81 #endif 81 82 82 83 #endif /* __SCHED_EXT_COMMON_H */

+22

tools/sched_ext/include/scx/compat.bpf.h

··· 38 38 void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 39 39 bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 40 40 bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 41 + int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; 41 42 42 43 #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ 43 44 (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ ··· 82 81 (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ 83 82 scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ 84 83 false)) 84 + 85 + #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ 86 + (bpf_ksym_exists(bpf_cpumask_populate) ? \ 87 + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) 85 88 86 89 #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ 87 90 _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") ··· 229 224 (bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \ 230 225 scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ 231 226 scx_bpf_pick_any_cpu(cpus_allowed, flags)) 227 + 228 + /* 229 + * v6.18: Add a helper to retrieve the current task running on a CPU. 230 + * 231 + * Keep this helper available until v6.20 for compatibility. 232 + */ 233 + static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) 234 + { 235 + struct rq *rq; 236 + 237 + if (bpf_ksym_exists(scx_bpf_cpu_curr)) 238 + return scx_bpf_cpu_curr(cpu); 239 + 240 + rq = scx_bpf_cpu_rq(cpu); 241 + 242 + return rq ? rq->curr : NULL; 243 + } 232 244 233 245 /* 234 246 * Define sched_ext_ops. This may be expanded to define multiple variants for

+40

tools/sched_ext/include/scx/user_exit_info.bpf.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Define struct user_exit_info which is shared between BPF and userspace parts 4 + * to communicate exit status and other information. 5 + * 6 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 7 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 8 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 9 + */ 10 + 11 + #ifndef __USER_EXIT_INFO_BPF_H 12 + #define __USER_EXIT_INFO_BPF_H 13 + 14 + #ifndef LSP 15 + #include "vmlinux.h" 16 + #endif 17 + #include <bpf/bpf_core_read.h> 18 + 19 + #include "user_exit_info_common.h" 20 + 21 + #define UEI_DEFINE(__name) \ 22 + char RESIZABLE_ARRAY(data, __name##_dump); \ 23 + const volatile u32 __name##_dump_len; \ 24 + struct user_exit_info __name SEC(".data") 25 + 26 + #define UEI_RECORD(__uei_name, __ei) ({ \ 27 + bpf_probe_read_kernel_str(__uei_name.reason, \ 28 + sizeof(__uei_name.reason), (__ei)->reason); \ 29 + bpf_probe_read_kernel_str(__uei_name.msg, \ 30 + sizeof(__uei_name.msg), (__ei)->msg); \ 31 + bpf_probe_read_kernel_str(__uei_name##_dump, \ 32 + __uei_name##_dump_len, (__ei)->dump); \ 33 + if (bpf_core_field_exists((__ei)->exit_code)) \ 34 + __uei_name.exit_code = (__ei)->exit_code; \ 35 + /* use __sync to force memory barrier */ \ 36 + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ 37 + (__ei)->kind); \ 38 + }) 39 + 40 + #endif /* __USER_EXIT_INFO_BPF_H */

+2 -47

tools/sched_ext/include/scx/user_exit_info.h

··· 10 10 #ifndef __USER_EXIT_INFO_H 11 11 #define __USER_EXIT_INFO_H 12 12 13 - #ifdef LSP 14 - #define __bpf__ 15 - #include "../vmlinux.h" 16 - #endif 17 - 18 - enum uei_sizes { 19 - UEI_REASON_LEN = 128, 20 - UEI_MSG_LEN = 1024, 21 - UEI_DUMP_DFL_LEN = 32768, 22 - }; 23 - 24 - struct user_exit_info { 25 - int kind; 26 - s64 exit_code; 27 - char reason[UEI_REASON_LEN]; 28 - char msg[UEI_MSG_LEN]; 29 - }; 30 - 31 - #ifdef __bpf__ 32 - 33 - #ifndef LSP 34 - #include "vmlinux.h" 35 - #endif 36 - #include <bpf/bpf_core_read.h> 37 - 38 - #define UEI_DEFINE(__name) \ 39 - char RESIZABLE_ARRAY(data, __name##_dump); \ 40 - const volatile u32 __name##_dump_len; \ 41 - struct user_exit_info __name SEC(".data") 42 - 43 - #define UEI_RECORD(__uei_name, __ei) ({ \ 44 - bpf_probe_read_kernel_str(__uei_name.reason, \ 45 - sizeof(__uei_name.reason), (__ei)->reason); \ 46 - bpf_probe_read_kernel_str(__uei_name.msg, \ 47 - sizeof(__uei_name.msg), (__ei)->msg); \ 48 - bpf_probe_read_kernel_str(__uei_name##_dump, \ 49 - __uei_name##_dump_len, (__ei)->dump); \ 50 - if (bpf_core_field_exists((__ei)->exit_code)) \ 51 - __uei_name.exit_code = (__ei)->exit_code; \ 52 - /* use __sync to force memory barrier */ \ 53 - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ 54 - (__ei)->kind); \ 55 - }) 56 - 57 - #else /* !__bpf__ */ 58 - 59 13 #include <stdio.h> 60 14 #include <stdbool.h> 15 + 16 + #include "user_exit_info_common.h" 61 17 62 18 /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ 63 19 #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ ··· 70 114 71 115 #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) 72 116 73 - #endif /* __bpf__ */ 74 117 #endif /* __USER_EXIT_INFO_H */

+30

tools/sched_ext/include/scx/user_exit_info_common.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Define struct user_exit_info which is shared between BPF and userspace parts 4 + * to communicate exit status and other information. 5 + * 6 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 7 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 8 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 9 + */ 10 + #ifndef __USER_EXIT_INFO_COMMON_H 11 + #define __USER_EXIT_INFO_COMMON_H 12 + 13 + #ifdef LSP 14 + #include "../vmlinux.h" 15 + #endif 16 + 17 + enum uei_sizes { 18 + UEI_REASON_LEN = 128, 19 + UEI_MSG_LEN = 1024, 20 + UEI_DUMP_DFL_LEN = 32768, 21 + }; 22 + 23 + struct user_exit_info { 24 + int kind; 25 + s64 exit_code; 26 + char reason[UEI_REASON_LEN]; 27 + char msg[UEI_MSG_LEN]; 28 + }; 29 + 30 + #endif /* __USER_EXIT_INFO_COMMON_H */

+1 -1

tools/sched_ext/scx_central.bpf.c

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 /* 3 - * A central FIFO sched_ext scheduler which demonstrates the followings: 3 + * A central FIFO sched_ext scheduler which demonstrates the following: 4 4 * 5 5 * a. Making all scheduling decisions from one CPU: 6 6 *

+1

tools/sched_ext/scx_central.c

··· 61 61 skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); 62 62 skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 63 63 64 + assert(skel->rodata->nr_cpu_ids > 0); 64 65 assert(skel->rodata->nr_cpu_ids <= INT32_MAX); 65 66 66 67 while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {

+1 -1

tools/sched_ext/scx_flatcg.bpf.c

··· 950 950 .cgroup_move = (void *)fcg_cgroup_move, 951 951 .init = (void *)fcg_init, 952 952 .exit = (void *)fcg_exit, 953 - .flags = SCX_OPS_ENQ_EXITING, 953 + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, 954 954 .name = "flatcg");

+2

tools/sched_ext/scx_flatcg.c

··· 6 6 */ 7 7 #include <stdio.h> 8 8 #include <signal.h> 9 + #include <assert.h> 9 10 #include <unistd.h> 10 11 #include <libgen.h> 11 12 #include <limits.h> ··· 138 137 skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); 139 138 140 139 skel->rodata->nr_cpus = libbpf_num_possible_cpus(); 140 + assert(skel->rodata->nr_cpus > 0); 141 141 skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 142 142 143 143 while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {

+61 -35

tools/sched_ext/scx_qmap.bpf.c

··· 39 39 const volatile u32 dsp_inf_loop_after; 40 40 const volatile u32 dsp_batch; 41 41 const volatile bool highpri_boosting; 42 - const volatile bool print_shared_dsq; 42 + const volatile bool print_dsqs_and_events; 43 + const volatile bool print_msgs; 43 44 const volatile s32 disallow_tgid; 44 45 const volatile bool suppress_dump; 45 46 ··· 57 56 queue1 SEC(".maps"), 58 57 queue2 SEC(".maps"), 59 58 queue3 SEC(".maps"), 60 - queue4 SEC(".maps"); 59 + queue4 SEC(".maps"), 60 + dump_store SEC(".maps"); 61 61 62 62 struct { 63 63 __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); ··· 580 578 return; 581 579 582 580 scx_bpf_dump("QMAP FIFO[%d]:", i); 581 + 582 + /* 583 + * Dump can be invoked anytime and there is no way to iterate in 584 + * a non-destructive way. Pop and store in dump_store and then 585 + * restore afterwards. If racing against new enqueues, ordering 586 + * can get mixed up. 587 + */ 583 588 bpf_repeat(4096) { 584 589 if (bpf_map_pop_elem(fifo, &pid)) 585 590 break; 591 + bpf_map_push_elem(&dump_store, &pid, 0); 586 592 scx_bpf_dump(" %d", pid); 587 593 } 594 + 595 + bpf_repeat(4096) { 596 + if (bpf_map_pop_elem(&dump_store, &pid)) 597 + break; 598 + bpf_map_push_elem(fifo, &pid, 0); 599 + } 600 + 588 601 scx_bpf_dump("\n"); 589 602 } 590 603 } ··· 634 617 635 618 s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) 636 619 { 637 - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", 638 - cgrp->kn->id, args->weight, args->bw_period_us, 639 - args->bw_quota_us, args->bw_burst_us); 620 + if (print_msgs) 621 + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", 622 + cgrp->kn->id, args->weight, args->bw_period_us, 623 + args->bw_quota_us, args->bw_burst_us); 640 624 return 0; 641 625 } 642 626 643 627 void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) 644 628 { 645 - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); 629 + if (print_msgs) 630 + 
bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); 646 631 } 647 632 648 633 void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, 649 634 u64 period_us, u64 quota_us, u64 burst_us) 650 635 { 651 - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, 652 - period_us, quota_us, burst_us); 636 + if (print_msgs) 637 + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", 638 + cgrp->kn->id, period_us, quota_us, burst_us); 653 639 } 654 640 655 641 /* ··· 696 676 697 677 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) 698 678 { 699 - bpf_printk("CPU %d coming online", cpu); 700 - /* @cpu is already online at this point */ 701 - print_cpus(); 679 + if (print_msgs) { 680 + bpf_printk("CPU %d coming online", cpu); 681 + /* @cpu is already online at this point */ 682 + print_cpus(); 683 + } 702 684 } 703 685 704 686 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) 705 687 { 706 - bpf_printk("CPU %d going offline", cpu); 707 - /* @cpu is still online at this point */ 708 - print_cpus(); 688 + if (print_msgs) { 689 + bpf_printk("CPU %d going offline", cpu); 690 + /* @cpu is still online at this point */ 691 + print_cpus(); 692 + } 709 693 } 710 694 711 695 struct monitor_timer { ··· 807 783 808 784 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) 809 785 { 810 - struct scx_event_stats events; 811 - 812 786 bpf_rcu_read_lock(); 813 787 dispatch_highpri(true); 814 788 bpf_rcu_read_unlock(); 815 789 816 790 monitor_cpuperf(); 817 791 818 - if (print_shared_dsq) 792 + if (print_dsqs_and_events) { 793 + struct scx_event_stats events; 794 + 819 795 dump_shared_dsq(); 820 796 821 - __COMPAT_scx_bpf_events(&events, sizeof(events)); 797 + __COMPAT_scx_bpf_events(&events, sizeof(events)); 822 798 823 - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", 824 - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); 825 - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", 826 - scx_read_event(&events, 
SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); 827 - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", 828 - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); 829 - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", 830 - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); 831 - bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", 832 - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); 833 - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", 834 - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); 835 - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", 836 - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); 837 - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", 838 - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); 799 + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", 800 + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); 801 + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", 802 + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); 803 + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", 804 + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); 805 + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", 806 + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); 807 + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", 808 + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); 809 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", 810 + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); 811 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", 812 + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); 813 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", 814 + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); 815 + } 839 816 840 817 bpf_timer_start(timer, ONE_SEC_IN_NS, 0); 841 818 return 0; ··· 848 823 struct bpf_timer *timer; 849 824 s32 ret; 850 825 851 - print_cpus(); 826 + if (print_msgs) 827 + print_cpus(); 852 828 853 829 ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 854 830 if (ret)

+8 -4

tools/sched_ext/scx_qmap.c

··· 20 20 "See the top-level comment in .bpf.c for more details.\n" 21 21 "\n" 22 22 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" 23 - " [-P] [-d PID] [-D LEN] [-p] [-v]\n" 23 + " [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" 24 24 "\n" 25 25 " -s SLICE_US Override slice duration\n" 26 26 " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" ··· 28 28 " -T COUNT Stall every COUNT'th kernel thread\n" 29 29 " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" 30 30 " -b COUNT Dispatch upto COUNT tasks together\n" 31 - " -P Print out DSQ content to trace_pipe every second, use with -b\n" 31 + " -P Print out DSQ content and event counters to trace_pipe every second\n" 32 + " -M Print out debug messages to trace_pipe\n" 32 33 " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" 33 34 " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" 34 35 " -D LEN Set scx_exit_info.dump buffer length\n" ··· 67 66 68 67 skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 69 68 70 - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { 69 + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { 71 70 switch (opt) { 72 71 case 's': 73 72 skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; ··· 88 87 skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); 89 88 break; 90 89 case 'P': 91 - skel->rodata->print_shared_dsq = true; 90 + skel->rodata->print_dsqs_and_events = true; 91 + break; 92 + case 'M': 93 + skel->rodata->print_msgs = true; 92 94 break; 93 95 case 'H': 94 96 skel->rodata->highpri_boosting = true;

+2

tools/sched_ext/scx_simple.c

··· 7 7 #include <stdio.h> 8 8 #include <unistd.h> 9 9 #include <signal.h> 10 + #include <assert.h> 10 11 #include <libgen.h> 11 12 #include <bpf/bpf.h> 12 13 #include <scx/common.h> ··· 42 41 static void read_stats(struct scx_simple *skel, __u64 *stats) 43 42 { 44 43 int nr_cpus = libbpf_num_possible_cpus(); 44 + assert(nr_cpus > 0); 45 45 __u64 cnts[2][nr_cpus]; 46 46 __u32 idx; 47 47

Configure Feed

Configure Feed