Merge tag 'sched_ext-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+27 -3

Documentation/scheduler/sched-ext.rst

··· 43 43 CONFIG_DEBUG_INFO_BTF=y 44 44 CONFIG_BPF_JIT_ALWAYS_ON=y 45 45 CONFIG_BPF_JIT_DEFAULT_ON=y 46 - CONFIG_PAHOLE_HAS_BTF_TAG=y 47 46 48 47 sched_ext is used only when the BPF scheduler is loaded and running. 49 48 ··· 57 58 However, when the BPF scheduler is loaded and ``SCX_OPS_SWITCH_PARTIAL`` is 58 59 set in ``ops->flags``, only tasks with the ``SCHED_EXT`` policy are scheduled 59 60 by sched_ext, while tasks with ``SCHED_NORMAL``, ``SCHED_BATCH`` and 60 - ``SCHED_IDLE`` policies are scheduled by the fair-class scheduler. 61 + ``SCHED_IDLE`` policies are scheduled by the fair-class scheduler which has 62 + higher sched_class precedence than ``SCHED_EXT``. 61 63 62 64 Terminating the sched_ext scheduler program, triggering `SysRq-S`, or 63 65 detection of any internal error including stalled runnable tasks aborts the ··· 345 345 The functions prefixed with ``scx_bpf_`` can be called from the BPF 346 346 scheduler. 347 347 348 + * ``kernel/sched/ext_idle.c`` contains the built-in idle CPU selection policy. 349 + 348 350 * ``tools/sched_ext/`` hosts example BPF scheduler implementations. 349 351 350 352 * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a ··· 355 353 * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five 356 354 levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. 357 355 356 + * ``scx_central[.bpf].c``: A central FIFO scheduler where all scheduling 357 + decisions are made on one CPU, demonstrating ``LOCAL_ON`` dispatching, 358 + tickless operation, and kthread preemption. 359 + 360 + * ``scx_cpu0[.bpf].c``: A scheduler that queues all tasks to a shared DSQ 361 + and only dispatches them on CPU0 in FIFO order. Useful for testing bypass 362 + behavior. 363 + 364 + * ``scx_flatcg[.bpf].c``: A flattened cgroup hierarchy scheduler 365 + implementing hierarchical weight-based cgroup CPU control by compounding 366 + each cgroup's share at every level into a single flat scheduling layer. 
367 + 368 + * ``scx_pair[.bpf].c``: A core-scheduling example that always makes 369 + sibling CPU pairs execute tasks from the same CPU cgroup. 370 + 371 + * ``scx_sdt[.bpf].c``: A variation of ``scx_simple`` demonstrating BPF 372 + arena memory management for per-task data. 373 + 374 + * ``scx_userland[.bpf].c``: A minimal scheduler demonstrating user space 375 + scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order; 376 + all others are scheduled in user space by a simple vruntime scheduler. 377 + 358 378 ABI Instability 359 379 =============== 360 380 361 381 The APIs provided by sched_ext to BPF scheduler programs have no stability 362 382 guarantees. This includes the ops table callbacks and constants defined in 363 383 ``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in 364 - ``kernel/sched/ext.c``. 384 + ``kernel/sched/ext.c`` and ``kernel/sched/ext_idle.c``. 365 385 366 386 While we will attempt to provide a relatively stable API surface when 367 387 possible, they are subject to change without warning between kernel

+11 -11

kernel/sched/ext.c

··· 1103 1103 } 1104 1104 1105 1105 /* seq records the order tasks are queued, used by BPF DSQ iterator */ 1106 - dsq->seq++; 1106 + WRITE_ONCE(dsq->seq, dsq->seq + 1); 1107 1107 p->scx.dsq_seq = dsq->seq; 1108 1108 1109 1109 dsq_mod_nr(dsq, 1); ··· 1470 1470 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; 1471 1471 } 1472 1472 1473 - static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) 1473 + static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) 1474 1474 { 1475 1475 struct scx_sched *sch = scx_root; 1476 1476 int sticky_cpu = p->scx.sticky_cpu; 1477 + u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; 1477 1478 1478 1479 if (enq_flags & ENQUEUE_WAKEUP) 1479 1480 rq->scx.flags |= SCX_RQ_IN_WAKEUP; 1480 - 1481 - enq_flags |= rq->scx.extra_enq_flags; 1482 1481 1483 1482 if (sticky_cpu >= 0) 1484 1483 p->scx.sticky_cpu = -1; ··· 3907 3908 * consider offloading iff the total queued duration is over the 3908 3909 * threshold. 3909 3910 */ 3910 - min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; 3911 - if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) 3911 + min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; 3912 + if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) 3912 3913 return 0; 3913 3914 3914 3915 raw_spin_rq_lock_irq(rq); ··· 4136 4137 WARN_ON_ONCE(scx_bypass_depth <= 0); 4137 4138 if (scx_bypass_depth != 1) 4138 4139 goto unlock; 4139 - WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); 4140 + WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); 4140 4141 bypass_timestamp = ktime_get_ns(); 4141 4142 if (sch) 4142 4143 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); ··· 5258 5259 if (!READ_ONCE(helper)) { 5259 5260 mutex_lock(&helper_mutex); 5260 5261 if (!helper) { 5261 - helper = kthread_run_worker(0, "scx_enable_helper"); 5262 - if (IS_ERR_OR_NULL(helper)) { 5263 - helper = NULL; 5262 
+ struct kthread_worker *w = 5263 + kthread_run_worker(0, "scx_enable_helper"); 5264 + if (IS_ERR_OR_NULL(w)) { 5264 5265 mutex_unlock(&helper_mutex); 5265 5266 return -ENOMEM; 5266 5267 } 5267 - sched_set_fifo(helper->task); 5268 + sched_set_fifo(w->task); 5269 + WRITE_ONCE(helper, w); 5268 5270 } 5269 5271 mutex_unlock(&helper_mutex); 5270 5272 }

+98 -16

kernel/sched/ext_internal.h

··· 1035 1035 }; 1036 1036 1037 1037 /* 1038 - * sched_ext_entity->ops_state 1038 + * Task Ownership State Machine (sched_ext_entity->ops_state) 1039 1039 * 1040 - * Used to track the task ownership between the SCX core and the BPF scheduler. 1041 - * State transitions look as follows: 1040 + * The sched_ext core uses this state machine to track task ownership 1041 + * between the SCX core and the BPF scheduler. This allows the BPF 1042 + * scheduler to dispatch tasks without strict ordering requirements, while 1043 + * the SCX core safely rejects invalid dispatches. 1042 1044 * 1043 - * NONE -> QUEUEING -> QUEUED -> DISPATCHING 1044 - * ^ | | 1045 - * | v v 1046 - * \-------------------------------/ 1045 + * State Transitions 1047 1046 * 1048 - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call 1049 - * sites for explanations on the conditions being waited upon and why they are 1050 - * safe. Transitions out of them into NONE or QUEUED must store_release and the 1051 - * waiters should load_acquire. 1047 + * .------------> NONE (owned by SCX core) 1048 + * | | ^ 1049 + * | enqueue | | direct dispatch 1050 + * | v | 1051 + * | QUEUEING -------' 1052 + * | | 1053 + * | enqueue | 1054 + * | completes | 1055 + * | v 1056 + * | QUEUED (owned by BPF scheduler) 1057 + * | | 1058 + * | dispatch | 1059 + * | | 1060 + * | v 1061 + * | DISPATCHING 1062 + * | | 1063 + * | dispatch | 1064 + * | completes | 1065 + * `---------------' 1052 1066 * 1053 - * Tracking scx_ops_state enables sched_ext core to reliably determine whether 1054 - * any given task can be dispatched by the BPF scheduler at all times and thus 1055 - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler 1056 - * to try to dispatch any task anytime regardless of its state as the SCX core 1057 - * can safely reject invalid dispatches. 1067 + * State Descriptions 1068 + * 1069 + * - %SCX_OPSS_NONE: 1070 + * Task is owned by the SCX core. 
It's either on a run queue, running, 1071 + * or being manipulated by the core scheduler. The BPF scheduler has no 1072 + * claim on this task. 1073 + * 1074 + * - %SCX_OPSS_QUEUEING: 1075 + * Transitional state while transferring a task from the SCX core to 1076 + * the BPF scheduler. The task's rq lock is held during this state. 1077 + * Since QUEUEING is both entered and exited under the rq lock, dequeue 1078 + * can never observe this state (it would be a BUG). When finishing a 1079 + * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion 1080 + * path busy-waits for it to leave this state (via wait_ops_state()) 1081 + * before retrying. 1082 + * 1083 + * - %SCX_OPSS_QUEUED: 1084 + * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue) 1085 + * and the BPF scheduler is responsible for dispatching it. A QSEQ 1086 + * (queue sequence number) is embedded in this state to detect 1087 + * dispatch/dequeue races: if a task is dequeued and re-enqueued, the 1088 + * QSEQ changes and any in-flight dispatch operations targeting the old 1089 + * QSEQ are safely ignored. 1090 + * 1091 + * - %SCX_OPSS_DISPATCHING: 1092 + * Transitional state while transferring a task from the BPF scheduler 1093 + * back to the SCX core. This state indicates the BPF scheduler has 1094 + * selected the task for execution. When dequeue needs to take the task 1095 + * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path 1096 + * busy-waits for it to leave this state (via wait_ops_state()) before 1097 + * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes. 1098 + * 1099 + * Memory Ordering 1100 + * 1101 + * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into 1102 + * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release() 1103 + * and waiters must use atomic_long_read_acquire(). This ensures proper 1104 + * synchronization between concurrent operations. 
1105 + * 1106 + * Cross-CPU Task Migration 1107 + * 1108 + * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply 1109 + * grab the target CPU's rq lock because a concurrent dequeue might be 1110 + * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock 1111 + * (deadlock). 1112 + * 1113 + * The sched_ext core uses a "lock dancing" protocol coordinated by 1114 + * p->scx.holding_cpu. When moving a task to a different rq: 1115 + * 1116 + * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.) 1117 + * 2. Set p->scx.holding_cpu to the current CPU 1118 + * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING 1119 + * is set, so clearing DISPATCHING first prevents the circular wait 1120 + * (safe to lock the rq we need) 1121 + * 4. Unlock the current CPU's rq 1122 + * 5. Lock src_rq (where the task currently lives) 1123 + * 6. Verify p->scx.holding_cpu == current CPU, if not, dequeue won the 1124 + * race (dequeue clears holding_cpu to -1 when it takes the task), in 1125 + * this case migration is aborted 1126 + * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly 1127 + * into dst_rq's local DSQ (no lock swap needed) 1128 + * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases 1129 + * src_rq, locks dst_rq, and performs the deactivate/activate 1130 + * migration cycle (dst_rq is held on return) 1131 + * 9. Unlock dst_rq and re-lock the current CPU's rq to restore 1132 + * the lock state expected by the caller 1133 + * 1134 + * If any verification fails, abort the migration. 1135 + * 1136 + * This state tracking allows the BPF scheduler to try to dispatch any task 1137 + * at any time regardless of its state. The SCX core can safely 1138 + * reject/ignore invalid dispatches, simplifying the BPF scheduler 1139 + * implementation. 1058 1140 */ 1059 1141 enum scx_ops_state { 1060 1142 SCX_OPSS_NONE, /* owned by the SCX core */

+2 -2

tools/testing/selftests/sched_ext/util.c

··· 60 60 char buf[64]; 61 61 int ret; 62 62 63 - ret = sprintf(buf, "%lu", val); 63 + ret = sprintf(buf, "%ld", val); 64 64 if (ret < 0) 65 65 return ret; 66 66 67 - if (write_text(path, buf, sizeof(buf)) <= 0) 67 + if (write_text(path, buf, ret) <= 0) 68 68 return -1; 69 69 70 70 return 0;

Configure Feed

Configure Feed