Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo:

- Improve recovery from misbehaving BPF schedulers.

When a scheduler puts many tasks with varying affinity restrictions
on a shared DSQ, CPUs scanning through tasks they cannot run can
overwhelm the system, causing lockups.

Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
and hooks into the hardlockup detector to attempt recovery.

Add scx_cpu0 example scheduler to demonstrate this scenario.
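
In the kernel/sched/ext.c hunk further down, this lands as a new per-CPU
bypass_dsq plus a periodic per-NUMA-node load-balance timer
(scx_bypass_lb_timerfn). Condensed from that hunk, surrounding code elided:

    /* enqueue side: bypass mode now targets the enqueueing CPU's own DSQ */
    if (scx_rq_bypassing(rq)) {
            __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
            goto bypass;                            /* was: goto global */
    }
    ...
    bypass:
            dsq = &task_rq(p)->scx.bypass_dsq;
            goto enqueue;

    /* dispatch side: in bypass mode only the per-CPU bypass DSQ is consumed */
    if (scx_rq_bypassing(rq)) {
            if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
                    goto has_tasks;
            else
                    goto no_tasks;
    }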

- Add lockless peek operation for DSQs to reduce lock contention for
schedulers that need to query queue state during load balancing.
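
Only the kernel-side plumbing is visible in this excerpt: user DSQs gain an
RCU-protected first_task pointer that the enqueue/dequeue paths keep up to
date. The BPF-facing peek helper itself is not part of the shown diff; a
minimal sketch of the reader pattern the field enables (inspect_head() is a
hypothetical placeholder):

    struct task_struct *p;

    rcu_read_lock();
    p = rcu_dereference(dsq->first_task);   /* may be NULL or already stale */
    if (p)
            inspect_head(p);                /* e.g. read p->scx.dsq_vtime */
    rcu_read_unlock();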

- Allow scx_bpf_reenqueue_local() to be called from anywhere in
preparation for deprecating cpu_acquire/release() callbacks in favor
of generic BPF hooks.
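
In the kernel/sched/ext.c hunk below, requests made from contexts that cannot
re-enqueue immediately are recorded on the rq and serviced from the
deferred-work path (the setter side of reenq_local_deferred is not part of
this excerpt):

    static void run_deferred(struct rq *rq)
    {
            process_ddsp_deferred_locals(rq);

            if (local_read(&rq->scx.reenq_local_deferred)) {
                    local_set(&rq->scx.reenq_local_deferred, 0);
                    reenq_local(rq);
            }
    }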

- Prepare for hierarchical scheduler support: add
scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
structs for future aux__prog parameter.
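
The vtime variant now takes its scalar arguments via a struct
(scx_bpf_dsq_insert_vtime_args) to stay within BPF's five-argument kfunc
limit; per the kfunc comment, BPF programs keep calling
scx_bpf_dsq_insert_vtime() through an inline wrapper in common.bpf.h. That
wrapper is not part of the shown diff; a sketch of what it plausibly looks
like, mirroring the kernel-side struct:

    static inline bool scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
                                                u64 slice, u64 vtime, u64 enq_flags)
    {
            struct scx_bpf_dsq_insert_vtime_args args = {
                    .dsq_id = dsq_id,
                    .slice = slice,
                    .vtime = vtime,
                    .enq_flags = enq_flags,
            };

            return __scx_bpf_dsq_insert_vtime(p, &args);
    }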

- Implement cgroup_set_idle() callback to notify BPF schedulers when a
cgroup's idle state changes.
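
The kernel side (scx_group_set_idle() in the diff) forwards the cgroup and its
new idle state to the scheduler. A hedged sketch of a scheduler-side handler;
the signature matches the sched_ext_ops stub added in kernel/sched/ext.c,
while the body and note_cgroup_idle() are made up for illustration:

    void BPF_STRUCT_OPS(myops_cgroup_set_idle, struct cgroup *cgrp, bool idle)
    {
            /* e.g. skip this cgroup's DSQs while it is marked idle */
            note_cgroup_idle(cgrp->kn->id, idle);
    }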

- Fix migration tasks being incorrectly downgraded from
stop_sched_class to rt_sched_class across sched_ext enable/disable.
Applied late as the fix is low risk; the bug is subtle but needs stable
backporting.
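
The fix adds a small helper (visible in the kernel/sched/ext.c hunk) that
preserves stop_sched_class instead of recomputing the class from policy/prio,
which would map the per-CPU migration kthreads back to rt_sched_class:

    static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
    {
            if (p->sched_class == &stop_sched_class)
                    return &stop_sched_class;

            return __setscheduler_class(p->policy, p->prio);
    }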

- Various fixes and cleanups including cgroup exit ordering,
SCX_KICK_WAIT reliability, and backward compatibility improvements.
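
On the SCX_KICK_WAIT side, the per-CPU pnt_seq counter is renamed to
kick_sync, bumped from both pick_task_scx() and put_prev_task_scx(), and the
open-coded busy-wait becomes smp_cond_load_acquire(). Condensed from the
kernel/sched/ext.c hunk:

    /* kicker, in kick_cpus_irq_workfn(): wait until the kicked CPU has gone
     * through a scheduling point or been taken by a higher sched class */
    if (cpu != cpu_of(this_rq))
            smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);

    /* kicked CPU, in pick_task_scx() / put_prev_task_scx() */
    smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);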

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
sched_ext: tools: Removing duplicate targets during non-cross compilation
sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
sched_ext: Update comments replacing breather with aborting mechanism
sched_ext: Implement load balancer for bypass mode
sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
sched_ext: Add scx_cpu0 example scheduler
sched_ext: Hook up hardlockup detector
sched_ext: Make handle_lockup() propagate scx_verror() result
sched_ext: Refactor lockup handlers into handle_lockup()
sched_ext: Make scx_exit() and scx_vexit() return bool
sched_ext: Exit dispatch and move operations immediately when aborting
sched_ext: Simplify breather mechanism with scx_aborting flag
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
sched_ext: Refactor do_enqueue_task() local and global DSQ paths
sched_ext: Use shorter slice in bypass mode
sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
sched_ext: Minor cleanups to scx_task_iter
...

+1905 -423
+25 -2
include/linux/sched/ext.h
··· 17 17 enum scx_public_consts { 18 18 SCX_OPS_NAME_LEN = 128, 19 19 20 + /* 21 + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses 22 + * to set the slice for a task that is selected for execution. 23 + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice 24 + * refill has been triggered. 25 + * 26 + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass 27 + * mode. As making forward progress for all tasks is the main goal of 28 + * the bypass mode, a shorter slice is used. 29 + */ 20 30 SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ 31 + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ 21 32 SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ 22 33 }; 23 34 ··· 57 46 SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, 58 47 SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, 59 48 SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, 49 + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, 60 50 SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, 61 51 SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, 62 52 }; ··· 70 58 */ 71 59 struct scx_dispatch_q { 72 60 raw_spinlock_t lock; 61 + struct task_struct __rcu *first_task; /* lockless peek at head */ 73 62 struct list_head list; /* tasks in dispatch order */ 74 63 struct rb_root priq; /* used to order by p->scx.dsq_vtime */ 75 64 u32 nr; ··· 149 136 u32 priv; /* can be used by iter cursor */ 150 137 }; 151 138 139 + #define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ 140 + (struct scx_dsq_list_node) { \ 141 + .node = LIST_HEAD_INIT((__node).node), \ 142 + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ 143 + .priv = (__priv), \ 144 + } 145 + 152 146 /* 153 147 * The following is embedded in task_struct and contains all fields necessary 154 148 * for a task to be scheduled by SCX. ··· 227 207 struct list_head tasks_node; 228 208 }; 229 209 230 - void sched_ext_free(struct task_struct *p); 210 + void sched_ext_dead(struct task_struct *p); 231 211 void print_scx_info(const char *log_lvl, struct task_struct *p); 232 212 void scx_softlockup(u32 dur_s); 213 + bool scx_hardlockup(int cpu); 233 214 bool scx_rcu_cpu_stall(void); 234 215 235 216 #else /* !CONFIG_SCHED_CLASS_EXT */ 236 217 237 - static inline void sched_ext_free(struct task_struct *p) {} 218 + static inline void sched_ext_dead(struct task_struct *p) {} 238 219 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} 239 220 static inline void scx_softlockup(u32 dur_s) {} 221 + static inline bool scx_hardlockup(int cpu) { return false; } 240 222 static inline bool scx_rcu_cpu_stall(void) { return false; } 241 223 242 224 #endif /* CONFIG_SCHED_CLASS_EXT */ ··· 250 228 u64 bw_period_us; 251 229 u64 bw_quota_us; 252 230 u64 bw_burst_us; 231 + bool idle; 253 232 #endif 254 233 }; 255 234
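
The new INIT_DSQ_LIST_CURSOR macro packages the cursor setup that DSQ
iteration needs; the bypass load balancer in the kernel/sched/ext.c hunk uses
it roughly as follows (condensed, rq locking elided):

    struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);

    raw_spin_lock(&dsq->lock);
    list_add(&cursor.node, &dsq->list);
    /* walk with nldsq_next_task(); every SCX_BYPASS_LB_BATCH tasks, move the
     * cursor with list_move_tail() and drop the lock to let others in */
    list_del_init(&cursor.node);
    raw_spin_unlock(&dsq->lock);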
+39
include/trace/events/sched_ext.h
··· 45 45 ) 46 46 ); 47 47 48 + TRACE_EVENT(sched_ext_bypass_lb, 49 + 50 + TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced, 51 + __u32 before_min, __u32 before_max, 52 + __u32 after_min, __u32 after_max), 53 + 54 + TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced, 55 + before_min, before_max, after_min, after_max), 56 + 57 + TP_STRUCT__entry( 58 + __field( __u32, node ) 59 + __field( __u32, nr_cpus ) 60 + __field( __u32, nr_tasks ) 61 + __field( __u32, nr_balanced ) 62 + __field( __u32, before_min ) 63 + __field( __u32, before_max ) 64 + __field( __u32, after_min ) 65 + __field( __u32, after_max ) 66 + ), 67 + 68 + TP_fast_assign( 69 + __entry->node = node; 70 + __entry->nr_cpus = nr_cpus; 71 + __entry->nr_tasks = nr_tasks; 72 + __entry->nr_balanced = nr_balanced; 73 + __entry->before_min = before_min; 74 + __entry->before_max = before_max; 75 + __entry->after_min = after_min; 76 + __entry->after_max = after_max; 77 + ), 78 + 79 + TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u", 80 + __entry->node, __entry->nr_cpus, 81 + __entry->nr_tasks, __entry->nr_balanced, 82 + __entry->before_min, __entry->after_min, 83 + __entry->before_max, __entry->after_max 84 + ) 85 + ); 86 + 48 87 #endif /* _TRACE_SCHED_EXT_H */ 49 88 50 89 /* This part must be outside protection */
-1
kernel/fork.c
··· 736 736 WARN_ON(tsk == current); 737 737 738 738 unwind_task_free(tsk); 739 - sched_ext_free(tsk); 740 739 io_uring_free(tsk); 741 740 cgroup_task_free(tsk); 742 741 task_numa_free(tsk, true);
+6
kernel/sched/core.c
··· 5143 5143 if (prev->sched_class->task_dead) 5144 5144 prev->sched_class->task_dead(prev); 5145 5145 5146 + /* 5147 + * sched_ext_dead() must come before cgroup_task_dead() to 5148 + * prevent cgroups from being removed while its member tasks are 5149 + * visible to SCX schedulers. 5150 + */ 5151 + sched_ext_dead(prev); 5146 5152 cgroup_task_dead(prev); 5147 5153 5148 5154 /* Task is done with its stack. */
+814 -285
kernel/sched/ext.c
··· 33 33 DEFINE_STATIC_KEY_FALSE(__scx_enabled); 34 34 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 35 35 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); 36 - static unsigned long scx_in_softlockup; 37 - static atomic_t scx_breather_depth = ATOMIC_INIT(0); 38 36 static int scx_bypass_depth; 37 + static cpumask_var_t scx_bypass_lb_donee_cpumask; 38 + static cpumask_var_t scx_bypass_lb_resched_cpumask; 39 + static bool scx_aborting; 39 40 static bool scx_init_task_enabled; 40 41 static bool scx_switching_all; 41 42 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); ··· 69 68 static struct delayed_work scx_watchdog_work; 70 69 71 70 /* 72 - * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence 71 + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence 73 72 * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 74 73 * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 75 74 * lazily when enabling and freed when disabling to avoid waste when sched_ext 76 75 * isn't active. 77 76 */ 78 - struct scx_kick_pseqs { 77 + struct scx_kick_syncs { 79 78 struct rcu_head rcu; 80 - unsigned long seqs[]; 79 + unsigned long syncs[]; 81 80 }; 82 81 83 - static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); 82 + static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); 84 83 85 84 /* 86 85 * Direct dispatch marker. ··· 144 143 /* /sys/kernel/sched_ext interface */ 145 144 static struct kset *scx_kset; 146 145 146 + /* 147 + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. 148 + * There usually is no reason to modify these as normal scheduler operation 149 + * shouldn't be affected by them. The knobs are primarily for debugging. 150 + */ 151 + static u64 scx_slice_dfl = SCX_SLICE_DFL; 152 + static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; 153 + static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; 154 + 155 + static int set_slice_us(const char *val, const struct kernel_param *kp) 156 + { 157 + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); 158 + } 159 + 160 + static const struct kernel_param_ops slice_us_param_ops = { 161 + .set = set_slice_us, 162 + .get = param_get_uint, 163 + }; 164 + 165 + static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) 166 + { 167 + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); 168 + } 169 + 170 + static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { 171 + .set = set_bypass_lb_intv_us, 172 + .get = param_get_uint, 173 + }; 174 + 175 + #undef MODULE_PARAM_PREFIX 176 + #define MODULE_PARAM_PREFIX "sched_ext." 
177 + 178 + module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); 179 + MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); 180 + module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); 181 + MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); 182 + 183 + #undef MODULE_PARAM_PREFIX 184 + 147 185 #define CREATE_TRACE_POINTS 148 186 #include <trace/events/sched_ext.h> 149 187 150 188 static void process_ddsp_deferred_locals(struct rq *rq); 189 + static u32 reenq_local(struct rq *rq); 151 190 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); 152 - static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 191 + static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, 153 192 s64 exit_code, const char *fmt, va_list args); 154 193 155 - static __printf(4, 5) void scx_exit(struct scx_sched *sch, 194 + static __printf(4, 5) bool scx_exit(struct scx_sched *sch, 156 195 enum scx_exit_kind kind, s64 exit_code, 157 196 const char *fmt, ...) 158 197 { 159 198 va_list args; 199 + bool ret; 160 200 161 201 va_start(args, fmt); 162 - scx_vexit(sch, kind, exit_code, fmt, args); 202 + ret = scx_vexit(sch, kind, exit_code, fmt, args); 163 203 va_end(args); 204 + 205 + return ret; 164 206 } 165 207 166 208 #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) 209 + #define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) 167 210 168 211 #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) 169 212 ··· 245 200 246 201 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) 247 202 { 248 - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); 203 + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); 204 + } 205 + 206 + static const struct sched_class *scx_setscheduler_class(struct task_struct *p) 207 + { 208 + if (p->sched_class == &stop_sched_class) 209 + return &stop_sched_class; 210 + 211 + return __setscheduler_class(p->policy, p->prio); 249 212 } 250 213 251 214 /* ··· 522 469 * RCU read lock or obtaining a reference count. 523 470 * 524 471 * All tasks which existed when the iteration started are guaranteed to be 525 - * visited as long as they still exist. 472 + * visited as long as they are not dead. 
526 473 */ 527 474 static void scx_task_iter_start(struct scx_task_iter *iter) 528 475 { 529 - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 530 - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 476 + memset(iter, 0, sizeof(*iter)); 531 477 532 478 raw_spin_lock_irq(&scx_tasks_lock); 533 479 534 480 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; 535 481 list_add(&iter->cursor.tasks_node, &scx_tasks); 536 - iter->locked_task = NULL; 537 - iter->cnt = 0; 538 482 iter->list_locked = true; 539 483 } 540 484 ··· 597 547 struct list_head *cursor = &iter->cursor.tasks_node; 598 548 struct sched_ext_entity *pos; 599 549 600 - __scx_task_iter_maybe_relock(iter); 601 - 602 550 if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { 603 551 scx_task_iter_unlock(iter); 604 552 cond_resched(); 605 - __scx_task_iter_maybe_relock(iter); 606 553 } 554 + 555 + __scx_task_iter_maybe_relock(iter); 607 556 608 557 list_for_each_entry(pos, cursor, tasks_node) { 609 558 if (&pos->tasks_node == &scx_tasks) ··· 804 755 static void run_deferred(struct rq *rq) 805 756 { 806 757 process_ddsp_deferred_locals(rq); 758 + 759 + if (local_read(&rq->scx.reenq_local_deferred)) { 760 + local_set(&rq->scx.reenq_local_deferred, 0); 761 + reenq_local(rq); 762 + } 807 763 } 808 764 809 765 static void deferred_bal_cb_workfn(struct rq *rq) ··· 829 775 * schedule_deferred - Schedule execution of deferred actions on an rq 830 776 * @rq: target rq 831 777 * 832 - * Schedule execution of deferred actions on @rq. Must be called with @rq 833 - * locked. Deferred actions are executed with @rq locked but unpinned, and thus 834 - * can unlock @rq to e.g. migrate tasks to other rqs. 778 + * Schedule execution of deferred actions on @rq. Deferred actions are executed 779 + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks 780 + * to other rqs. 835 781 */ 836 782 static void schedule_deferred(struct rq *rq) 783 + { 784 + /* 785 + * Queue an irq work. They are executed on IRQ re-enable which may take 786 + * a bit longer than the scheduler hook in schedule_deferred_locked(). 787 + */ 788 + irq_work_queue(&rq->scx.deferred_irq_work); 789 + } 790 + 791 + /** 792 + * schedule_deferred_locked - Schedule execution of deferred actions on an rq 793 + * @rq: target rq 794 + * 795 + * Schedule execution of deferred actions on @rq. Equivalent to 796 + * schedule_deferred() but requires @rq to be locked and can be more efficient. 797 + */ 798 + static void schedule_deferred_locked(struct rq *rq) 837 799 { 838 800 lockdep_assert_rq_held(rq); 839 801 ··· 882 812 } 883 813 884 814 /* 885 - * No scheduler hooks available. Queue an irq work. They are executed on 886 - * IRQ re-enable which may take a bit longer than the scheduler hooks. 887 - * The above WAKEUP and BALANCE paths should cover most of the cases and 888 - * the time to IRQ re-enable shouldn't be long. 815 + * No scheduler hooks available. Use the generic irq_work path. The 816 + * above WAKEUP and BALANCE paths should cover most of the cases and the 817 + * time to IRQ re-enable shouldn't be long. 
889 818 */ 890 - irq_work_queue(&rq->scx.deferred_irq_work); 819 + schedule_deferred(rq); 891 820 } 892 821 893 822 /** ··· 971 902 972 903 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) 973 904 { 974 - p->scx.slice = SCX_SLICE_DFL; 905 + p->scx.slice = READ_ONCE(scx_slice_dfl); 975 906 __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); 976 907 } 977 908 ··· 985 916 !RB_EMPTY_NODE(&p->scx.dsq_priq)); 986 917 987 918 if (!is_local) { 988 - raw_spin_lock(&dsq->lock); 919 + raw_spin_lock_nested(&dsq->lock, 920 + (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0); 921 + 989 922 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { 990 923 scx_error(sch, "attempting to dispatch to a destroyed dsq"); 991 924 /* fall back to the global dsq */ ··· 1036 965 container_of(rbp, struct task_struct, 1037 966 scx.dsq_priq); 1038 967 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); 968 + /* first task unchanged - no update needed */ 1039 969 } else { 1040 970 list_add(&p->scx.dsq_list.node, &dsq->list); 971 + /* not builtin and new task is at head - use fastpath */ 972 + rcu_assign_pointer(dsq->first_task, p); 1041 973 } 1042 974 } else { 1043 975 /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ ··· 1048 974 scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", 1049 975 dsq->id); 1050 976 1051 - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) 977 + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { 1052 978 list_add(&p->scx.dsq_list.node, &dsq->list); 1053 - else 979 + /* new task inserted at head - use fastpath */ 980 + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 981 + rcu_assign_pointer(dsq->first_task, p); 982 + } else { 983 + bool was_empty; 984 + 985 + was_empty = list_empty(&dsq->list); 1054 986 list_add_tail(&p->scx.dsq_list.node, &dsq->list); 987 + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) 988 + rcu_assign_pointer(dsq->first_task, p); 989 + } 1055 990 } 1056 991 1057 992 /* seq records the order tasks are queued, used by BPF DSQ iterator */ ··· 1117 1034 1118 1035 list_del_init(&p->scx.dsq_list.node); 1119 1036 dsq_mod_nr(dsq, -1); 1037 + 1038 + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { 1039 + struct task_struct *first_task; 1040 + 1041 + first_task = nldsq_next_task(dsq, NULL, false); 1042 + rcu_assign_pointer(dsq->first_task, first_task); 1043 + } 1120 1044 } 1121 1045 1122 1046 static void dispatch_dequeue(struct rq *rq, struct task_struct *p) 1123 1047 { 1124 1048 struct scx_dispatch_q *dsq = p->scx.dsq; 1125 1049 bool is_local = dsq == &rq->scx.local_dsq; 1050 + 1051 + lockdep_assert_rq_held(rq); 1126 1052 1127 1053 if (!dsq) { 1128 1054 /* ··· 1177 1085 1178 1086 if (!is_local) 1179 1087 raw_spin_unlock(&dsq->lock); 1088 + } 1089 + 1090 + /* 1091 + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq 1092 + * and dsq are locked. 
1093 + */ 1094 + static void dispatch_dequeue_locked(struct task_struct *p, 1095 + struct scx_dispatch_q *dsq) 1096 + { 1097 + lockdep_assert_rq_held(task_rq(p)); 1098 + lockdep_assert_held(&dsq->lock); 1099 + 1100 + task_unlink_from_dsq(p, dsq); 1101 + p->scx.dsq = NULL; 1180 1102 } 1181 1103 1182 1104 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, ··· 1298 1192 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); 1299 1193 list_add_tail(&p->scx.dsq_list.node, 1300 1194 &rq->scx.ddsp_deferred_locals); 1301 - schedule_deferred(rq); 1195 + schedule_deferred_locked(rq); 1302 1196 return; 1303 1197 } 1304 1198 ··· 1323 1217 { 1324 1218 struct scx_sched *sch = scx_root; 1325 1219 struct task_struct **ddsp_taskp; 1220 + struct scx_dispatch_q *dsq; 1326 1221 unsigned long qseq; 1327 1222 1328 1223 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); ··· 1342 1235 1343 1236 if (scx_rq_bypassing(rq)) { 1344 1237 __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); 1345 - goto global; 1238 + goto bypass; 1346 1239 } 1347 1240 1348 1241 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) ··· 1391 1284 direct: 1392 1285 direct_dispatch(sch, p, enq_flags); 1393 1286 return; 1394 - 1287 + local_norefill: 1288 + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); 1289 + return; 1395 1290 local: 1291 + dsq = &rq->scx.local_dsq; 1292 + goto enqueue; 1293 + global: 1294 + dsq = find_global_dsq(sch, p); 1295 + goto enqueue; 1296 + bypass: 1297 + dsq = &task_rq(p)->scx.bypass_dsq; 1298 + goto enqueue; 1299 + 1300 + enqueue: 1396 1301 /* 1397 1302 * For task-ordering, slice refill must be treated as implying the end 1398 1303 * of the current slice. Otherwise, the longer @p stays on the CPU, the ··· 1412 1293 */ 1413 1294 touch_core_sched(rq, p); 1414 1295 refill_task_slice_dfl(sch, p); 1415 - local_norefill: 1416 - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); 1417 - return; 1418 - 1419 - global: 1420 - touch_core_sched(rq, p); /* see the comment in local: */ 1421 - refill_task_slice_dfl(sch, p); 1422 - dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); 1296 + dispatch_enqueue(sch, dsq, p, enq_flags); 1423 1297 } 1424 1298 1425 1299 static bool task_runnable(const struct task_struct *p) ··· 1853 1741 * @p is going from a non-local DSQ to a non-local DSQ. As 1854 1742 * $src_dsq is already locked, do an abbreviated dequeue. 1855 1743 */ 1856 - task_unlink_from_dsq(p, src_dsq); 1857 - p->scx.dsq = NULL; 1744 + dispatch_dequeue_locked(p, src_dsq); 1858 1745 raw_spin_unlock(&src_dsq->lock); 1859 1746 1860 1747 dispatch_enqueue(sch, dst_dsq, p, enq_flags); ··· 1862 1751 return dst_rq; 1863 1752 } 1864 1753 1865 - /* 1866 - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly 1867 - * banging on the same DSQ on a large NUMA system to the point where switching 1868 - * to the bypass mode can take a long time. Inject artificial delays while the 1869 - * bypass mode is switching to guarantee timely completion. 
1870 - */ 1871 - static void scx_breather(struct rq *rq) 1872 - { 1873 - u64 until; 1874 - 1875 - lockdep_assert_rq_held(rq); 1876 - 1877 - if (likely(!atomic_read(&scx_breather_depth))) 1878 - return; 1879 - 1880 - raw_spin_rq_unlock(rq); 1881 - 1882 - until = ktime_get_ns() + NSEC_PER_MSEC; 1883 - 1884 - do { 1885 - int cnt = 1024; 1886 - while (atomic_read(&scx_breather_depth) && --cnt) 1887 - cpu_relax(); 1888 - } while (atomic_read(&scx_breather_depth) && 1889 - time_before64(ktime_get_ns(), until)); 1890 - 1891 - raw_spin_rq_lock(rq); 1892 - } 1893 - 1894 1754 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, 1895 1755 struct scx_dispatch_q *dsq) 1896 1756 { 1897 1757 struct task_struct *p; 1898 1758 retry: 1899 - /* 1900 - * This retry loop can repeatedly race against scx_bypass() dequeueing 1901 - * tasks from @dsq trying to put the system into the bypass mode. On 1902 - * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock 1903 - * the machine into soft lockups. Give a breather. 1904 - */ 1905 - scx_breather(rq); 1906 - 1907 1759 /* 1908 1760 * The caller can't expect to successfully consume a task if the task's 1909 1761 * addition to @dsq isn't guaranteed to be visible somehow. Test ··· 1879 1805 1880 1806 nldsq_for_each_task(p, dsq) { 1881 1807 struct rq *task_rq = task_rq(p); 1808 + 1809 + /* 1810 + * This loop can lead to multiple lockup scenarios, e.g. the BPF 1811 + * scheduler can put an enormous number of affinitized tasks into 1812 + * a contended DSQ, or the outer retry loop can repeatedly race 1813 + * against scx_bypass() dequeueing tasks from @dsq trying to put 1814 + * the system into the bypass mode. This can easily live-lock the 1815 + * machine. If aborting, exit from all non-bypass DSQs. 1816 + */ 1817 + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) 1818 + break; 1882 1819 1883 1820 if (rq == task_rq) { 1884 1821 task_unlink_from_dsq(p, dsq); ··· 2174 2089 if (consume_global_dsq(sch, rq)) 2175 2090 goto has_tasks; 2176 2091 2177 - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || 2178 - scx_rq_bypassing(rq) || !scx_rq_online(rq)) 2092 + if (scx_rq_bypassing(rq)) { 2093 + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) 2094 + goto has_tasks; 2095 + else 2096 + goto no_tasks; 2097 + } 2098 + 2099 + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) 2179 2100 goto no_tasks; 2180 2101 2181 2102 dspc->rq = rq; ··· 2332 2241 struct scx_sched *sch = scx_root; 2333 2242 const struct sched_class *next_class = next->sched_class; 2334 2243 2335 - /* 2336 - * Pairs with the smp_load_acquire() issued by a CPU in 2337 - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a 2338 - * resched. 
2339 - */ 2340 - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); 2341 2244 if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) 2342 2245 return; 2343 2246 ··· 2371 2286 struct task_struct *next) 2372 2287 { 2373 2288 struct scx_sched *sch = scx_root; 2289 + 2290 + /* see kick_cpus_irq_workfn() */ 2291 + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2292 + 2374 2293 update_curr_scx(rq); 2375 2294 2376 2295 /* see dequeue_task_scx() on why we skip when !QUEUED */ ··· 2421 2332 struct task_struct, scx.dsq_list.node); 2422 2333 } 2423 2334 2424 - static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 2335 + static struct task_struct * 2336 + do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) 2425 2337 { 2426 2338 struct task_struct *prev = rq->curr; 2427 2339 bool keep_prev, kick_idle = false; 2428 2340 struct task_struct *p; 2429 2341 2342 + /* see kick_cpus_irq_workfn() */ 2343 + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2344 + 2430 2345 rq_modified_clear(rq); 2346 + 2431 2347 rq_unpin_lock(rq, rf); 2432 2348 balance_one(rq, prev); 2433 2349 rq_repin_lock(rq, rf); 2434 2350 maybe_queue_balance_callback(rq); 2435 - if (rq_modified_above(rq, &ext_sched_class)) 2351 + 2352 + /* 2353 + * If any higher-priority sched class enqueued a runnable task on 2354 + * this rq during balance_one(), abort and return RETRY_TASK, so 2355 + * that the scheduler loop can restart. 2356 + * 2357 + * If @force_scx is true, always try to pick a SCHED_EXT task, 2358 + * regardless of any higher-priority sched classes activity. 2359 + */ 2360 + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) 2436 2361 return RETRY_TASK; 2437 2362 2438 2363 keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; ··· 2487 2384 } 2488 2385 2489 2386 return p; 2387 + } 2388 + 2389 + static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 2390 + { 2391 + return do_pick_task_scx(rq, rf, false); 2490 2392 } 2491 2393 2492 2394 #ifdef CONFIG_SCHED_CORE ··· 2950 2842 INIT_LIST_HEAD(&scx->runnable_node); 2951 2843 scx->runnable_at = jiffies; 2952 2844 scx->ddsp_dsq_id = SCX_DSQ_INVALID; 2953 - scx->slice = SCX_SLICE_DFL; 2845 + scx->slice = READ_ONCE(scx_slice_dfl); 2954 2846 } 2955 2847 2956 2848 void scx_pre_fork(struct task_struct *p) ··· 3016 2908 percpu_up_read(&scx_fork_rwsem); 3017 2909 } 3018 2910 3019 - void sched_ext_free(struct task_struct *p) 2911 + void sched_ext_dead(struct task_struct *p) 3020 2912 { 3021 2913 unsigned long flags; 3022 2914 ··· 3120 3012 tg->scx.weight = CGROUP_WEIGHT_DFL; 3121 3013 tg->scx.bw_period_us = default_bw_period_us(); 3122 3014 tg->scx.bw_quota_us = RUNTIME_INF; 3015 + tg->scx.idle = false; 3123 3016 } 3124 3017 3125 3018 int scx_tg_online(struct task_group *tg) ··· 3269 3160 3270 3161 void scx_group_set_idle(struct task_group *tg, bool idle) 3271 3162 { 3272 - /* TODO: Implement ops->cgroup_set_idle() */ 3163 + struct scx_sched *sch = scx_root; 3164 + 3165 + percpu_down_read(&scx_cgroup_ops_rwsem); 3166 + 3167 + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) 3168 + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, 3169 + tg_cgrp(tg), idle); 3170 + 3171 + /* Update the task group's idle state */ 3172 + tg->scx.idle = idle; 3173 + 3174 + percpu_up_read(&scx_cgroup_ops_rwsem); 3273 3175 } 3274 3176 3275 3177 void scx_group_set_bandwidth(struct task_group *tg, ··· 3695 3575 } 3696 3576 3697 3577 /** 3578 + * handle_lockup - sched_ext common lockup handler 3579 + * @fmt: format 
string 3580 + * 3581 + * Called on system stall or lockup condition and initiates abort of sched_ext 3582 + * if enabled, which may resolve the reported lockup. 3583 + * 3584 + * Returns %true if sched_ext is enabled and abort was initiated, which may 3585 + * resolve the lockup. %false if sched_ext is not enabled or abort was already 3586 + * initiated by someone else. 3587 + */ 3588 + static __printf(1, 2) bool handle_lockup(const char *fmt, ...) 3589 + { 3590 + struct scx_sched *sch; 3591 + va_list args; 3592 + bool ret; 3593 + 3594 + guard(rcu)(); 3595 + 3596 + sch = rcu_dereference(scx_root); 3597 + if (unlikely(!sch)) 3598 + return false; 3599 + 3600 + switch (scx_enable_state()) { 3601 + case SCX_ENABLING: 3602 + case SCX_ENABLED: 3603 + va_start(args, fmt); 3604 + ret = scx_verror(sch, fmt, args); 3605 + va_end(args); 3606 + return ret; 3607 + default: 3608 + return false; 3609 + } 3610 + } 3611 + 3612 + /** 3698 3613 * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler 3699 3614 * 3700 3615 * While there are various reasons why RCU CPU stalls can occur on a system 3701 3616 * that may not be caused by the current BPF scheduler, try kicking out the 3702 3617 * current scheduler in an attempt to recover the system to a good state before 3703 3618 * issuing panics. 3619 + * 3620 + * Returns %true if sched_ext is enabled and abort was initiated, which may 3621 + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone 3622 + * else already initiated abort. 3704 3623 */ 3705 3624 bool scx_rcu_cpu_stall(void) 3706 3625 { 3707 - struct scx_sched *sch; 3708 - 3709 - rcu_read_lock(); 3710 - 3711 - sch = rcu_dereference(scx_root); 3712 - if (unlikely(!sch)) { 3713 - rcu_read_unlock(); 3714 - return false; 3715 - } 3716 - 3717 - switch (scx_enable_state()) { 3718 - case SCX_ENABLING: 3719 - case SCX_ENABLED: 3720 - break; 3721 - default: 3722 - rcu_read_unlock(); 3723 - return false; 3724 - } 3725 - 3726 - scx_error(sch, "RCU CPU stall detected!"); 3727 - rcu_read_unlock(); 3728 - 3729 - return true; 3626 + return handle_lockup("RCU CPU stall detected!"); 3730 3627 } 3731 3628 3732 3629 /** ··· 3754 3617 * live-lock the system by making many CPUs target the same DSQ to the point 3755 3618 * where soft-lockup detection triggers. This function is called from 3756 3619 * soft-lockup watchdog when the triggering point is close and tries to unjam 3757 - * the system by enabling the breather and aborting the BPF scheduler. 3620 + * the system and aborting the BPF scheduler. 3758 3621 */ 3759 3622 void scx_softlockup(u32 dur_s) 3760 3623 { 3761 - struct scx_sched *sch; 3624 + if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) 3625 + return; 3762 3626 3763 - rcu_read_lock(); 3627 + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", 3628 + smp_processor_id(), dur_s); 3629 + } 3764 3630 3765 - sch = rcu_dereference(scx_root); 3766 - if (unlikely(!sch)) 3767 - goto out_unlock; 3631 + /** 3632 + * scx_hardlockup - sched_ext hardlockup handler 3633 + * 3634 + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting 3635 + * numerous affinitized tasks in a single queue and directing all CPUs at it. 3636 + * Try kicking out the current scheduler in an attempt to recover the system to 3637 + * a good state before taking more drastic actions. 3638 + * 3639 + * Returns %true if sched_ext is enabled and abort was initiated, which may 3640 + * resolve the reported hardlockdup. 
%false if sched_ext is not enabled or 3641 + * someone else already initiated abort. 3642 + */ 3643 + bool scx_hardlockup(int cpu) 3644 + { 3645 + if (!handle_lockup("hard lockup - CPU %d", cpu)) 3646 + return false; 3768 3647 3769 - switch (scx_enable_state()) { 3770 - case SCX_ENABLING: 3771 - case SCX_ENABLED: 3772 - break; 3773 - default: 3774 - goto out_unlock; 3775 - } 3648 + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", 3649 + cpu); 3650 + return true; 3651 + } 3776 3652 3777 - /* allow only one instance, cleared at the end of scx_bypass() */ 3778 - if (test_and_set_bit(0, &scx_in_softlockup)) 3779 - goto out_unlock; 3780 - 3781 - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", 3782 - smp_processor_id(), dur_s, scx_root->ops.name); 3653 + static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, 3654 + struct cpumask *donee_mask, struct cpumask *resched_mask, 3655 + u32 nr_donor_target, u32 nr_donee_target) 3656 + { 3657 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3658 + struct task_struct *p, *n; 3659 + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); 3660 + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; 3661 + u32 nr_balanced = 0, min_delta_us; 3783 3662 3784 3663 /* 3785 - * Some CPUs may be trapped in the dispatch paths. Enable breather 3786 - * immediately; otherwise, we might even be able to get to scx_bypass(). 3664 + * All we want to guarantee is reasonable forward progress. No reason to 3665 + * fine tune. Assuming every task on @donor_dsq runs their full slice, 3666 + * consider offloading iff the total queued duration is over the 3667 + * threshold. 3787 3668 */ 3788 - atomic_inc(&scx_breather_depth); 3669 + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; 3670 + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) 3671 + return 0; 3789 3672 3790 - scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); 3791 - out_unlock: 3792 - rcu_read_unlock(); 3673 + raw_spin_rq_lock_irq(rq); 3674 + raw_spin_lock(&donor_dsq->lock); 3675 + list_add(&cursor.node, &donor_dsq->list); 3676 + resume: 3677 + n = container_of(&cursor, struct task_struct, scx.dsq_list); 3678 + n = nldsq_next_task(donor_dsq, n, false); 3679 + 3680 + while ((p = n)) { 3681 + struct rq *donee_rq; 3682 + struct scx_dispatch_q *donee_dsq; 3683 + int donee; 3684 + 3685 + n = nldsq_next_task(donor_dsq, n, false); 3686 + 3687 + if (donor_dsq->nr <= nr_donor_target) 3688 + break; 3689 + 3690 + if (cpumask_empty(donee_mask)) 3691 + break; 3692 + 3693 + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); 3694 + if (donee >= nr_cpu_ids) 3695 + continue; 3696 + 3697 + donee_rq = cpu_rq(donee); 3698 + donee_dsq = &donee_rq->scx.bypass_dsq; 3699 + 3700 + /* 3701 + * $p's rq is not locked but $p's DSQ lock protects its 3702 + * scheduling properties making this test safe. 3703 + */ 3704 + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) 3705 + continue; 3706 + 3707 + /* 3708 + * Moving $p from one non-local DSQ to another. The source rq 3709 + * and DSQ are already locked. Do an abbreviated dequeue and 3710 + * then perform enqueue without unlocking $donor_dsq. 3711 + * 3712 + * We don't want to drop and reacquire the lock on each 3713 + * iteration as @donor_dsq can be very long and potentially 3714 + * highly contended. Donee DSQs are less likely to be contended. 
3715 + * The nested locking is safe as only this LB moves tasks 3716 + * between bypass DSQs. 3717 + */ 3718 + dispatch_dequeue_locked(p, donor_dsq); 3719 + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); 3720 + 3721 + /* 3722 + * $donee might have been idle and need to be woken up. No need 3723 + * to be clever. Kick every CPU that receives tasks. 3724 + */ 3725 + cpumask_set_cpu(donee, resched_mask); 3726 + 3727 + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) 3728 + cpumask_clear_cpu(donee, donee_mask); 3729 + 3730 + nr_balanced++; 3731 + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { 3732 + list_move_tail(&cursor.node, &n->scx.dsq_list.node); 3733 + raw_spin_unlock(&donor_dsq->lock); 3734 + raw_spin_rq_unlock_irq(rq); 3735 + cpu_relax(); 3736 + raw_spin_rq_lock_irq(rq); 3737 + raw_spin_lock(&donor_dsq->lock); 3738 + goto resume; 3739 + } 3740 + } 3741 + 3742 + list_del_init(&cursor.node); 3743 + raw_spin_unlock(&donor_dsq->lock); 3744 + raw_spin_rq_unlock_irq(rq); 3745 + 3746 + return nr_balanced; 3793 3747 } 3794 3748 3795 - static void scx_clear_softlockup(void) 3749 + static void bypass_lb_node(struct scx_sched *sch, int node) 3796 3750 { 3797 - if (test_and_clear_bit(0, &scx_in_softlockup)) 3798 - atomic_dec(&scx_breather_depth); 3751 + const struct cpumask *node_mask = cpumask_of_node(node); 3752 + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; 3753 + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; 3754 + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; 3755 + u32 nr_target, nr_donor_target; 3756 + u32 before_min = U32_MAX, before_max = 0; 3757 + u32 after_min = U32_MAX, after_max = 0; 3758 + int cpu; 3759 + 3760 + /* count the target tasks and CPUs */ 3761 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3762 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3763 + 3764 + nr_tasks += nr; 3765 + nr_cpus++; 3766 + 3767 + before_min = min(nr, before_min); 3768 + before_max = max(nr, before_max); 3769 + } 3770 + 3771 + if (!nr_cpus) 3772 + return; 3773 + 3774 + /* 3775 + * We don't want CPUs to have more than $nr_donor_target tasks and 3776 + * balancing to fill donee CPUs upto $nr_target. Once targets are 3777 + * calculated, find the donee CPUs. 
3778 + */ 3779 + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); 3780 + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); 3781 + 3782 + cpumask_clear(donee_mask); 3783 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3784 + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) 3785 + cpumask_set_cpu(cpu, donee_mask); 3786 + } 3787 + 3788 + /* iterate !donee CPUs and see if they should be offloaded */ 3789 + cpumask_clear(resched_mask); 3790 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3791 + struct rq *rq = cpu_rq(cpu); 3792 + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; 3793 + 3794 + if (cpumask_empty(donee_mask)) 3795 + break; 3796 + if (cpumask_test_cpu(cpu, donee_mask)) 3797 + continue; 3798 + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) 3799 + continue; 3800 + 3801 + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, 3802 + nr_donor_target, nr_target); 3803 + } 3804 + 3805 + for_each_cpu(cpu, resched_mask) { 3806 + struct rq *rq = cpu_rq(cpu); 3807 + 3808 + raw_spin_rq_lock_irq(rq); 3809 + resched_curr(rq); 3810 + raw_spin_rq_unlock_irq(rq); 3811 + } 3812 + 3813 + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { 3814 + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); 3815 + 3816 + after_min = min(nr, after_min); 3817 + after_max = max(nr, after_max); 3818 + 3819 + } 3820 + 3821 + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, 3822 + before_min, before_max, after_min, after_max); 3799 3823 } 3824 + 3825 + /* 3826 + * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine 3827 + * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some 3828 + * bypass DSQs can be overloaded. If there are enough tasks to saturate other 3829 + * lightly loaded CPUs, such imbalance can lead to very high execution latency 3830 + * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such 3831 + * outcomes, a simple load balancing mechanism is implemented by the following 3832 + * timer which runs periodically while bypass mode is in effect. 
3833 + */ 3834 + static void scx_bypass_lb_timerfn(struct timer_list *timer) 3835 + { 3836 + struct scx_sched *sch; 3837 + int node; 3838 + u32 intv_us; 3839 + 3840 + sch = rcu_dereference_all(scx_root); 3841 + if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) 3842 + return; 3843 + 3844 + for_each_node_with_cpus(node) 3845 + bypass_lb_node(sch, node); 3846 + 3847 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 3848 + if (intv_us) 3849 + mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); 3850 + } 3851 + 3852 + static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); 3800 3853 3801 3854 /** 3802 3855 * scx_bypass - [Un]bypass scx_ops and guarantee forward progress ··· 4031 3704 sch = rcu_dereference_bh(scx_root); 4032 3705 4033 3706 if (bypass) { 4034 - scx_bypass_depth++; 3707 + u32 intv_us; 3708 + 3709 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); 4035 3710 WARN_ON_ONCE(scx_bypass_depth <= 0); 4036 3711 if (scx_bypass_depth != 1) 4037 3712 goto unlock; 3713 + WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); 4038 3714 bypass_timestamp = ktime_get_ns(); 4039 3715 if (sch) 4040 3716 scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); 3717 + 3718 + intv_us = READ_ONCE(scx_bypass_lb_intv_us); 3719 + if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { 3720 + scx_bypass_lb_timer.expires = 3721 + jiffies + usecs_to_jiffies(intv_us); 3722 + add_timer_global(&scx_bypass_lb_timer); 3723 + } 4041 3724 } else { 4042 - scx_bypass_depth--; 3725 + WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); 4043 3726 WARN_ON_ONCE(scx_bypass_depth < 0); 4044 3727 if (scx_bypass_depth != 0) 4045 3728 goto unlock; 3729 + WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); 4046 3730 if (sch) 4047 3731 scx_add_event(sch, SCX_EV_BYPASS_DURATION, 4048 3732 ktime_get_ns() - bypass_timestamp); 4049 3733 } 4050 - 4051 - atomic_inc(&scx_breather_depth); 4052 3734 4053 3735 /* 4054 3736 * No task property is changing. 
We just need to make sure all currently ··· 4114 3778 raw_spin_rq_unlock(rq); 4115 3779 } 4116 3780 4117 - atomic_dec(&scx_breather_depth); 4118 3781 unlock: 4119 3782 raw_spin_unlock_irqrestore(&bypass_lock, flags); 4120 - scx_clear_softlockup(); 4121 3783 } 4122 3784 4123 3785 static void free_exit_info(struct scx_exit_info *ei) ··· 4168 3834 } 4169 3835 } 4170 3836 4171 - static void free_kick_pseqs_rcu(struct rcu_head *rcu) 4172 - { 4173 - struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); 4174 - 4175 - kvfree(pseqs); 4176 - } 4177 - 4178 - static void free_kick_pseqs(void) 3837 + static void free_kick_syncs(void) 4179 3838 { 4180 3839 int cpu; 4181 3840 4182 3841 for_each_possible_cpu(cpu) { 4183 - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 4184 - struct scx_kick_pseqs *to_free; 3842 + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 3843 + struct scx_kick_syncs *to_free; 4185 3844 4186 - to_free = rcu_replace_pointer(*pseqs, NULL, true); 3845 + to_free = rcu_replace_pointer(*ksyncs, NULL, true); 4187 3846 if (to_free) 4188 - call_rcu(&to_free->rcu, free_kick_pseqs_rcu); 3847 + kvfree_rcu(to_free, rcu); 4189 3848 } 4190 3849 } 4191 3850 ··· 4203 3876 4204 3877 /* guarantee forward progress by bypassing scx_ops */ 4205 3878 scx_bypass(true); 3879 + WRITE_ONCE(scx_aborting, false); 4206 3880 4207 3881 switch (scx_set_enable_state(SCX_DISABLING)) { 4208 3882 case SCX_DISABLING: ··· 4248 3920 while ((p = scx_task_iter_next_locked(&sti))) { 4249 3921 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 4250 3922 const struct sched_class *old_class = p->sched_class; 4251 - const struct sched_class *new_class = 4252 - __setscheduler_class(p->policy, p->prio); 3923 + const struct sched_class *new_class = scx_setscheduler_class(p); 4253 3924 4254 3925 update_rq_clock(task_rq(p)); 4255 3926 ··· 4316 3989 free_percpu(scx_dsp_ctx); 4317 3990 scx_dsp_ctx = NULL; 4318 3991 scx_dsp_max_batch = 0; 4319 - free_kick_pseqs(); 3992 + free_kick_syncs(); 4320 3993 4321 3994 mutex_unlock(&scx_enable_mutex); 4322 3995 ··· 4325 3998 scx_bypass(false); 4326 3999 } 4327 4000 4328 - static void scx_disable(enum scx_exit_kind kind) 4001 + static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 4329 4002 { 4330 4003 int none = SCX_EXIT_NONE; 4004 + 4005 + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 4006 + return false; 4007 + 4008 + /* 4009 + * Some CPUs may be trapped in the dispatch paths. Set the aborting 4010 + * flag to break potential live-lock scenarios, ensuring we can 4011 + * successfully reach scx_bypass(). 
4012 + */ 4013 + WRITE_ONCE(scx_aborting, true); 4014 + return true; 4015 + } 4016 + 4017 + static void scx_disable(enum scx_exit_kind kind) 4018 + { 4331 4019 struct scx_sched *sch; 4332 4020 4333 4021 if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) ··· 4351 4009 rcu_read_lock(); 4352 4010 sch = rcu_dereference(scx_root); 4353 4011 if (sch) { 4354 - atomic_try_cmpxchg(&sch->exit_kind, &none, kind); 4012 + scx_claim_exit(sch, kind); 4355 4013 kthread_queue_work(sch->helper, &sch->disable_work); 4356 4014 } 4357 4015 rcu_read_unlock(); ··· 4580 4238 seq_buf_init(&ns, buf, avail); 4581 4239 4582 4240 dump_newline(&ns); 4583 - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", 4241 + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", 4584 4242 cpu, rq->scx.nr_running, rq->scx.flags, 4585 4243 rq->scx.cpu_released, rq->scx.ops_qseq, 4586 - rq->scx.pnt_seq); 4244 + rq->scx.kick_sync); 4587 4245 dump_line(&ns, " curr=%s[%d] class=%ps", 4588 4246 rq->curr->comm, rq->curr->pid, 4589 4247 rq->curr->sched_class); ··· 4667 4325 kthread_queue_work(sch->helper, &sch->disable_work); 4668 4326 } 4669 4327 4670 - static void scx_vexit(struct scx_sched *sch, 4328 + static bool scx_vexit(struct scx_sched *sch, 4671 4329 enum scx_exit_kind kind, s64 exit_code, 4672 4330 const char *fmt, va_list args) 4673 4331 { 4674 4332 struct scx_exit_info *ei = sch->exit_info; 4675 - int none = SCX_EXIT_NONE; 4676 4333 4677 - if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 4678 - return; 4334 + if (!scx_claim_exit(sch, kind)) 4335 + return false; 4679 4336 4680 4337 ei->exit_code = exit_code; 4681 4338 #ifdef CONFIG_STACKTRACE ··· 4691 4350 ei->reason = scx_exit_reason(ei->kind); 4692 4351 4693 4352 irq_work_queue(&sch->error_irq_work); 4353 + return true; 4694 4354 } 4695 4355 4696 - static int alloc_kick_pseqs(void) 4356 + static int alloc_kick_syncs(void) 4697 4357 { 4698 4358 int cpu; 4699 4359 ··· 4703 4361 * can exceed percpu allocator limits on large machines. 
4704 4362 */ 4705 4363 for_each_possible_cpu(cpu) { 4706 - struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 4707 - struct scx_kick_pseqs *new_pseqs; 4364 + struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu); 4365 + struct scx_kick_syncs *new_ksyncs; 4708 4366 4709 - WARN_ON_ONCE(rcu_access_pointer(*pseqs)); 4367 + WARN_ON_ONCE(rcu_access_pointer(*ksyncs)); 4710 4368 4711 - new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), 4712 - GFP_KERNEL, cpu_to_node(cpu)); 4713 - if (!new_pseqs) { 4714 - free_kick_pseqs(); 4369 + new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids), 4370 + GFP_KERNEL, cpu_to_node(cpu)); 4371 + if (!new_ksyncs) { 4372 + free_kick_syncs(); 4715 4373 return -ENOMEM; 4716 4374 } 4717 4375 4718 - rcu_assign_pointer(*pseqs, new_pseqs); 4376 + rcu_assign_pointer(*ksyncs, new_ksyncs); 4719 4377 } 4720 4378 4721 4379 return 0; ··· 4802 4460 return ERR_PTR(ret); 4803 4461 } 4804 4462 4805 - static void check_hotplug_seq(struct scx_sched *sch, 4463 + static int check_hotplug_seq(struct scx_sched *sch, 4806 4464 const struct sched_ext_ops *ops) 4807 4465 { 4808 4466 unsigned long long global_hotplug_seq; ··· 4819 4477 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, 4820 4478 "expected hotplug seq %llu did not match actual %llu", 4821 4479 ops->hotplug_seq, global_hotplug_seq); 4480 + return -EBUSY; 4822 4481 } 4823 4482 } 4483 + 4484 + return 0; 4824 4485 } 4825 4486 4826 4487 static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) ··· 4850 4505 if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) 4851 4506 pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); 4852 4507 4508 + if (ops->cpu_acquire || ops->cpu_release) 4509 + pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); 4510 + 4853 4511 return 0; 4854 4512 } 4855 4513 ··· 4877 4529 goto err_unlock; 4878 4530 } 4879 4531 4880 - ret = alloc_kick_pseqs(); 4532 + ret = alloc_kick_syncs(); 4881 4533 if (ret) 4882 4534 goto err_unlock; 4883 4535 4884 4536 sch = scx_alloc_and_add_sched(ops); 4885 4537 if (IS_ERR(sch)) { 4886 4538 ret = PTR_ERR(sch); 4887 - goto err_free_pseqs; 4539 + goto err_free_ksyncs; 4888 4540 } 4889 4541 4890 4542 /* ··· 4893 4545 */ 4894 4546 WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); 4895 4547 WARN_ON_ONCE(scx_root); 4548 + if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) 4549 + WRITE_ONCE(scx_aborting, false); 4896 4550 4897 4551 atomic_long_set(&scx_nr_rejected, 0); 4898 4552 ··· 4930 4580 if (((void (**)(void))ops)[i]) 4931 4581 set_bit(i, sch->has_op); 4932 4582 4933 - check_hotplug_seq(sch, ops); 4583 + ret = check_hotplug_seq(sch, ops); 4584 + if (ret) { 4585 + cpus_read_unlock(); 4586 + goto err_disable; 4587 + } 4934 4588 scx_idle_update_selcpu_topology(ops); 4935 4589 4936 4590 cpus_read_unlock(); ··· 5051 4697 while ((p = scx_task_iter_next_locked(&sti))) { 5052 4698 unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 5053 4699 const struct sched_class *old_class = p->sched_class; 5054 - const struct sched_class *new_class = 5055 - __setscheduler_class(p->policy, p->prio); 4700 + const struct sched_class *new_class = scx_setscheduler_class(p); 5056 4701 5057 - if (!tryget_task_struct(p)) 4702 + if (scx_get_task_state(p) != SCX_TASK_READY) 5058 4703 continue; 5059 4704 5060 4705 if (old_class != new_class) 5061 4706 queue_flags |= DEQUEUE_CLASS; 5062 4707 5063 4708 scoped_guard (sched_change, p, queue_flags) { 5064 - p->scx.slice = SCX_SLICE_DFL; 4709 + 
p->scx.slice = READ_ONCE(scx_slice_dfl); 5065 4710 p->sched_class = new_class; 5066 4711 } 5067 - 5068 - put_task_struct(p); 5069 4712 } 5070 4713 scx_task_iter_stop(&sti); 5071 4714 percpu_up_write(&scx_fork_rwsem); ··· 5086 4735 5087 4736 return 0; 5088 4737 5089 - err_free_pseqs: 5090 - free_kick_pseqs(); 4738 + err_free_ksyncs: 4739 + free_kick_syncs(); 5091 4740 err_unlock: 5092 4741 mutex_unlock(&scx_enable_mutex); 5093 4742 return ret; ··· 5304 4953 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 5305 4954 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} 5306 4955 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} 4956 + static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} 5307 4957 #endif 5308 4958 static void sched_ext_ops__cpu_online(s32 cpu) {} 5309 4959 static void sched_ext_ops__cpu_offline(s32 cpu) {} ··· 5343 4991 .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, 5344 4992 .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, 5345 4993 .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, 4994 + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, 5346 4995 #endif 5347 4996 .cpu_online = sched_ext_ops__cpu_online, 5348 4997 .cpu_offline = sched_ext_ops__cpu_offline, ··· 5417 5064 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); 5418 5065 } 5419 5066 5420 - static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) 5067 + static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) 5421 5068 { 5422 5069 struct rq *rq = cpu_rq(cpu); 5423 5070 struct scx_rq *this_scx = &this_rq->scx; 5071 + const struct sched_class *cur_class; 5424 5072 bool should_wait = false; 5425 5073 unsigned long flags; 5426 5074 5427 5075 raw_spin_rq_lock_irqsave(rq, flags); 5076 + cur_class = rq->curr->sched_class; 5428 5077 5429 5078 /* 5430 5079 * During CPU hotplug, a CPU may depend on kicking itself to make 5431 - * forward progress. Allow kicking self regardless of online state. 5080 + * forward progress. Allow kicking self regardless of online state. If 5081 + * @cpu is running a higher class task, we have no control over @cpu. 5082 + * Skip kicking. 
5432 5083 */ 5433 - if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { 5084 + if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) && 5085 + !sched_class_above(cur_class, &ext_sched_class)) { 5434 5086 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { 5435 - if (rq->curr->sched_class == &ext_sched_class) 5087 + if (cur_class == &ext_sched_class) 5436 5088 rq->curr->scx.slice = 0; 5437 5089 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); 5438 5090 } 5439 5091 5440 5092 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 5441 - pseqs[cpu] = rq->scx.pnt_seq; 5442 - should_wait = true; 5093 + if (cur_class == &ext_sched_class) { 5094 + ksyncs[cpu] = rq->scx.kick_sync; 5095 + should_wait = true; 5096 + } else { 5097 + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5098 + } 5443 5099 } 5444 5100 5445 5101 resched_curr(rq); ··· 5480 5118 { 5481 5119 struct rq *this_rq = this_rq(); 5482 5120 struct scx_rq *this_scx = &this_rq->scx; 5483 - struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); 5121 + struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs); 5484 5122 bool should_wait = false; 5485 - unsigned long *pseqs; 5123 + unsigned long *ksyncs; 5486 5124 s32 cpu; 5487 5125 5488 - if (unlikely(!pseqs_pcpu)) { 5489 - pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); 5126 + if (unlikely(!ksyncs_pcpu)) { 5127 + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); 5490 5128 return; 5491 5129 } 5492 5130 5493 - pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; 5131 + ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; 5494 5132 5495 5133 for_each_cpu(cpu, this_scx->cpus_to_kick) { 5496 - should_wait |= kick_one_cpu(cpu, this_rq, pseqs); 5134 + should_wait |= kick_one_cpu(cpu, this_rq, ksyncs); 5497 5135 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); 5498 5136 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 5499 5137 } ··· 5507 5145 return; 5508 5146 5509 5147 for_each_cpu(cpu, this_scx->cpus_to_wait) { 5510 - unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; 5148 + unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; 5511 5149 5512 - if (cpu != cpu_of(this_rq)) { 5513 - /* 5514 - * Pairs with smp_store_release() issued by this CPU in 5515 - * switch_class() on the resched path. 5516 - * 5517 - * We busy-wait here to guarantee that no other task can 5518 - * be scheduled on our core before the target CPU has 5519 - * entered the resched path. 5520 - */ 5521 - while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) 5522 - cpu_relax(); 5523 - } 5150 + /* 5151 + * Busy-wait until the task running at the time of kicking is no 5152 + * longer running. This can be used to implement e.g. core 5153 + * scheduling. 5154 + * 5155 + * smp_cond_load_acquire() pairs with store_releases in 5156 + * pick_task_scx() and put_prev_task_scx(). The former breaks 5157 + * the wait if SCX's scheduling path is entered even if the same 5158 + * task is picked subsequently. The latter is necessary to break 5159 + * the wait when $cpu is taken by a higher sched class. 
5160 + */ 5161 + if (cpu != cpu_of(this_rq)) 5162 + smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); 5524 5163 5525 5164 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5526 5165 } ··· 5620 5257 int n = cpu_to_node(cpu); 5621 5258 5622 5259 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 5260 + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); 5623 5261 INIT_LIST_HEAD(&rq->scx.runnable_list); 5624 5262 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 5625 5263 ··· 5726 5362 * exhaustion. If zero, the current residual slice is maintained. If 5727 5363 * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with 5728 5364 * scx_bpf_kick_cpu() to trigger scheduling. 5365 + * 5366 + * Returns %true on successful insertion, %false on failure. On the root 5367 + * scheduler, %false return triggers scheduler abort and the caller doesn't need 5368 + * to check the return value. 5729 5369 */ 5730 - __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, 5731 - u64 enq_flags) 5370 + __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, 5371 + u64 slice, u64 enq_flags) 5732 5372 { 5733 5373 struct scx_sched *sch; 5734 5374 5735 5375 guard(rcu)(); 5736 5376 sch = rcu_dereference(scx_root); 5737 5377 if (unlikely(!sch)) 5738 - return; 5378 + return false; 5739 5379 5740 5380 if (!scx_dsq_insert_preamble(sch, p, enq_flags)) 5741 - return; 5381 + return false; 5742 5382 5743 5383 if (slice) 5744 5384 p->scx.slice = slice; ··· 5750 5382 p->scx.slice = p->scx.slice ?: 1; 5751 5383 5752 5384 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); 5385 + 5386 + return true; 5753 5387 } 5754 5388 5755 - /** 5756 - * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ 5757 - * @p: task_struct to insert 5758 - * @dsq_id: DSQ to insert into 5759 - * @slice: duration @p can run for in nsecs, 0 to keep the current value 5760 - * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 5761 - * @enq_flags: SCX_ENQ_* 5762 - * 5763 - * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. 5764 - * Tasks queued into the priority queue are ordered by @vtime. All other aspects 5765 - * are identical to scx_bpf_dsq_insert(). 5766 - * 5767 - * @vtime ordering is according to time_before64() which considers wrapping. A 5768 - * numerically larger vtime may indicate an earlier position in the ordering and 5769 - * vice-versa. 5770 - * 5771 - * A DSQ can only be used as a FIFO or priority queue at any given time and this 5772 - * function must not be called on a DSQ which already has one or more FIFO tasks 5773 - * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 5774 - * SCX_DSQ_GLOBAL) cannot be used as priority queues. 5389 + /* 5390 + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
5775 5391 */ 5776 - __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 5777 - u64 slice, u64 vtime, u64 enq_flags) 5392 + __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, 5393 + u64 slice, u64 enq_flags) 5778 5394 { 5779 - struct scx_sched *sch; 5395 + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); 5396 + } 5780 5397 5781 - guard(rcu)(); 5782 - sch = rcu_dereference(scx_root); 5783 - if (unlikely(!sch)) 5784 - return; 5785 - 5398 + static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, 5399 + u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) 5400 + { 5786 5401 if (!scx_dsq_insert_preamble(sch, p, enq_flags)) 5787 - return; 5402 + return false; 5788 5403 5789 5404 if (slice) 5790 5405 p->scx.slice = slice; ··· 5777 5426 p->scx.dsq_vtime = vtime; 5778 5427 5779 5428 scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); 5429 + 5430 + return true; 5431 + } 5432 + 5433 + struct scx_bpf_dsq_insert_vtime_args { 5434 + /* @p can't be packed together as KF_RCU is not transitive */ 5435 + u64 dsq_id; 5436 + u64 slice; 5437 + u64 vtime; 5438 + u64 enq_flags; 5439 + }; 5440 + 5441 + /** 5442 + * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion 5443 + * @p: task_struct to insert 5444 + * @args: struct containing the rest of the arguments 5445 + * @args->dsq_id: DSQ to insert into 5446 + * @args->slice: duration @p can run for in nsecs, 0 to keep the current value 5447 + * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 5448 + * @args->enq_flags: SCX_ENQ_* 5449 + * 5450 + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 5451 + * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided 5452 + * as an inline wrapper in common.bpf.h. 5453 + * 5454 + * Insert @p into the vtime priority queue of the DSQ identified by 5455 + * @args->dsq_id. Tasks queued into the priority queue are ordered by 5456 + * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert(). 5457 + * 5458 + * @args->vtime ordering is according to time_before64() which considers 5459 + * wrapping. A numerically larger vtime may indicate an earlier position in the 5460 + * ordering and vice-versa. 5461 + * 5462 + * A DSQ can only be used as a FIFO or priority queue at any given time and this 5463 + * function must not be called on a DSQ which already has one or more FIFO tasks 5464 + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and 5465 + * SCX_DSQ_GLOBAL) cannot be used as priority queues. 5466 + * 5467 + * Returns %true on successful insertion, %false on failure. On the root 5468 + * scheduler, %false return triggers scheduler abort and the caller doesn't need 5469 + * to check the return value. 5470 + */ 5471 + __bpf_kfunc bool 5472 + __scx_bpf_dsq_insert_vtime(struct task_struct *p, 5473 + struct scx_bpf_dsq_insert_vtime_args *args) 5474 + { 5475 + struct scx_sched *sch; 5476 + 5477 + guard(rcu)(); 5478 + 5479 + sch = rcu_dereference(scx_root); 5480 + if (unlikely(!sch)) 5481 + return false; 5482 + 5483 + return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice, 5484 + args->vtime, args->enq_flags); 5485 + } 5486 + 5487 + /* 5488 + * COMPAT: Will be removed in v6.23. 
5489 + */ 5490 + __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, 5491 + u64 slice, u64 vtime, u64 enq_flags) 5492 + { 5493 + struct scx_sched *sch; 5494 + 5495 + guard(rcu)(); 5496 + 5497 + sch = rcu_dereference(scx_root); 5498 + if (unlikely(!sch)) 5499 + return; 5500 + 5501 + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); 5780 5502 } 5781 5503 5782 5504 __bpf_kfunc_end_defs(); 5783 5505 5784 5506 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) 5785 5507 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) 5508 + BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) 5509 + BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) 5786 5510 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) 5787 5511 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) 5788 5512 ··· 5881 5455 return false; 5882 5456 5883 5457 /* 5458 + * If the BPF scheduler keeps calling this function repeatedly, it can 5459 + * cause similar live-lock conditions as consume_dispatch_q(). 5460 + */ 5461 + if (unlikely(READ_ONCE(scx_aborting))) 5462 + return false; 5463 + 5464 + /* 5884 5465 * Can be called from either ops.dispatch() locking this_rq() or any 5885 5466 * context where no rq lock is held. If latter, lock @p's task_rq which 5886 5467 * we'll likely need anyway. ··· 5906 5473 } else { 5907 5474 raw_spin_rq_lock(src_rq); 5908 5475 } 5909 - 5910 - /* 5911 - * If the BPF scheduler keeps calling this function repeatedly, it can 5912 - * cause similar live-lock conditions as consume_dispatch_q(). Insert a 5913 - * breather if necessary. 5914 - */ 5915 - scx_breather(src_rq); 5916 5476 5917 5477 locked_rq = src_rq; 5918 5478 raw_spin_lock(&src_dsq->lock); ··· 6111 5685 * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq 6112 5686 * lock (e.g. BPF timers or SYSCALL programs). 6113 5687 * 6114 - * Returns %true if @p has been consumed, %false if @p had already been consumed 6115 - * or dequeued. 5688 + * Returns %true if @p has been consumed, %false if @p had already been 5689 + * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local 5690 + * DSQ. 6116 5691 */ 6117 5692 __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, 6118 5693 struct task_struct *p, u64 dsq_id, ··· 6165 5738 .set = &scx_kfunc_ids_dispatch, 6166 5739 }; 6167 5740 6168 - __bpf_kfunc_start_defs(); 6169 - 6170 - /** 6171 - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 6172 - * 6173 - * Iterate over all of the tasks currently enqueued on the local DSQ of the 6174 - * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 6175 - * processed tasks. Can only be called from ops.cpu_release(). 
6176 - */ 6177 - __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 5741 + static u32 reenq_local(struct rq *rq) 6178 5742 { 6179 - struct scx_sched *sch; 6180 5743 LIST_HEAD(tasks); 6181 5744 u32 nr_enqueued = 0; 6182 - struct rq *rq; 6183 5745 struct task_struct *p, *n; 6184 5746 6185 - guard(rcu)(); 6186 - sch = rcu_dereference(scx_root); 6187 - if (unlikely(!sch)) 6188 - return 0; 6189 - 6190 - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) 6191 - return 0; 6192 - 6193 - rq = cpu_rq(smp_processor_id()); 6194 5747 lockdep_assert_rq_held(rq); 6195 5748 6196 5749 /* ··· 6205 5798 } 6206 5799 6207 5800 return nr_enqueued; 5801 + } 5802 + 5803 + __bpf_kfunc_start_defs(); 5804 + 5805 + /** 5806 + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 5807 + * 5808 + * Iterate over all of the tasks currently enqueued on the local DSQ of the 5809 + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of 5810 + * processed tasks. Can only be called from ops.cpu_release(). 5811 + * 5812 + * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void 5813 + * returning variant that can be called from anywhere. 5814 + */ 5815 + __bpf_kfunc u32 scx_bpf_reenqueue_local(void) 5816 + { 5817 + struct scx_sched *sch; 5818 + struct rq *rq; 5819 + 5820 + guard(rcu)(); 5821 + sch = rcu_dereference(scx_root); 5822 + if (unlikely(!sch)) 5823 + return 0; 5824 + 5825 + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) 5826 + return 0; 5827 + 5828 + rq = cpu_rq(smp_processor_id()); 5829 + lockdep_assert_rq_held(rq); 5830 + 5831 + return reenq_local(rq); 6208 5832 } 6209 5833 6210 5834 __bpf_kfunc_end_defs(); ··· 6309 5871 }; 6310 5872 6311 5873 __bpf_kfunc_start_defs(); 5874 + 5875 + /** 5876 + * scx_bpf_task_set_slice - Set task's time slice 5877 + * @p: task of interest 5878 + * @slice: time slice to set in nsecs 5879 + * 5880 + * Set @p's time slice to @slice. Returns %true on success, %false if the 5881 + * calling scheduler doesn't have authority over @p. 5882 + */ 5883 + __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) 5884 + { 5885 + p->scx.slice = slice; 5886 + return true; 5887 + } 5888 + 5889 + /** 5890 + * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering 5891 + * @p: task of interest 5892 + * @vtime: virtual time to set 5893 + * 5894 + * Set @p's virtual time to @vtime. Returns %true on success, %false if the 5895 + * calling scheduler doesn't have authority over @p. 5896 + */ 5897 + __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) 5898 + { 5899 + p->scx.dsq_vtime = vtime; 5900 + return true; 5901 + } 6312 5902 6313 5903 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) 6314 5904 { ··· 6495 6029 sizeof(struct bpf_iter_scx_dsq)); 6496 6030 BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != 6497 6031 __alignof__(struct bpf_iter_scx_dsq)); 6032 + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & 6033 + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); 6498 6034 6499 6035 /* 6500 6036 * next() and destroy() will be called regardless of the return value. 
··· 6515 6047 if (!kit->dsq) 6516 6048 return -ENOENT; 6517 6049 6518 - INIT_LIST_HEAD(&kit->cursor.node); 6519 - kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; 6520 - kit->cursor.priv = READ_ONCE(kit->dsq->seq); 6050 + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, 6051 + READ_ONCE(kit->dsq->seq)); 6521 6052 6522 6053 return 0; 6523 6054 } ··· 6588 6121 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); 6589 6122 } 6590 6123 kit->dsq = NULL; 6124 + } 6125 + 6126 + /** 6127 + * scx_bpf_dsq_peek - Lockless peek at the first element. 6128 + * @dsq_id: DSQ to examine. 6129 + * 6130 + * Read the first element in the DSQ. This is semantically equivalent to using 6131 + * the DSQ iterator, but is lockfree. Of course, like any lockless operation, 6132 + * this provides only a point-in-time snapshot, and the contents may change 6133 + * by the time any subsequent locking operation reads the queue. 6134 + * 6135 + * Returns the pointer, or NULL indicates an empty queue OR internal error. 6136 + */ 6137 + __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) 6138 + { 6139 + struct scx_sched *sch; 6140 + struct scx_dispatch_q *dsq; 6141 + 6142 + sch = rcu_dereference(scx_root); 6143 + if (unlikely(!sch)) 6144 + return NULL; 6145 + 6146 + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { 6147 + scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); 6148 + return NULL; 6149 + } 6150 + 6151 + dsq = find_user_dsq(sch, dsq_id); 6152 + if (unlikely(!dsq)) { 6153 + scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); 6154 + return NULL; 6155 + } 6156 + 6157 + return rcu_dereference(dsq->first_task); 6591 6158 } 6592 6159 6593 6160 __bpf_kfunc_end_defs(); ··· 6775 6274 */ 6776 6275 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') 6777 6276 ops_dump_flush(); 6277 + } 6278 + 6279 + /** 6280 + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ 6281 + * 6282 + * Iterate over all of the tasks currently enqueued on the local DSQ of the 6283 + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from 6284 + * anywhere. 
6285 + */ 6286 + __bpf_kfunc void scx_bpf_reenqueue_local___v2(void) 6287 + { 6288 + struct rq *rq; 6289 + 6290 + guard(preempt)(); 6291 + 6292 + rq = this_rq(); 6293 + local_set(&rq->scx.reenq_local_deferred, 1); 6294 + schedule_deferred(rq); 6778 6295 } 6779 6296 6780 6297 /** ··· 7196 6677 __bpf_kfunc_end_defs(); 7197 6678 7198 6679 BTF_KFUNCS_START(scx_kfunc_ids_any) 6680 + BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); 6681 + BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); 7199 6682 BTF_ID_FLAGS(func, scx_bpf_kick_cpu) 7200 6683 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) 7201 6684 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) 6685 + BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) 7202 6686 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) 7203 6687 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) 7204 6688 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) 7205 6689 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) 7206 6690 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) 7207 6691 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) 6692 + BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) 7208 6693 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) 7209 6694 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) 7210 6695 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) ··· 7297 6774 if (ret < 0) { 7298 6775 pr_err("sched_ext: Failed to add global attributes\n"); 7299 6776 return ret; 6777 + } 6778 + 6779 + if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || 6780 + !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { 6781 + pr_err("sched_ext: Failed to allocate cpumasks\n"); 6782 + return -ENOMEM; 7300 6783 } 7301 6784 7302 6785 return 0;
+37 -6
kernel/sched/ext_idle.c
··· 995 995 return prev_cpu; 996 996 } 997 997 998 + struct scx_bpf_select_cpu_and_args { 999 + /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */ 1000 + s32 prev_cpu; 1001 + u64 wake_flags; 1002 + u64 flags; 1003 + }; 1004 + 998 1005 /** 999 - * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, 1000 - * prioritizing those in @cpus_allowed 1006 + * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask 1001 1007 * @p: task_struct to select a CPU for 1002 - * @prev_cpu: CPU @p was on previously 1003 - * @wake_flags: %SCX_WAKE_* flags 1004 1008 * @cpus_allowed: cpumask of allowed CPUs 1005 - * @flags: %SCX_PICK_IDLE* flags 1009 + * @args: struct containing the rest of the arguments 1010 + * @args->prev_cpu: CPU @p was on previously 1011 + * @args->wake_flags: %SCX_WAKE_* flags 1012 + * @args->flags: %SCX_PICK_IDLE* flags 1013 + * 1014 + * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument 1015 + * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided 1016 + * as an inline wrapper in common.bpf.h. 1006 1017 * 1007 1018 * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked 1008 1019 * context such as a BPF test_run() call, as long as built-in CPU selection 1009 1020 * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE 1010 1021 * is set. 1011 1022 * 1012 - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 1023 + * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu(). 1013 1024 * 1014 1025 * Returns the selected idle CPU, which will be automatically awakened upon 1015 1026 * returning from ops.select_cpu() and can be used for direct dispatch, or 1016 1027 * a negative value if no idle CPU is available. 1028 + */ 1029 + __bpf_kfunc s32 1030 + __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, 1031 + struct scx_bpf_select_cpu_and_args *args) 1032 + { 1033 + struct scx_sched *sch; 1034 + 1035 + guard(rcu)(); 1036 + 1037 + sch = rcu_dereference(scx_root); 1038 + if (unlikely(!sch)) 1039 + return -ENODEV; 1040 + 1041 + return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags, 1042 + cpus_allowed, args->flags); 1043 + } 1044 + 1045 + /* 1046 + * COMPAT: Will be removed in v6.22. 1017 1047 */ 1018 1048 __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 1019 1049 const struct cpumask *cpus_allowed, u64 flags) ··· 1413 1383 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) 1414 1384 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) 1415 1385 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) 1386 + BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) 1416 1387 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) 1417 1388 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) 1418 1389 BTF_KFUNCS_END(scx_kfunc_ids_idle)
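As a usage sketch (an illustration, not part of the patch): ops.select_cpu() keeps calling scx_bpf_select_cpu_and() through the inline wrapper in compat.bpf.h, which packs prev_cpu, wake_flags and flags into struct scx_bpf_select_cpu_and_args on newer kernels:

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	/* The wrapper hides the arg-struct packing and the pre-v6.19 compat kfunc. */
	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;	/* idle CPU found; it is woken on return from ops.select_cpu() */

	return prev_cpu;
}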
+26 -3
kernel/sched/ext_internal.h
··· 23 23 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 24 24 */ 25 25 SCX_TASK_ITER_BATCH = 32, 26 + 27 + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, 28 + SCX_BYPASS_LB_DONOR_PCT = 125, 29 + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, 30 + SCX_BYPASS_LB_BATCH = 256, 26 31 }; 27 32 28 33 enum scx_exit_kind { ··· 702 697 * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be 703 698 * interpreted in the same fashion and specifies how much @cgrp can 704 699 * burst temporarily. The specific control mechanism and thus the 705 - * interpretation of @period_us and burstiness is upto to the BPF 700 + * interpretation of @period_us and burstiness is up to the BPF 706 701 * scheduler. 707 702 */ 708 703 void (*cgroup_set_bandwidth)(struct cgroup *cgrp, 709 704 u64 period_us, u64 quota_us, u64 burst_us); 705 + 706 + /** 707 + * @cgroup_set_idle: A cgroup's idle state is being changed 708 + * @cgrp: cgroup whose idle state is being updated 709 + * @idle: whether the cgroup is entering or exiting idle state 710 + * 711 + * Update @cgrp's idle state to @idle. This callback is invoked when 712 + * a cgroup transitions between idle and non-idle states, allowing the 713 + * BPF scheduler to adjust its behavior accordingly. 714 + */ 715 + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); 710 716 711 717 #endif /* CONFIG_EXT_GROUP_SCHED */ 712 718 ··· 900 884 struct scx_dispatch_q **global_dsqs; 901 885 struct scx_sched_pcpu __percpu *pcpu; 902 886 887 + /* 888 + * Updates to the following warned bitfields can race causing RMW issues 889 + * but it doesn't really matter. 890 + */ 903 891 bool warned_zero_slice:1; 904 892 bool warned_deprecated_rq:1; 905 893 ··· 968 948 969 949 SCX_ENQ_CLEAR_OPSS = 1LLU << 56, 970 950 SCX_ENQ_DSQ_PRIQ = 1LLU << 57, 951 + SCX_ENQ_NESTED = 1LLU << 58, 971 952 }; 972 953 973 954 enum scx_deq_flags { ··· 1007 986 SCX_KICK_PREEMPT = 1LLU << 1, 1008 987 1009 988 /* 1010 - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will 1011 - * return after the target CPU finishes picking the next task. 989 + * The scx_bpf_kick_cpu() call will return after the current SCX task of 990 + * the target CPU switches out. This can be used to implement e.g. core 991 + * scheduling. This has no effect if the current task on the target CPU 992 + * is not on SCX. 1012 993 */ 1013 994 SCX_KICK_WAIT = 1LLU << 2, 1014 995 };
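A hedged sketch of a scheduler consuming the new ops.cgroup_set_idle() notification; the per-cgroup context and the find_cgrp_ctx() helper are assumptions (scx_flatcg below uses a similar helper), not part of this change:

void BPF_STRUCT_OPS(example_cgroup_set_idle, struct cgroup *cgrp, bool idle)
{
	/* find_cgrp_ctx() is a hypothetical lookup into per-cgroup storage. */
	struct cgrp_ctx *cgc = find_cgrp_ctx(cgrp);

	if (cgc)
		cgc->idle = idle;	/* e.g. skip idle cgroups when rotating through cgroups */
}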
+3 -1
kernel/sched/sched.h
··· 803 803 cpumask_var_t cpus_to_kick_if_idle; 804 804 cpumask_var_t cpus_to_preempt; 805 805 cpumask_var_t cpus_to_wait; 806 - unsigned long pnt_seq; 806 + unsigned long kick_sync; 807 + local_t reenq_local_deferred; 807 808 struct balance_callback deferred_bal_cb; 808 809 struct irq_work deferred_irq_work; 809 810 struct irq_work kick_cpus_irq_work; 811 + struct scx_dispatch_q bypass_dsq; 810 812 }; 811 813 #endif /* CONFIG_SCHED_CLASS_EXT */ 812 814
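The kick_sync counter renamed above is what %SCX_KICK_WAIT synchronizes on; from the BPF side the usage remains a single call (sketch, caller name hypothetical):

static void kick_and_wait(s32 cpu)
{
	/* Returns only after the SCX task currently on @cpu has switched out;
	 * no effect if @cpu is not running an SCX task. */
	scx_bpf_kick_cpu(cpu, SCX_KICK_WAIT);
}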
+9
kernel/watchdog.c
··· 196 196 #ifdef CONFIG_SYSFS 197 197 ++hardlockup_count; 198 198 #endif 199 + /* 200 + * A poorly behaving BPF scheduler can trigger hard lockup by 201 + * e.g. putting numerous affinitized tasks in a single queue and 202 + * directing all CPUs at it. The following call can return true 203 + * only once when sched_ext is enabled and will immediately 204 + * abort the BPF scheduler and print out a warning message. 205 + */ 206 + if (scx_hardlockup(cpu)) 207 + return; 199 208 200 209 /* Only print hardlockups once. */ 201 210 if (per_cpu(watchdog_hardlockup_warned, cpu))
+3 -1
tools/sched_ext/Makefile
··· 133 133 $(call msg,MKDIR,,$@) 134 134 $(Q)mkdir -p $@ 135 135 136 + ifneq ($(CROSS_COMPILE),) 136 137 $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ 137 138 $(APIDIR)/linux/bpf.h \ 138 139 | $(OBJ_DIR)/libbpf ··· 142 141 EXTRA_CFLAGS='-g -O0 -fPIC' \ 143 142 LDFLAGS="$(LDFLAGS)" \ 144 143 DESTDIR=$(OUTPUT_DIR) prefix= all install_headers 144 + endif 145 145 146 146 $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ 147 147 $(APIDIR)/linux/bpf.h \ ··· 189 187 190 188 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) 191 189 192 - c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg 190 + c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg 193 191 194 192 $(addprefix $(BINDIR)/,$(c-sched-targets)): \ 195 193 $(BINDIR)/%: \
+4 -11
tools/sched_ext/include/scx/common.bpf.h
··· 60 60 61 61 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; 62 62 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; 63 - s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 64 - const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; 65 - void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 66 - void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; 63 + s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, 64 + struct scx_bpf_select_cpu_and_args *args) __ksym __weak; 65 + bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak; 67 66 u32 scx_bpf_dispatch_nr_slots(void) __ksym; 68 67 void scx_bpf_dispatch_cancel(void) __ksym; 69 - bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; 70 - void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; 71 - void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 72 - bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 73 - bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 74 - u32 scx_bpf_reenqueue_local(void) __ksym; 75 68 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; 76 69 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; 77 70 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; 71 + struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; 78 72 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; 79 73 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; 80 74 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; ··· 99 105 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 100 106 struct rq *scx_bpf_locked_rq(void) __ksym; 101 107 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; 102 - struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; 103 108 u64 scx_bpf_now(void) __ksym __weak; 104 109 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; 105 110
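The prototypes removed here are re-exposed as source-compatible inline wrappers in compat.bpf.h below. A hedged enqueue sketch using the scx_bpf_dsq_insert_vtime() wrapper; VTIME_DSQ and task_vtime() are hypothetical:

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* On v6.19+ this resolves to the arg-struct kfunc and reports failure;
	 * on older kernels the compat path is void and the wrapper returns true. */
	if (!scx_bpf_dsq_insert_vtime(p, VTIME_DSQ, SCX_SLICE_DFL,
				      task_vtime(p), enq_flags))
		return;
}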
+217 -89
tools/sched_ext/include/scx/compat.bpf.h
··· 16 16 }) 17 17 18 18 /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ 19 - #define __COMPAT_scx_bpf_task_cgroup(p) \ 20 - (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ 21 - scx_bpf_task_cgroup((p)) : NULL) 19 + struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak; 20 + 21 + #define scx_bpf_task_cgroup(p) \ 22 + (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \ 23 + scx_bpf_task_cgroup___new((p)) : NULL) 22 24 23 25 /* 24 26 * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are 25 27 * renamed to unload the verb. 26 28 * 27 - * Build error is triggered if old names are used. New binaries work with both 28 - * new and old names. The compat macros will be removed on v6.15 release. 29 - * 30 29 * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by 31 30 * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). 32 - * Preserve __COMPAT macros until v6.15. 33 31 */ 34 - void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 35 - void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; 36 - bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; 37 - void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; 38 - void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 39 - bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 40 - bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 41 - int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; 32 + bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak; 33 + void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; 34 + void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 35 + bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 36 + bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 42 37 43 - #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ 44 - (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ 45 - scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ 46 - scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) 47 - 48 - #define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ 49 - (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? 
\ 50 - scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ 51 - scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) 38 + bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak; 39 + void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; 40 + void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; 41 + bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 42 + bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 52 43 53 44 #define scx_bpf_dsq_move_to_local(dsq_id) \ 54 - (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ 55 - scx_bpf_dsq_move_to_local((dsq_id)) : \ 56 - scx_bpf_consume___compat((dsq_id))) 45 + (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \ 46 + scx_bpf_dsq_move_to_local___new((dsq_id)) : \ 47 + scx_bpf_consume___old((dsq_id))) 57 48 58 - #define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ 59 - (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ 60 - scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ 61 - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ 62 - scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ 49 + #define scx_bpf_dsq_move_set_slice(it__iter, slice) \ 50 + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \ 51 + scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \ 52 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \ 53 + scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \ 63 54 (void)0)) 64 55 65 - #define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ 66 - (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ 67 - scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ 68 - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ 69 - scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ 70 - (void) 0)) 56 + #define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ 57 + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \ 58 + scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \ 59 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \ 60 + scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \ 61 + (void)0)) 71 62 72 - #define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ 73 - (bpf_ksym_exists(scx_bpf_dsq_move) ? \ 74 - scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ 75 - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ 76 - scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ 63 + #define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ 64 + (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \ 65 + scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \ 66 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \ 67 + scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ 77 68 false)) 78 69 79 - #define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ 80 - (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ 81 - scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ 82 - (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? 
\ 83 - scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ 70 + #define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ 71 + (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \ 72 + scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \ 73 + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \ 74 + scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ 84 75 false)) 76 + 77 + /* 78 + * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits") 79 + * 80 + * Compat macro will be dropped on v6.19 release. 81 + */ 82 + int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; 85 83 86 84 #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ 87 85 (bpf_ksym_exists(bpf_cpumask_populate) ? \ 88 86 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) 89 87 90 - #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ 91 - _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") 88 + /* 89 + * v6.19: Introduce lockless peek API for user DSQs. 90 + * 91 + * Preserve the following macro until v6.21. 92 + */ 93 + static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) 94 + { 95 + struct task_struct *p = NULL; 96 + struct bpf_iter_scx_dsq it; 92 97 93 - #define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ 94 - _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") 95 - 96 - #define scx_bpf_consume(dsq_id) ({ \ 97 - _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ 98 - false; \ 99 - }) 100 - 101 - #define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ 102 - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") 103 - 104 - #define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ 105 - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") 106 - 107 - #define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ 108 - _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ 109 - false; \ 110 - }) 111 - 112 - #define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ 113 - _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ 114 - false; \ 115 - }) 116 - 117 - #define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ 118 - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") 119 - 120 - #define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ 121 - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") 122 - 123 - #define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ 124 - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ 125 - false; \ 126 - }) 127 - 128 - #define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ 129 - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ 130 - false; \ 131 - }) 98 + if (bpf_ksym_exists(scx_bpf_dsq_peek)) 99 + return scx_bpf_dsq_peek(dsq_id); 100 + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) 101 + p = bpf_iter_scx_dsq_next(&it); 102 + bpf_iter_scx_dsq_destroy(&it); 103 + 
return p; 104 + } 132 105 133 106 /** 134 107 * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on ··· 218 245 rq = scx_bpf_cpu_rq(cpu); 219 246 220 247 return rq ? rq->curr : NULL; 248 + } 249 + 250 + /* 251 + * v6.19: To work around BPF maximum parameter limit, the following kfuncs are 252 + * replaced with variants that pack scalar arguments in a struct. Wrappers are 253 + * provided to maintain source compatibility. 254 + * 255 + * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the 256 + * block on dispatch renaming above for more details. 257 + * 258 + * The kernel will carry the compat variants until v6.23 to maintain binary 259 + * compatibility. After v6.23 release, remove the compat handling and move the 260 + * wrappers to common.bpf.h. 261 + */ 262 + s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 263 + const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; 264 + void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; 265 + void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; 266 + 267 + /** 268 + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p 269 + * @p: task_struct to select a CPU for 270 + * @prev_cpu: CPU @p was on previously 271 + * @wake_flags: %SCX_WAKE_* flags 272 + * @cpus_allowed: cpumask of allowed CPUs 273 + * @flags: %SCX_PICK_IDLE* flags 274 + * 275 + * Inline wrapper that packs scalar arguments into a struct and calls 276 + * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details. 277 + */ 278 + static inline s32 279 + scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, 280 + const struct cpumask *cpus_allowed, u64 flags) 281 + { 282 + if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) { 283 + struct scx_bpf_select_cpu_and_args args = { 284 + .prev_cpu = prev_cpu, 285 + .wake_flags = wake_flags, 286 + .flags = flags, 287 + }; 288 + 289 + return __scx_bpf_select_cpu_and(p, cpus_allowed, &args); 290 + } else { 291 + return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags, 292 + cpus_allowed, flags); 293 + } 294 + } 295 + 296 + /** 297 + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ 298 + * @p: task_struct to insert 299 + * @dsq_id: DSQ to insert into 300 + * @slice: duration @p can run for in nsecs, 0 to keep the current value 301 + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ 302 + * @enq_flags: SCX_ENQ_* 303 + * 304 + * Inline wrapper that packs scalar arguments into a struct and calls 305 + * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details. 
306 + */ 307 + static inline bool 308 + scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, 309 + u64 enq_flags) 310 + { 311 + if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) { 312 + struct scx_bpf_dsq_insert_vtime_args args = { 313 + .dsq_id = dsq_id, 314 + .slice = slice, 315 + .vtime = vtime, 316 + .enq_flags = enq_flags, 317 + }; 318 + 319 + return __scx_bpf_dsq_insert_vtime(p, &args); 320 + } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) { 321 + scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime, 322 + enq_flags); 323 + return true; 324 + } else { 325 + scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime, 326 + enq_flags); 327 + return true; 328 + } 329 + } 330 + 331 + /* 332 + * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move 333 + * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22. 334 + * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on 335 + * kernel side. The entire suffix can be dropped later. 336 + * 337 + * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on 338 + * dispatch renaming above for more details. 339 + */ 340 + bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 341 + void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 342 + void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 343 + 344 + static inline bool 345 + scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) 346 + { 347 + if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) { 348 + return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags); 349 + } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) { 350 + scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags); 351 + return true; 352 + } else { 353 + scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags); 354 + return true; 355 + } 356 + } 357 + 358 + /* 359 + * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for 360 + * sub-sched authority checks. Drop the wrappers and move the decls to 361 + * common.bpf.h after v6.22. 362 + */ 363 + bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak; 364 + bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak; 365 + 366 + static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice) 367 + { 368 + if (bpf_ksym_exists(scx_bpf_task_set_slice___new)) 369 + scx_bpf_task_set_slice___new(p, slice); 370 + else 371 + p->scx.slice = slice; 372 + } 373 + 374 + static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) 375 + { 376 + if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) 377 + scx_bpf_task_set_dsq_vtime___new(p, vtime); 378 + else 379 + p->scx.dsq_vtime = vtime; 380 + } 381 + 382 + /* 383 + * v6.19: The new void variant can be called from anywhere while the older v1 384 + * variant can only be called from ops.cpu_release(). The double ___ prefixes on 385 + * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix 386 + * on kernel side. Drop the wrapper and move the decl to common.bpf.h after 387 + * v6.22. 
388 + */ 389 + u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak; 390 + void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak; 391 + 392 + static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void) 393 + { 394 + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat); 395 + } 396 + 397 + static inline void scx_bpf_reenqueue_local(void) 398 + { 399 + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 400 + scx_bpf_reenqueue_local___v2___compat(); 401 + else 402 + scx_bpf_reenqueue_local___v1(); 221 403 } 222 404 223 405 /*
+14
tools/sched_ext/include/scx/compat.h
··· 151 151 * 152 152 * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is 153 153 * the current minimum required kernel version. 154 + * 155 + * COMPAT: 156 + * - v6.17: ops.cgroup_set_bandwidth() 157 + * - v6.19: ops.cgroup_set_idle() 154 158 */ 155 159 #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ 156 160 struct __scx_name *__skel; \ ··· 166 162 SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ 167 163 __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ 168 164 SCX_ENUM_INIT(__skel); \ 165 + if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \ 166 + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \ 167 + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \ 168 + __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \ 169 + } \ 170 + if (__skel->struct_ops.__ops_name->cgroup_set_idle && \ 171 + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \ 172 + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \ 173 + __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \ 174 + } \ 169 175 __skel; \ 170 176 }) 171 177
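A hedged sketch of how a standalone loader (one not going through the SCX_OPS_OPEN() macro above) could probe for the new optional op; the skel and ops names are hypothetical:

/* Clear the callback when running on a kernel that predates it. */
if (skel->struct_ops.my_ops->cgroup_set_idle &&
    !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) {
	fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n");
	skel->struct_ops.my_ops->cgroup_set_idle = NULL;
}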
+88
tools/sched_ext/scx_cpu0.bpf.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * A CPU0 scheduler. 4 + * 5 + * This scheduler queues all tasks to a shared DSQ and only dispatches them on 6 + * CPU0 in FIFO order. This is useful for testing bypass behavior when many 7 + * tasks are concentrated on a single CPU. If the load balancer doesn't work, 8 + * bypass mode can trigger task hangs or RCU stalls as the queue is long and 9 + * there's only one CPU working on it. 10 + * 11 + * - Statistics tracking how many tasks are queued to local and CPU0 DSQs. 12 + * - Termination notification for userspace. 13 + * 14 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 15 + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> 16 + */ 17 + #include <scx/common.bpf.h> 18 + 19 + char _license[] SEC("license") = "GPL"; 20 + 21 + const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ 22 + 23 + UEI_DEFINE(uei); 24 + 25 + /* 26 + * We create a custom DSQ with ID 0 that we dispatch to and consume from on 27 + * CPU0. 28 + */ 29 + #define DSQ_CPU0 0 30 + 31 + struct { 32 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 33 + __uint(key_size, sizeof(u32)); 34 + __uint(value_size, sizeof(u64)); 35 + __uint(max_entries, 2); /* [local, cpu0] */ 36 + } stats SEC(".maps"); 37 + 38 + static void stat_inc(u32 idx) 39 + { 40 + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); 41 + if (cnt_p) 42 + (*cnt_p)++; 43 + } 44 + 45 + s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) 46 + { 47 + return 0; 48 + } 49 + 50 + void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags) 51 + { 52 + /* 53 + * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on 54 + * CPU 0. Queue on whichever CPU it's currently only. 55 + */ 56 + if (scx_bpf_task_cpu(p) != 0) { 57 + stat_inc(0); /* count local queueing */ 58 + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); 59 + return; 60 + } 61 + 62 + stat_inc(1); /* count cpu0 queueing */ 63 + scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags); 64 + } 65 + 66 + void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev) 67 + { 68 + if (cpu == 0) 69 + scx_bpf_dsq_move_to_local(DSQ_CPU0); 70 + } 71 + 72 + s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init) 73 + { 74 + return scx_bpf_create_dsq(DSQ_CPU0, -1); 75 + } 76 + 77 + void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei) 78 + { 79 + UEI_RECORD(uei, ei); 80 + } 81 + 82 + SCX_OPS_DEFINE(cpu0_ops, 83 + .select_cpu = (void *)cpu0_select_cpu, 84 + .enqueue = (void *)cpu0_enqueue, 85 + .dispatch = (void *)cpu0_dispatch, 86 + .init = (void *)cpu0_init, 87 + .exit = (void *)cpu0_exit, 88 + .name = "cpu0");
+106
tools/sched_ext/scx_cpu0.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 4 + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> 5 + */ 6 + #include <stdio.h> 7 + #include <unistd.h> 8 + #include <signal.h> 9 + #include <assert.h> 10 + #include <libgen.h> 11 + #include <bpf/bpf.h> 12 + #include <scx/common.h> 13 + #include "scx_cpu0.bpf.skel.h" 14 + 15 + const char help_fmt[] = 16 + "A cpu0 sched_ext scheduler.\n" 17 + "\n" 18 + "See the top-level comment in .bpf.c for more details.\n" 19 + "\n" 20 + "Usage: %s [-v]\n" 21 + "\n" 22 + " -v Print libbpf debug messages\n" 23 + " -h Display this help and exit\n"; 24 + 25 + static bool verbose; 26 + static volatile int exit_req; 27 + 28 + static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) 29 + { 30 + if (level == LIBBPF_DEBUG && !verbose) 31 + return 0; 32 + return vfprintf(stderr, format, args); 33 + } 34 + 35 + static void sigint_handler(int sig) 36 + { 37 + exit_req = 1; 38 + } 39 + 40 + static void read_stats(struct scx_cpu0 *skel, __u64 *stats) 41 + { 42 + int nr_cpus = libbpf_num_possible_cpus(); 43 + assert(nr_cpus > 0); 44 + __u64 cnts[2][nr_cpus]; 45 + __u32 idx; 46 + 47 + memset(stats, 0, sizeof(stats[0]) * 2); 48 + 49 + for (idx = 0; idx < 2; idx++) { 50 + int ret, cpu; 51 + 52 + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), 53 + &idx, cnts[idx]); 54 + if (ret < 0) 55 + continue; 56 + for (cpu = 0; cpu < nr_cpus; cpu++) 57 + stats[idx] += cnts[idx][cpu]; 58 + } 59 + } 60 + 61 + int main(int argc, char **argv) 62 + { 63 + struct scx_cpu0 *skel; 64 + struct bpf_link *link; 65 + __u32 opt; 66 + __u64 ecode; 67 + 68 + libbpf_set_print(libbpf_print_fn); 69 + signal(SIGINT, sigint_handler); 70 + signal(SIGTERM, sigint_handler); 71 + restart: 72 + skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0); 73 + 74 + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); 75 + 76 + while ((opt = getopt(argc, argv, "vh")) != -1) { 77 + switch (opt) { 78 + case 'v': 79 + verbose = true; 80 + break; 81 + default: 82 + fprintf(stderr, help_fmt, basename(argv[0])); 83 + return opt != 'h'; 84 + } 85 + } 86 + 87 + SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei); 88 + link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0); 89 + 90 + while (!exit_req && !UEI_EXITED(skel, uei)) { 91 + __u64 stats[2]; 92 + 93 + read_stats(skel, stats); 94 + printf("local=%llu cpu0=%llu\n", stats[0], stats[1]); 95 + fflush(stdout); 96 + sleep(1); 97 + } 98 + 99 + bpf_link__destroy(link); 100 + ecode = UEI_REPORT(skel, uei); 101 + scx_cpu0__destroy(skel); 102 + 103 + if (UEI_ECODE_RESTART(ecode)) 104 + goto restart; 105 + return 0; 106 + }
+5 -5
tools/sched_ext/scx_flatcg.bpf.c
··· 382 382 return; 383 383 } 384 384 385 - cgrp = __COMPAT_scx_bpf_task_cgroup(p); 385 + cgrp = scx_bpf_task_cgroup(p); 386 386 cgc = find_cgrp_ctx(cgrp); 387 387 if (!cgc) 388 388 goto out_release; ··· 508 508 { 509 509 struct cgroup *cgrp; 510 510 511 - cgrp = __COMPAT_scx_bpf_task_cgroup(p); 511 + cgrp = scx_bpf_task_cgroup(p); 512 512 update_active_weight_sums(cgrp, true); 513 513 bpf_cgroup_release(cgrp); 514 514 } ··· 521 521 if (fifo_sched) 522 522 return; 523 523 524 - cgrp = __COMPAT_scx_bpf_task_cgroup(p); 524 + cgrp = scx_bpf_task_cgroup(p); 525 525 cgc = find_cgrp_ctx(cgrp); 526 526 if (cgc) { 527 527 /* ··· 564 564 if (!taskc->bypassed_at) 565 565 return; 566 566 567 - cgrp = __COMPAT_scx_bpf_task_cgroup(p); 567 + cgrp = scx_bpf_task_cgroup(p); 568 568 cgc = find_cgrp_ctx(cgrp); 569 569 if (cgc) { 570 570 __sync_fetch_and_add(&cgc->cvtime_delta, ··· 578 578 { 579 579 struct cgroup *cgrp; 580 580 581 - cgrp = __COMPAT_scx_bpf_task_cgroup(p); 581 + cgrp = scx_bpf_task_cgroup(p); 582 582 update_active_weight_sums(cgrp, false); 583 583 bpf_cgroup_release(cgrp); 584 584 }
+33 -19
tools/sched_ext/scx_qmap.bpf.c
··· 202 202 void *ring; 203 203 s32 cpu; 204 204 205 + if (enq_flags & SCX_ENQ_REENQ) 206 + __sync_fetch_and_add(&nr_reenqueued, 1); 207 + 205 208 if (p->flags & PF_KTHREAD) { 206 209 if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) 207 210 return; ··· 323 320 324 321 if (tctx->highpri) { 325 322 /* exercise the set_*() and vtime interface too */ 326 - __COMPAT_scx_bpf_dsq_move_set_slice( 327 - BPF_FOR_EACH_ITER, slice_ns * 2); 328 - __COMPAT_scx_bpf_dsq_move_set_vtime( 329 - BPF_FOR_EACH_ITER, highpri_seq++); 330 - __COMPAT_scx_bpf_dsq_move_vtime( 331 - BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 323 + scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2); 324 + scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++); 325 + scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 332 326 } 333 327 } 334 328 ··· 342 342 else 343 343 cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); 344 344 345 - if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, 346 - SCX_DSQ_LOCAL_ON | cpu, 347 - SCX_ENQ_PREEMPT)) { 345 + if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, 346 + SCX_ENQ_PREEMPT)) { 348 347 if (cpu == this_cpu) { 349 348 dispatched = true; 350 349 __sync_fetch_and_add(&nr_expedited_local, 1); ··· 532 533 return task_qdist(a) > task_qdist(b); 533 534 } 534 535 535 - void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 536 + SEC("tp_btf/sched_switch") 537 + int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev, 538 + struct task_struct *next, unsigned long prev_state) 536 539 { 537 - u32 cnt; 540 + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 541 + return 0; 538 542 539 543 /* 540 - * Called when @cpu is taken by a higher priority scheduling class. This 541 - * makes @cpu no longer available for executing sched_ext tasks. As we 542 - * don't want the tasks in @cpu's local dsq to sit there until @cpu 543 - * becomes available again, re-enqueue them into the global dsq. See 544 - * %SCX_ENQ_REENQ handling in qmap_enqueue(). 544 + * If @cpu is taken by a higher priority scheduling class, it is no 545 + * longer available for executing sched_ext tasks. As we don't want the 546 + * tasks in @cpu's local dsq to sit there until @cpu becomes available 547 + * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ 548 + * handling in qmap_enqueue(). 545 549 */ 546 - cnt = scx_bpf_reenqueue_local(); 547 - if (cnt) 548 - __sync_fetch_and_add(&nr_reenqueued, cnt); 550 + switch (next->policy) { 551 + case 1: /* SCHED_FIFO */ 552 + case 2: /* SCHED_RR */ 553 + case 6: /* SCHED_DEADLINE */ 554 + scx_bpf_reenqueue_local(); 555 + } 556 + 557 + return 0; 558 + } 559 + 560 + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) 561 + { 562 + /* see qmap_sched_switch() to learn how to do this on newer kernels */ 563 + if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) 564 + scx_bpf_reenqueue_local(); 549 565 } 550 566 551 567 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
+1
tools/testing/selftests/sched_ext/Makefile
··· 174 174 minimal \ 175 175 numa \ 176 176 allowed_cpus \ 177 + peek_dsq \ 177 178 prog_run \ 178 179 reload_loop \ 179 180 select_cpu_dfl \
+251
tools/testing/selftests/sched_ext/peek_dsq.bpf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * A BPF program for testing DSQ operations and peek in particular. 4 + * 5 + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 6 + * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu> 7 + */ 8 + 9 + #include <scx/common.bpf.h> 10 + #include <scx/compat.bpf.h> 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + 14 + UEI_DEFINE(uei); /* Error handling */ 15 + 16 + #define MAX_SAMPLES 100 17 + #define MAX_CPUS 512 18 + #define DSQ_POOL_SIZE 8 19 + int max_samples = MAX_SAMPLES; 20 + int max_cpus = MAX_CPUS; 21 + int dsq_pool_size = DSQ_POOL_SIZE; 22 + 23 + /* Global variables to store test results */ 24 + int dsq_peek_result1 = -1; 25 + long dsq_inserted_pid = -1; 26 + int insert_test_cpu = -1; /* Set to the cpu that performs the test */ 27 + long dsq_peek_result2 = -1; 28 + long dsq_peek_result2_pid = -1; 29 + long dsq_peek_result2_expected = -1; 30 + int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */ 31 + int real_dsq_id = 1235; /* DSQ for normal operation */ 32 + int enqueue_count = -1; 33 + int dispatch_count = -1; 34 + bool debug_ksym_exists; 35 + 36 + /* DSQ pool for stress testing */ 37 + int dsq_pool_base_id = 2000; 38 + int phase1_complete = -1; 39 + long total_peek_attempts = -1; 40 + long successful_peeks = -1; 41 + 42 + /* BPF map for sharing peek results with userspace */ 43 + struct { 44 + __uint(type, BPF_MAP_TYPE_ARRAY); 45 + __uint(max_entries, MAX_SAMPLES); 46 + __type(key, u32); 47 + __type(value, long); 48 + } peek_results SEC(".maps"); 49 + 50 + static int get_random_dsq_id(void) 51 + { 52 + u64 time = bpf_ktime_get_ns(); 53 + 54 + return dsq_pool_base_id + (time % DSQ_POOL_SIZE); 55 + } 56 + 57 + static void record_peek_result(long pid) 58 + { 59 + u32 slot_key; 60 + long *slot_pid_ptr; 61 + int ix; 62 + 63 + if (pid <= 0) 64 + return; 65 + 66 + /* Find an empty slot or one with the same PID */ 67 + bpf_for(ix, 0, 10) { 68 + slot_key = (pid + ix) % MAX_SAMPLES; 69 + slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); 70 + if (!slot_pid_ptr) 71 + continue; 72 + 73 + if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) { 74 + *slot_pid_ptr = pid; 75 + break; 76 + } 77 + } 78 + } 79 + 80 + /* Scan all DSQs in the pool and try to move a task to local */ 81 + static int scan_dsq_pool(void) 82 + { 83 + struct task_struct *task; 84 + int moved = 0; 85 + int i; 86 + 87 + bpf_for(i, 0, DSQ_POOL_SIZE) { 88 + int dsq_id = dsq_pool_base_id + i; 89 + 90 + total_peek_attempts++; 91 + 92 + task = __COMPAT_scx_bpf_dsq_peek(dsq_id); 93 + if (task) { 94 + successful_peeks++; 95 + record_peek_result(task->pid); 96 + 97 + /* Try to move this task to local */ 98 + if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) { 99 + moved = 1; 100 + break; 101 + } 102 + } 103 + } 104 + return moved; 105 + } 106 + 107 + /* Struct_ops scheduler for testing DSQ peek operations */ 108 + void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags) 109 + { 110 + struct task_struct *peek_result; 111 + int last_insert_test_cpu, cpu; 112 + 113 + enqueue_count++; 114 + cpu = bpf_get_smp_processor_id(); 115 + last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu); 116 + 117 + /* Phase 1: Simple insert-then-peek test (only on first task) */ 118 + if (last_insert_test_cpu == -1) { 119 + bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu); 120 + 121 + /* Test 1: Peek empty DSQ - should return NULL */ 122 + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); 
123 + dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */ 124 + 125 + /* Test 2: Insert task into test DSQ for testing in dispatch callback */ 126 + dsq_inserted_pid = p->pid; 127 + scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags); 128 + dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */ 129 + } else if (!phase1_complete) { 130 + /* Still in phase 1, use real DSQ */ 131 + scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags); 132 + } else { 133 + /* Phase 2: Random DSQ insertion for stress testing */ 134 + int random_dsq_id = get_random_dsq_id(); 135 + 136 + scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags); 137 + } 138 + } 139 + 140 + void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev) 141 + { 142 + dispatch_count++; 143 + 144 + /* Phase 1: Complete the simple peek test if we inserted a task but 145 + * haven't tested peek yet 146 + */ 147 + if (insert_test_cpu == cpu && dsq_peek_result2 == -1) { 148 + struct task_struct *peek_result; 149 + 150 + bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu); 151 + 152 + /* Test 3: Peek DSQ after insert - should return the task we inserted */ 153 + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); 154 + /* Store the PID of the peeked task for comparison */ 155 + dsq_peek_result2 = (long)peek_result; 156 + dsq_peek_result2_pid = peek_result ? peek_result->pid : -1; 157 + 158 + /* Now consume the task since we've peeked at it */ 159 + scx_bpf_dsq_move_to_local(test_dsq_id); 160 + 161 + /* Mark phase 1 as complete */ 162 + phase1_complete = 1; 163 + bpf_printk("Phase 1 complete, starting phase 2 stress testing"); 164 + } else if (!phase1_complete) { 165 + /* Still in phase 1, use real DSQ */ 166 + scx_bpf_dsq_move_to_local(real_dsq_id); 167 + } else { 168 + /* Phase 2: Scan all DSQs in the pool and try to move a task */ 169 + if (!scan_dsq_pool()) { 170 + /* No tasks found in DSQ pool, fall back to real DSQ */ 171 + scx_bpf_dsq_move_to_local(real_dsq_id); 172 + } 173 + } 174 + } 175 + 176 + s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init) 177 + { 178 + s32 err; 179 + int i; 180 + 181 + /* Always set debug values so we can see which version we're using */ 182 + debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 
1 : 0; 183 + 184 + /* Initialize state first */ 185 + insert_test_cpu = -1; 186 + enqueue_count = 0; 187 + dispatch_count = 0; 188 + phase1_complete = 0; 189 + total_peek_attempts = 0; 190 + successful_peeks = 0; 191 + 192 + /* Create the test and real DSQs */ 193 + err = scx_bpf_create_dsq(test_dsq_id, -1); 194 + if (err) { 195 + scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); 196 + return err; 197 + } 198 + err = scx_bpf_create_dsq(real_dsq_id, -1); 199 + if (err) { 200 + scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); 201 + return err; 202 + } 203 + 204 + /* Create the DSQ pool for stress testing */ 205 + bpf_for(i, 0, DSQ_POOL_SIZE) { 206 + int dsq_id = dsq_pool_base_id + i; 207 + 208 + err = scx_bpf_create_dsq(dsq_id, -1); 209 + if (err) { 210 + scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err); 211 + return err; 212 + } 213 + } 214 + 215 + /* Initialize the peek results map */ 216 + bpf_for(i, 0, MAX_SAMPLES) { 217 + u32 key = i; 218 + long pid = -1; 219 + 220 + bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY); 221 + } 222 + 223 + return 0; 224 + } 225 + 226 + void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei) 227 + { 228 + int i; 229 + 230 + /* Destroy the primary DSQs */ 231 + scx_bpf_destroy_dsq(test_dsq_id); 232 + scx_bpf_destroy_dsq(real_dsq_id); 233 + 234 + /* Destroy the DSQ pool */ 235 + bpf_for(i, 0, DSQ_POOL_SIZE) { 236 + int dsq_id = dsq_pool_base_id + i; 237 + 238 + scx_bpf_destroy_dsq(dsq_id); 239 + } 240 + 241 + UEI_RECORD(uei, ei); 242 + } 243 + 244 + SEC(".struct_ops.link") 245 + struct sched_ext_ops peek_dsq_ops = { 246 + .enqueue = (void *)peek_dsq_enqueue, 247 + .dispatch = (void *)peek_dsq_dispatch, 248 + .init = (void *)peek_dsq_init, 249 + .exit = (void *)peek_dsq_exit, 250 + .name = "peek_dsq", 251 + };
+224
tools/testing/selftests/sched_ext/peek_dsq.c
···
1 + // SPDX-License-Identifier: GPL-2.0
2 + /*
3 +  * Test for DSQ operations including create, destroy, and peek operations.
4 +  *
5 +  * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
6 +  * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
7 +  */
8 + #include <bpf/bpf.h>
9 + #include <scx/common.h>
10 + #include <sys/wait.h>
11 + #include <unistd.h>
12 + #include <pthread.h>
13 + #include <string.h>
14 + #include <sched.h>
15 + #include "peek_dsq.bpf.skel.h"
16 + #include "scx_test.h"
17 +
18 + #define NUM_WORKERS 4
19 +
20 + static bool workload_running = true;
21 + static pthread_t workload_threads[NUM_WORKERS];
22 +
23 + /**
24 +  * Background workload thread that sleeps and wakes rapidly to exercise
25 +  * the scheduler's enqueue operations and ensure DSQ operations get tested.
26 +  */
27 + static void *workload_thread_fn(void *arg)
28 + {
29 +	while (workload_running) {
30 +		/* Sleep for a very short time to trigger scheduler activity */
31 +		usleep(1000); /* 1ms sleep */
32 +		/* Yield to ensure we go through the scheduler */
33 +		sched_yield();
34 +	}
35 +	return NULL;
36 + }
37 +
38 + static enum scx_test_status setup(void **ctx)
39 + {
40 +	struct peek_dsq *skel;
41 +
42 +	skel = peek_dsq__open();
43 +	SCX_FAIL_IF(!skel, "Failed to open");
44 +	SCX_ENUM_INIT(skel);
45 +	SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
46 +
47 +	*ctx = skel;
48 +
49 +	return SCX_TEST_PASS;
50 + }
51 +
52 + static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
53 + {
54 +	long count = 0;
55 +
56 +	printf("Observed %s DSQ peek pids:\n", dsq_name);
57 +	for (int i = 0; i < max_samples; i++) {
58 +		long pid;
59 +		int err;
60 +
61 +		err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
62 +		if (err == 0) {
63 +			if (pid == 0) {
64 +				printf(" Sample %d: NULL peek\n", i);
65 +			} else if (pid > 0) {
66 +				printf(" Sample %d: pid %ld\n", i, pid);
67 +				count++;
68 +			}
69 +		} else {
70 +			printf(" Sample %d: error reading pid (err=%d)\n", i, err);
71 +		}
72 +	}
73 +	printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
74 +	return count;
75 + }
76 +
77 + static enum scx_test_status run(void *ctx)
78 + {
79 +	struct peek_dsq *skel = ctx;
80 +	bool failed = false;
81 +	int seconds = 3;
82 +	int err;
83 +
84 +	/* Enable the scheduler to test DSQ operations */
85 +	printf("Enabling scheduler to test DSQ insert operations...\n");
86 +
87 +	struct bpf_link *link =
88 +		bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
89 +
90 +	if (!link) {
91 +		SCX_ERR("Failed to attach struct_ops");
92 +		return SCX_TEST_FAIL;
93 +	}
94 +
95 +	printf("Starting %d background workload threads...\n", NUM_WORKERS);
96 +	workload_running = true;
97 +	for (int i = 0; i < NUM_WORKERS; i++) {
98 +		err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
99 +		if (err) {
100 +			SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
101 +			/* Stop already created threads */
102 +			workload_running = false;
103 +			for (int j = 0; j < i; j++)
104 +				pthread_join(workload_threads[j], NULL);
105 +			bpf_link__destroy(link);
106 +			return SCX_TEST_FAIL;
107 +		}
108 +	}
109 +
110 +	printf("Waiting for enqueue events.\n");
111 +	sleep(seconds);
112 +	while (skel->data->enqueue_count <= 0) {
113 +		printf(".");
114 +		fflush(stdout);
115 +		sleep(1);
116 +		seconds++;
117 +		if (seconds >= 30) {
118 +			printf("\n\u2717 Timeout waiting for enqueue events\n");
119 +			/* Stop workload threads and cleanup */
120 +			workload_running = false;
121 +			for (int i = 0; i < NUM_WORKERS; i++)
122 +				pthread_join(workload_threads[i], NULL);
123 +			bpf_link__destroy(link);
124 +			return SCX_TEST_FAIL;
125 +		}
126 +	}
127 +
128 +	workload_running = false;
129 +	for (int i = 0; i < NUM_WORKERS; i++) {
130 +		err = pthread_join(workload_threads[i], NULL);
131 +		if (err) {
132 +			SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
133 +			bpf_link__destroy(link);
134 +			return SCX_TEST_FAIL;
135 +		}
136 +	}
137 +	printf("Background workload threads stopped.\n");
138 +
139 +	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
140 +
141 +	/* Detach the scheduler */
142 +	bpf_link__destroy(link);
143 +
144 +	printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
145 +	       skel->data->enqueue_count, skel->data->dispatch_count);
146 +	printf("Debug: ksym_exists=%d\n",
147 +	       skel->bss->debug_ksym_exists);
148 +
149 +	/* Check DSQ insert result */
150 +	printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
151 +	if (skel->data->insert_test_cpu != -1)
152 +		printf("\u2713 DSQ insert succeeded !\n");
153 +	else {
154 +		printf("\u2717 DSQ insert failed or not attempted\n");
155 +		failed = true;
156 +	}
157 +
158 +	/* Check DSQ peek results */
159 +	printf(" DSQ peek result 1 (before insert): %d\n",
160 +	       skel->data->dsq_peek_result1);
161 +	if (skel->data->dsq_peek_result1 == 0)
162 +		printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
163 +	else {
164 +		printf("\u2717 DSQ peek verification failed\n");
165 +		failed = true;
166 +	}
167 +
168 +	printf(" DSQ peek result 2 (after insert): %ld\n",
169 +	       skel->data->dsq_peek_result2);
170 +	printf(" DSQ peek result 2, expected: %ld\n",
171 +	       skel->data->dsq_peek_result2_expected);
172 +	if (skel->data->dsq_peek_result2 ==
173 +	    skel->data->dsq_peek_result2_expected)
174 +		printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
175 +	else {
176 +		printf("\u2717 DSQ peek verification failed\n");
177 +		failed = true;
178 +	}
179 +
180 +	printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
181 +	printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);
182 +
183 +	int pid_count;
184 +
185 +	pid_count = print_observed_pids(skel->maps.peek_results,
186 +					skel->data->max_samples, "DSQ pool");
187 +	printf("Total non-null peek observations: %ld out of %ld\n",
188 +	       skel->data->successful_peeks, skel->data->total_peek_attempts);
189 +
190 +	if (skel->bss->debug_ksym_exists && pid_count == 0) {
191 +		printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
192 +		failed = true;
193 +	}
194 +	if (skel->bss->debug_ksym_exists && pid_count > 0)
195 +		printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");
196 +
197 +	if (failed)
198 +		return SCX_TEST_FAIL;
199 +	else
200 +		return SCX_TEST_PASS;
201 + }
202 +
203 + static void cleanup(void *ctx)
204 + {
205 +	struct peek_dsq *skel = ctx;
206 +
207 +	if (workload_running) {
208 +		workload_running = false;
209 +		for (int i = 0; i < NUM_WORKERS; i++)
210 +			pthread_join(workload_threads[i], NULL);
211 +	}
212 +
213 +	peek_dsq__destroy(skel);
214 + }
215 +
216 + struct scx_test peek_dsq = {
217 +	.name = "peek_dsq",
218 +	.description =
219 +		"Test DSQ create/destroy operations and future peek functionality",
220 +	.setup = setup,
221 +	.run = run,
222 +	.cleanup = cleanup,
223 + };
224 + REGISTER_SCX_TEST(&peek_dsq)