Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

tools/sched_ext: add scx_userland scheduler

Add the scx_userland scheduler, which does vruntime-based
scheduling in user space and communicates scheduling decisions
to BPF by accessing and modifying globals through the
skeleton.

Cc: Tejun Heo <tj@kernel.org>
Cc: David Vernet <dvernet@meta.com>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Emil Tsalapatis, committed by Tejun Heo
cc4448d0 2f8d4898
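
For orientation, the sketch below condenses the userspace half of that protocol as it appears in scx_userland.c further down: drain the enqueued BPF queue map, keep tasks in a vruntime-sorted list, push chosen pids into the dispatched map, and report progress back to BPF through the skeleton globals nr_queued and nr_scheduled. It is illustrative only and not part of the patch; vruntime_enqueue(), nr_curr_enqueued and batch_size are names taken from the file below, while pick_lowest_vruntime_pid() is a hypothetical helper standing in for the sorted-list walk.

/* Sketch only: condensed from scx_userland.c below; error handling omitted. */
static void schedule_once(struct scx_userland *skel, int enqueued_fd,
			  int dispatched_fd)
{
	struct scx_userland_enqueued_task task;

	/* Drain tasks that the BPF side queued for user space. */
	while (!bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task))
		vruntime_enqueue(&task);	/* insert into vruntime-sorted list */

	/* Report back to BPF through the skeleton globals. */
	skel->bss->nr_queued = 0;
	skel->bss->nr_scheduled = nr_curr_enqueued;

	/* Hand a batch of scheduling decisions back to the kernel. */
	for (__u32 i = 0; i < batch_size; i++) {
		__s32 pid = pick_lowest_vruntime_pid();	/* hypothetical helper */

		if (pid < 0)
			break;
		if (bpf_map_update_elem(dispatched_fd, NULL, &pid, 0))
			break;
		nr_curr_enqueued--;
	}
	skel->bss->nr_scheduled = nr_curr_enqueued;
}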

4 files changed, 799 insertions(+), 1 deletion(-)
tools/sched_ext/Makefile (+1 -1)
@@ -189,7 +189,7 @@
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \
tools/sched_ext/scx_userland.bpf.c (new file, +344 lines)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A minimal userland scheduler.
 *
 * In terms of scheduling, this provides two different types of behaviors:
 * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
 *    All such tasks are direct-dispatched from the kernel, and are never
 *    enqueued in user space.
 * 2. A primitive vruntime scheduler that is implemented in user space, for all
 *    other tasks.
 *
 * Some parts of this example user space scheduler could be implemented more
 * efficiently using more complex and sophisticated data structures. For
 * example, rather than using BPF_MAP_TYPE_QUEUE's,
 * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
 * user space and kernel space. Similarly, we use a simple vruntime-sorted list
 * in user space, but an rbtree could be used instead.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>
#include "scx_userland.h"

/*
 * Maximum number of tasks enqueued/dispatched between kernel and user-space.
 */
#define MAX_ENQUEUED_TASKS 4096

char _license[] SEC("license") = "GPL";

const volatile s32 usersched_pid;

/* !0 for veristat, set during init */
const volatile u32 num_possible_cpus = 64;

/* Stats that are printed by user space. */
u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;

/*
 * Number of tasks that are queued for scheduling.
 *
 * This number is incremented by the BPF component when a task is queued to the
 * user-space scheduler and it must be decremented by the user-space scheduler
 * when a task is consumed.
 */
volatile u64 nr_queued;

/*
 * Number of tasks that are waiting for scheduling.
 *
 * This number must be updated by the user-space scheduler to keep track of
 * whether there is still some scheduling work to do.
 */
volatile u64 nr_scheduled;

UEI_DEFINE(uei);

/*
 * The map containing tasks that are enqueued in user space from the kernel.
 *
 * This map is drained by the user space scheduler.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, MAX_ENQUEUED_TASKS);
	__type(value, struct scx_userland_enqueued_task);
} enqueued SEC(".maps");

/*
 * The map containing tasks that are dispatched to the kernel from user space.
 *
 * Drained by the kernel in userland_dispatch().
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, MAX_ENQUEUED_TASKS);
	__type(value, s32);
} dispatched SEC(".maps");

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local DSQ */
};

/* Map that contains task-local storage. */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/*
 * Flag used to wake-up the user-space scheduler.
 */
static volatile u32 usersched_needed;

/*
 * Set user-space scheduler wake-up flag (equivalent to an atomic release
 * operation).
 */
static void set_usersched_needed(void)
{
	__sync_fetch_and_or(&usersched_needed, 1);
}

/*
 * Check and clear user-space scheduler wake-up flag (equivalent to an atomic
 * acquire operation).
 */
static bool test_and_clear_usersched_needed(void)
{
	return __sync_fetch_and_and(&usersched_needed, 0) == 1;
}

static bool is_usersched_task(const struct task_struct *p)
{
	return p->pid == usersched_pid;
}

static bool keep_in_kernel(const struct task_struct *p)
{
	return p->nr_cpus_allowed < num_possible_cpus;
}

static struct task_struct *usersched_task(void)
{
	struct task_struct *p;

	p = bpf_task_from_pid(usersched_pid);
	/*
	 * Should never happen -- the usersched task should always be managed
	 * by sched_ext.
	 */
	if (!p)
		scx_bpf_error("Failed to find usersched task %d", usersched_pid);

	return p;
}

s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	if (keep_in_kernel(p)) {
		s32 cpu;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
			return -ESRCH;
		}

		if (p->nr_cpus_allowed == 1 ||
		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
			tctx->force_local = true;
			return prev_cpu;
		}

		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0) {
			tctx->force_local = true;
			return cpu;
		}
	}

	return prev_cpu;
}

static void dispatch_user_scheduler(void)
{
	struct task_struct *p;

	p = usersched_task();
	if (p) {
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	}
}

static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
{
	struct scx_userland_enqueued_task task = {};

	task.pid = p->pid;
	task.sum_exec_runtime = p->se.sum_exec_runtime;
	task.weight = p->scx.weight;

	if (bpf_map_push_elem(&enqueued, &task, 0)) {
		/*
		 * If we fail to enqueue the task in user space, put it
		 * directly on the global DSQ.
		 */
		__sync_fetch_and_add(&nr_failed_enqueues, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	} else {
		__sync_fetch_and_add(&nr_user_enqueues, 1);
		set_usersched_needed();
	}
}

void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (keep_in_kernel(p)) {
		u64 dsq_id = SCX_DSQ_GLOBAL;
		struct task_ctx *tctx;

		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
		if (!tctx) {
			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
			return;
		}

		if (tctx->force_local)
			dsq_id = SCX_DSQ_LOCAL;
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags);
		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
		return;
	} else if (!is_usersched_task(p)) {
		enqueue_task_in_user_space(p, enq_flags);
	}
}

void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
{
	if (test_and_clear_usersched_needed())
		dispatch_user_scheduler();

	bpf_repeat(MAX_ENQUEUED_TASKS) {
		s32 pid;
		struct task_struct *p;

		if (bpf_map_pop_elem(&dispatched, &pid))
			break;

		/*
		 * The task could have exited by the time we get around to
		 * dispatching it. Treat this as a normal occurrence, and simply
		 * move onto the next iteration.
		 */
		p = bpf_task_from_pid(pid);
		if (!p)
			continue;

		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		bpf_task_release(p);
	}
}

/*
 * A CPU is about to change its idle state. If the CPU is going idle, ensure
 * that the user-space scheduler has a chance to run if there is any remaining
 * work to do.
 */
void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle)
{
	/*
	 * Don't do anything if we exit from an idle state; a CPU owner will
	 * be assigned in .running().
	 */
	if (!idle)
		return;
	/*
	 * A CPU is now available, notify the user-space scheduler that tasks
	 * can be dispatched, if there is at least one task waiting to be
	 * scheduled, either queued (accounted in nr_queued) or scheduled
	 * (accounted in nr_scheduled).
	 *
	 * NOTE: nr_queued is incremented by the BPF component, more exactly in
	 * enqueue(), when a task is sent to the user-space scheduler, then
	 * the scheduler drains the queued tasks (updating nr_queued) and adds
	 * them to its internal data structures / state; at this point tasks
	 * become "scheduled" and the user-space scheduler will take care of
	 * updating nr_scheduled accordingly; lastly tasks will be dispatched
	 * and the user-space scheduler will update nr_scheduled again.
	 *
	 * Checking both counters allows us to determine if there is still some
	 * pending work to do for the scheduler: new tasks have been queued
	 * since the last check, or there are still tasks "queued" or
	 * "scheduled" since the previous user-space scheduler run. If the
	 * counters are both zero it is pointless to wake up the scheduler
	 * (even if a CPU becomes idle), because there is nothing to do.
	 *
	 * Keep in mind that update_idle() doesn't run concurrently with the
	 * user-space scheduler (which is single-threaded): this function is
	 * naturally serialized with the user-space scheduler code, therefore
	 * this check here is also safe from a concurrency perspective.
	 */
	if (nr_queued || nr_scheduled) {
		/*
		 * Kick the CPU to make it immediately ready to accept
		 * dispatched tasks.
		 */
		set_usersched_needed();
		scx_bpf_kick_cpu(cpu, 0);
	}
}

s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

s32 BPF_STRUCT_OPS(userland_init)
{
	if (num_possible_cpus == 0) {
		scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
			      num_possible_cpus);
		return -EINVAL;
	}

	if (usersched_pid <= 0) {
		scx_bpf_error("User scheduler pid uninitialized (%d)",
			      usersched_pid);
		return -EINVAL;
	}

	return 0;
}

void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

SCX_OPS_DEFINE(userland_ops,
	       .select_cpu = (void *)userland_select_cpu,
	       .enqueue = (void *)userland_enqueue,
	       .dispatch = (void *)userland_dispatch,
	       .update_idle = (void *)userland_update_idle,
	       .init_task = (void *)userland_init_task,
	       .init = (void *)userland_init,
	       .exit = (void *)userland_exit,
	       .flags = SCX_OPS_ENQ_LAST |
			SCX_OPS_KEEP_BUILTIN_IDLE,
	       .name = "userland");
tools/sched_ext/scx_userland.c (new file, +437 lines)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A demo sched_ext user space scheduler which provides vruntime semantics
 * using a simple ordered-list implementation.
 *
 * Each CPU in the system resides in a single, global domain. This precludes
 * the need to do any load balancing between domains. The scheduler could
 * easily be extended to support multiple domains, with load balancing
 * happening in user space.
 *
 * Any task which has any CPU affinity is scheduled entirely in BPF. This
 * program only schedules tasks which may run on any CPU.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <stdio.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <pthread.h>
#include <bpf/bpf.h>
#include <sys/mman.h>
#include <sys/queue.h>
#include <sys/syscall.h>

#include <scx/common.h>
#include "scx_userland.h"
#include "scx_userland.bpf.skel.h"

const char help_fmt[] =
"A minimal userland sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n"
"\n"
"Usage: %s [-b BATCH]\n"
"\n"
"  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
"  -v            Print libbpf debug messages\n"
"  -h            Display this help and exit\n";

/* Defined in UAPI */
#define SCHED_EXT 7

/* Number of tasks to batch when dispatching to user space. */
static __u32 batch_size = 8;

static bool verbose;
static volatile int exit_req;
static int enqueued_fd, dispatched_fd;

static struct scx_userland *skel;
static struct bpf_link *ops_link;

/* Stats collected in user space. */
static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed;

/* Number of tasks currently enqueued. */
static __u64 nr_curr_enqueued;

/* The data structure containing tasks that are enqueued in user space. */
struct enqueued_task {
	LIST_ENTRY(enqueued_task) entries;
	__u64 sum_exec_runtime;
	double vruntime;
};

/*
 * Use a vruntime-sorted list to store tasks. This could easily be extended to
 * a more optimal data structure, such as an rbtree as is done in CFS. We
 * currently elect to use a sorted list to simplify the example for
 * illustrative purposes.
 */
LIST_HEAD(listhead, enqueued_task);

/*
 * A vruntime-sorted list of tasks. The head of the list contains the task with
 * the lowest vruntime. That is, the task that has the "highest" claim to be
 * scheduled.
 */
static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head);

/*
 * The main array of tasks. The array is allocated all at once during
 * initialization, based on /proc/sys/kernel/pid_max, to avoid having to
 * dynamically allocate memory on the enqueue path, which could cause a
 * deadlock. A more substantive user space scheduler could e.g. provide a hook
 * for newly enabled tasks that are passed to the scheduler from the
 * .prep_enable() callback to allow the scheduler to allocate on safe paths.
 */
struct enqueued_task *tasks;
static int pid_max;

static double min_vruntime;

static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (level == LIBBPF_DEBUG && !verbose)
		return 0;
	return vfprintf(stderr, format, args);
}

static void sigint_handler(int userland)
{
	exit_req = 1;
}

static int get_pid_max(void)
{
	FILE *fp;
	int pid_max;

	fp = fopen("/proc/sys/kernel/pid_max", "r");
	if (fp == NULL) {
		fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n");
		return -1;
	}
	if (fscanf(fp, "%d", &pid_max) != 1) {
		fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n");
		fclose(fp);
		return -1;
	}
	fclose(fp);

	return pid_max;
}

static int init_tasks(void)
{
	pid_max = get_pid_max();
	if (pid_max < 0)
		return pid_max;

	tasks = calloc(pid_max, sizeof(*tasks));
	if (!tasks) {
		fprintf(stderr, "Error allocating tasks array\n");
		return -ENOMEM;
	}

	return 0;
}

static __u32 task_pid(const struct enqueued_task *task)
{
	return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task);
}

static int dispatch_task(__s32 pid)
{
	int err;

	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
	if (err) {
		nr_vruntime_failed++;
	} else {
		nr_vruntime_dispatches++;
	}

	return err;
}

static struct enqueued_task *get_enqueued_task(__s32 pid)
{
	if (pid >= pid_max)
		return NULL;

	return &tasks[pid];
}

static double calc_vruntime_delta(__u64 weight, __u64 delta)
{
	double weight_f = (double)weight / 100.0;
	double delta_f = (double)delta;

	return delta_f / weight_f;
}

static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task)
{
	__u64 delta;

	delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime;

	enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta);
	if (min_vruntime > enqueued->vruntime)
		enqueued->vruntime = min_vruntime;
	enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime;
}

static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
{
	struct enqueued_task *curr, *enqueued, *prev;

	curr = get_enqueued_task(bpf_task->pid);
	if (!curr)
		return ENOENT;

	update_enqueued(curr, bpf_task);
	nr_vruntime_enqueues++;
	nr_curr_enqueued++;

	/*
	 * Enqueue the task in a vruntime-sorted list. A more optimal data
	 * structure such as an rbtree could easily be used as well. We elect
	 * to use a list here simply because it's less code, and thus the
	 * example is less convoluted and better serves to illustrate what a
	 * user space scheduler could look like.
	 */

	if (LIST_EMPTY(&vruntime_head)) {
		LIST_INSERT_HEAD(&vruntime_head, curr, entries);
		return 0;
	}

	LIST_FOREACH(enqueued, &vruntime_head, entries) {
		if (curr->vruntime <= enqueued->vruntime) {
			LIST_INSERT_BEFORE(enqueued, curr, entries);
			return 0;
		}
		prev = enqueued;
	}

	LIST_INSERT_AFTER(prev, curr, entries);

	return 0;
}

static void drain_enqueued_map(void)
{
	while (1) {
		struct scx_userland_enqueued_task task;
		int err;

		if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
			skel->bss->nr_queued = 0;
			skel->bss->nr_scheduled = nr_curr_enqueued;
			return;
		}

		err = vruntime_enqueue(&task);
		if (err) {
			fprintf(stderr, "Failed to enqueue task %d: %s\n",
				task.pid, strerror(err));
			exit_req = 1;
			return;
		}
	}
}

static void dispatch_batch(void)
{
	__u32 i;

	for (i = 0; i < batch_size; i++) {
		struct enqueued_task *task;
		int err;
		__s32 pid;

		task = LIST_FIRST(&vruntime_head);
		if (!task)
			break;

		min_vruntime = task->vruntime;
		pid = task_pid(task);
		LIST_REMOVE(task, entries);
		err = dispatch_task(pid);
		if (err) {
			/*
			 * If we fail to dispatch, put the task back on the
			 * vruntime_head list and stop dispatching additional
			 * tasks in this batch.
			 */
			LIST_INSERT_HEAD(&vruntime_head, task, entries);
			break;
		}
		nr_curr_enqueued--;
	}
	skel->bss->nr_scheduled = nr_curr_enqueued;
}

static void *run_stats_printer(void *arg)
{
	while (!exit_req) {
		__u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;

		nr_failed_enqueues = skel->bss->nr_failed_enqueues;
		nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
		nr_user_enqueues = skel->bss->nr_user_enqueues;
		total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;

		printf("o-----------------------o\n");
		printf("|     BPF ENQUEUES      |\n");
		printf("|-----------------------|\n");
		printf("|  kern:    %10llu  |\n", nr_kernel_enqueues);
		printf("|  user:    %10llu  |\n", nr_user_enqueues);
		printf("|  failed:  %10llu  |\n", nr_failed_enqueues);
		printf("|  -------------------- |\n");
		printf("|  total:   %10llu  |\n", total);
		printf("|                       |\n");
		printf("|-----------------------|\n");
		printf("|    VRUNTIME / USER    |\n");
		printf("|-----------------------|\n");
		printf("|  enq:     %10llu  |\n", nr_vruntime_enqueues);
		printf("|  disp:    %10llu  |\n", nr_vruntime_dispatches);
		printf("|  failed:  %10llu  |\n", nr_vruntime_failed);
		printf("o-----------------------o\n");
		printf("\n\n");
		fflush(stdout);
		sleep(1);
	}

	return NULL;
}

static int spawn_stats_thread(void)
{
	pthread_t stats_printer;

	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
}

static void pre_bootstrap(int argc, char **argv)
{
	int err;
	__u32 opt;
	struct sched_param sched_param = {
		.sched_priority = sched_get_priority_max(SCHED_EXT),
	};

	err = init_tasks();
	if (err)
		exit(err);

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);

	/*
	 * Enforce that the user scheduler task is managed by sched_ext. The
	 * task eagerly drains the list of enqueued tasks in its main work
	 * loop, and then yields the CPU. The BPF scheduler only schedules the
	 * user space scheduler task when at least one other task in the system
	 * needs to be scheduled.
	 */
	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
	SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT");

	while ((opt = getopt(argc, argv, "b:vh")) != -1) {
		switch (opt) {
		case 'b':
			batch_size = strtoul(optarg, NULL, 0);
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			exit(opt != 'h');
		}
	}

	/*
	 * It's not always safe to allocate in a user space scheduler, as an
	 * enqueued task could hold a lock that we require in order to be able
	 * to allocate.
	 */
	err = mlockall(MCL_CURRENT | MCL_FUTURE);
	SCX_BUG_ON(err, "Failed to prefault and lock address space");
}

static void bootstrap(char *comm)
{
	skel = SCX_OPS_OPEN(userland_ops, scx_userland);

	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
	assert(skel->rodata->num_possible_cpus > 0);
	skel->rodata->usersched_pid = getpid();
	assert(skel->rodata->usersched_pid > 0);

	SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei);

	enqueued_fd = bpf_map__fd(skel->maps.enqueued);
	dispatched_fd = bpf_map__fd(skel->maps.dispatched);
	assert(enqueued_fd > 0);
	assert(dispatched_fd > 0);

	SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread");

	ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland);
}

static void sched_main_loop(void)
{
	while (!exit_req) {
		/*
		 * Perform the following work in the main user space scheduler
		 * loop:
		 *
		 * 1. Drain all tasks from the enqueued map, and enqueue them
		 *    to the vruntime sorted list.
		 *
		 * 2. Dispatch a batch of tasks from the vruntime sorted list
		 *    down to the kernel.
		 *
		 * 3. Yield the CPU back to the system. The BPF scheduler will
		 *    reschedule the user space scheduler once another task has
		 *    been enqueued to user space.
		 */
		drain_enqueued_map();
		dispatch_batch();
		sched_yield();
	}
}

int main(int argc, char **argv)
{
	__u64 ecode;

	pre_bootstrap(argc, argv);
restart:
	bootstrap(argv[0]);
	sched_main_loop();

	exit_req = 1;
	bpf_link__destroy(ops_link);
	ecode = UEI_REPORT(skel, uei);
	scx_userland__destroy(skel);

	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}
tools/sched_ext/scx_userland.h (new file, +17 lines)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Meta, Inc */

#ifndef __SCX_USERLAND_COMMON_H
#define __SCX_USERLAND_COMMON_H

/*
 * An instance of a task that has been enqueued by the kernel for consumption
 * by a user space global scheduler thread.
 */
struct scx_userland_enqueued_task {
	__s32 pid;
	u64 sum_exec_runtime;
	u64 weight;
};

#endif // __SCX_USERLAND_COMMON_H