/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS = 1000000000,
	ONE_MSEC_IN_NS = 1000000,
	LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS,
	SHARED_DSQ = 0,
	HIGHPRI_DSQ = 1,
	LOWPRI_DSQ = 2,
	HIGHPRI_WEIGHT = 8668,	/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

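/*
 * Tunables. These are const volatile so that the userspace part can set them
 * before the skeleton is loaded and the verifier treats them as constants.
 */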
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile u64 sub_cgroup_id;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;

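/*
 * nr_highpri_queued counts highpri tasks currently sitting in the FIFOs.
 * test_error_cnt, when set from userspace, counts down on each enqueue and
 * triggers scx_bpf_error() when it reaches zero.
 */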
u64 nr_highpri_queued;
u32 test_error_cnt;

#define MAX_SUB_SCHEDS 8
u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set based on the queue index
 * according to the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local_dsq */
	bool highpri;
	u64 core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64 dsp_idx;	/* dispatch index */
	u64 dsp_cnt;	/* remaining count */
	u32 avg_weight;
	u32 cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

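/*
 * Find a CPU for direct dispatch: @prev_cpu for pinned tasks (unless
 * always_enq_immed forces the full path) or when @prev_cpu is idle, otherwise
 * any idle CPU in @p's allowed mask. Returns -1 if nothing is idle.
 */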
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (!always_enq_immed && p->nr_cpus_allowed == 1)
		return prev_cpu;

	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
}

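/*
 * If an idle CPU is found, mark the task force_local so that qmap_enqueue()
 * dispatches it straight to the local DSQ. Lowpri (weight < 2) user tasks
 * stay on @prev_cpu so they flow through LOWPRI_DSQ instead; see the lowpri
 * handling in qmap_enqueue().
 */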
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return prev_cpu;

	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
		return prev_cpu;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

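/*
 * Enqueue path: optionally inject stalls and stress direct dispatches for
 * testing, honor force_local from qmap_select_cpu(), divert lowpri tasks to
 * LOWPRI_DSQ, and finally push the PID onto the FIFO picked by
 * weight_to_idx(). FIFO overflow punts to SHARED_DSQ.
 */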
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ) {
		__sync_fetch_and_add(&nr_reenqueued, 1);
		if (scx_bpf_task_cpu(p) == 0)
			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
	}

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
	 * and exercise the kernel IMMED reenqueue trigger paths.
	 */
	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
		static u32 immed_stress_cnt;

		if (!(++immed_stress_cnt % immed_stress_nth)) {
			tctx->force_local = false;
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
					   slice_ns, enq_flags);
			return;
		}
	}

	/*
	 * If qmap_select_cpu() told us to enqueue locally, dispatch directly
	 * to the local DSQ.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* see lowpri_timerfn() */
	if (__COMPAT_has_generic_reenq() &&
	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the shared DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

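/*
 * Called when @p is dispatched. Recording its seq as the head seq of its
 * queue lets task_qdist() measure how far a task is from the most recently
 * dispatched one.
 */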
static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() can be called both from ops.dispatch() and from
 * BPF programs that don't hold the rq lock. As a demonstration, this function
 * is called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

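/*
 * Dispatch path: expedite highpri tasks first, then try SHARED_DSQ, then
 * round robin through the five FIFOs moving up to 2^idx tasks from FIFO idx
 * (capped by dsp_batch and the dispatch buffer) into SHARED_DSQ. Falls back
 * to attached sub-schedulers and finally keeps @prev running with an updated
 * core_sched_seq.
 */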
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);

			/*
			 * scx_qmap uses a global BPF queue that any CPU's
			 * dispatch can pop from. If this CPU popped a task that
			 * can't run here, it gets stranded on SHARED_DSQ after
			 * consume_dispatch_q() skips it. Kick the task's home
			 * CPU so it drains SHARED_DSQ.
			 *
			 * There's a race between the pop and the flush of the
			 * buffered dsq_insert:
			 *
			 *   CPU 0 (dispatching)     CPU 1 (home, idle)
			 *   ~~~~~~~~~~~~~~~~~~~     ~~~~~~~~~~~~~~~~~~
			 *   pop from BPF queue
			 *   dsq_insert(buffered)
			 *                           balance:
			 *                             SHARED_DSQ empty
			 *                             BPF queue empty
			 *                             -> goes idle
			 *   flush -> on SHARED
			 *   kick CPU 1
			 *                           wakes, drains task
			 *
			 * The kick prevents indefinite stalls but a per-CPU
			 * kthread like ksoftirqd can be briefly stranded when
			 * its home CPU enters idle with softirq pending,
			 * triggering:
			 *
			 *   "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
			 *
			 * from report_idle_softirq(). The kick lands shortly
			 * after and the home CPU drains the task. This could be
			 * avoided by e.g. dispatching pinned tasks to local or
			 * global DSQs, but the current code is left as-is to
			 * document this class of issue -- other schedulers
			 * seeing similar warnings can use this as a reference.
			 */
			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);

			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] &&
		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
			return;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (tctx)
			tctx->core_sched_seq =
				core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

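/*
 * Called on every tick while @p is running; regulates the CPU's performance
 * level from a running average of the weights it has been executing.
 */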
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx)
		return 0;

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

/*
 * The sched_switch tracepoint and cpu_release handlers are no longer needed.
 * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
 * tasks when a higher-priority scheduling class takes the CPU.
 */

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

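/*
 * Runs once a second: expedite any queued highpri tasks, refresh the cpuperf
 * statistics and, when requested, dump the shared DSQ and the event counters.
 */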
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

struct lowpri_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct lowpri_timer);
} lowpri_timer SEC(".maps");

/*
 * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, re-enqueueing is
 * triggered and the tasks are transferred to SHARED_DSQ.
 */
static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs && !sub_cgroup_id)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;
	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);
	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	if (ret)
		return ret;

	if (__COMPAT_has_generic_reenq()) {
		/* see lowpri_timerfn() */
		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
		if (!timer)
			return -ESRCH;
		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
		bpf_timer_set_callback(timer, lowpri_timerfn);
		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
		if (ret)
			return ret;
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

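/*
 * Record the attaching sub-scheduler's cgroup ID in the first free slot so
 * that qmap_dispatch() can delegate to it. Fails with -ENOSPC once all
 * MAX_SUB_SCHEDS slots are taken.
 */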
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (!sub_sched_cgroup_ids[i]) {
			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
			bpf_printk("attaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			return 0;
		}
	}

	return -ENOSPC;
}

void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
			sub_sched_cgroup_ids[i] = 0;
			bpf_printk("detaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			break;
		}
	}
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .tick = (void *)qmap_tick,
	       .core_sched_before = (void *)qmap_core_sched_before,
	       .init_task = (void *)qmap_init_task,
	       .dump = (void *)qmap_dump,
	       .dump_cpu = (void *)qmap_dump_cpu,
	       .dump_task = (void *)qmap_dump_task,
	       .cgroup_init = (void *)qmap_cgroup_init,
	       .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
	       .sub_attach = (void *)qmap_sub_attach,
	       .sub_detach = (void *)qmap_sub_detach,
	       .cpu_online = (void *)qmap_cpu_online,
	       .cpu_offline = (void *)qmap_cpu_offline,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .timeout_ms = 5000U,
	       .name = "qmap");