/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A simple five-level FIFO queue scheduler.
 *
 * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
 * assigned to one depending on its compound weight. Each CPU round robins
 * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
 * queue0, 2 from queue1, 4 from queue2 and so on.
 *
 * This scheduler demonstrates:
 *
 * - BPF-side queueing using PIDs.
 * - Sleepable per-task storage allocation using ops.init_task().
 * - Core-sched support.
 *
 * This scheduler is primarily for demonstration and testing of sched_ext
 * features and unlikely to be useful for actual workloads.
 *
 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
 * Copyright (c) 2022 David Vernet <dvernet@meta.com>
 */
#include <scx/common.bpf.h>

enum consts {
	ONE_SEC_IN_NS = 1000000000,
	ONE_MSEC_IN_NS = 1000000,
	LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS,
	SHARED_DSQ = 0,
	HIGHPRI_DSQ = 1,
	LOWPRI_DSQ = 2,
	HIGHPRI_WEIGHT = 8668,	/* this is what -20 maps to */
};

char _license[] SEC("license") = "GPL";

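/*
 * Tunables. These are const volatile so that the userspace part can set them
 * before the skeleton is loaded and the verifier treats them as constants.
 */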
const volatile u64 slice_ns;
const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
const volatile bool print_dsqs_and_events;
const volatile bool print_msgs;
const volatile u64 sub_cgroup_id;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;

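/*
 * nr_highpri_queued counts highpri tasks currently sitting in the FIFOs.
 * test_error_cnt, when set from userspace, counts down on each enqueue and
 * triggers scx_bpf_error() when it reaches zero.
 */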
u64 nr_highpri_queued;
u32 test_error_cnt;

#define MAX_SUB_SCHEDS 8
u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];

UEI_DEFINE(uei);

struct qmap {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} queue0 SEC(".maps"),
  queue1 SEC(".maps"),
  queue2 SEC(".maps"),
  queue3 SEC(".maps"),
  queue4 SEC(".maps"),
  dump_store SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 5);
	__type(key, int);
	__array(values, struct qmap);
} queue_arr SEC(".maps") = {
	.values = {
		[0] = &queue0,
		[1] = &queue1,
		[2] = &queue2,
		[3] = &queue3,
		[4] = &queue4,
	},
};

/*
 * If enabled, the CPU performance target is set based on the queue index
 * according to the following table.
 */
static const u32 qidx_to_cpuperf_target[] = {
	[0] = SCX_CPUPERF_ONE * 0 / 4,
	[1] = SCX_CPUPERF_ONE * 1 / 4,
	[2] = SCX_CPUPERF_ONE * 2 / 4,
	[3] = SCX_CPUPERF_ONE * 3 / 4,
	[4] = SCX_CPUPERF_ONE * 4 / 4,
};

/*
 * Per-queue sequence numbers to implement core-sched ordering.
 *
 * Tail seq is assigned to each queued task and incremented. Head seq tracks the
 * sequence number of the latest dispatched task. The distance between a task's
 * seq and the associated queue's head seq is called the queue distance and is
 * used when comparing two tasks for ordering. See qmap_core_sched_before().
 */
static u64 core_sched_head_seqs[5];
static u64 core_sched_tail_seqs[5];

/* Per-task scheduling context */
struct task_ctx {
	bool force_local;	/* Dispatch directly to local_dsq */
	bool highpri;
	u64 core_sched_seq;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

struct cpu_ctx {
	u64 dsp_idx;	/* dispatch index */
	u64 dsp_cnt;	/* remaining count */
	u32 avg_weight;
	u32 cpuperf_target;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");

/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;

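/*
 * Find a CPU for direct dispatch: @prev_cpu for pinned tasks (unless
 * always_enq_immed forces the full path) or when @prev_cpu is idle, otherwise
 * any idle CPU in @p's allowed mask. Returns -1 if nothing is idle.
 */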
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu;

	if (!always_enq_immed && p->nr_cpus_allowed == 1)
		return prev_cpu;

	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	return -1;
}

static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
}

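/*
 * If an idle CPU is found, mark the task force_local so that qmap_enqueue()
 * dispatches it straight to the local DSQ. Lowpri (weight < 2) user tasks
 * stay on @prev_cpu so they flow through LOWPRI_DSQ instead; see the lowpri
 * handling in qmap_enqueue().
 */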
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	struct task_ctx *tctx;
	s32 cpu;

	if (!(tctx = lookup_task_ctx(p)))
		return prev_cpu;

	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
		return prev_cpu;

	cpu = pick_direct_dispatch_cpu(p, prev_cpu);

	if (cpu >= 0) {
		tctx->force_local = true;
		return cpu;
	} else {
		return prev_cpu;
	}
}

static int weight_to_idx(u32 weight)
{
	/* Coarsely map the compound weight to a FIFO. */
	if (weight <= 25)
		return 0;
	else if (weight <= 50)
		return 1;
	else if (weight < 200)
		return 2;
	else if (weight < 400)
		return 3;
	else
		return 4;
}

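/*
 * Enqueue path: optionally inject stalls and stress direct dispatches for
 * testing, honor force_local from qmap_select_cpu(), divert lowpri tasks to
 * LOWPRI_DSQ, and finally push the PID onto the FIFO picked by
 * weight_to_idx(). FIFO overflow punts to SHARED_DSQ.
 */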
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
	static u32 user_cnt, kernel_cnt;
	struct task_ctx *tctx;
	u32 pid = p->pid;
	int idx = weight_to_idx(p->scx.weight);
	void *ring;
	s32 cpu;

	if (enq_flags & SCX_ENQ_REENQ) {
		__sync_fetch_and_add(&nr_reenqueued, 1);
		if (scx_bpf_task_cpu(p) == 0)
			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
	}

	if (p->flags & PF_KTHREAD) {
		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
			return;
	} else {
		if (stall_user_nth && !(++user_cnt % stall_user_nth))
			return;
	}

	if (test_error_cnt && !--test_error_cnt)
		scx_bpf_error("test triggering error");

	if (!(tctx = lookup_task_ctx(p)))
		return;

	/*
	 * All enqueued tasks must have their core_sched_seq updated for correct
	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
	 */
	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;

	/*
	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
	 * and exercise the kernel IMMED reenqueue trigger paths.
	 */
	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
		static u32 immed_stress_cnt;

		if (!(++immed_stress_cnt % immed_stress_nth)) {
			tctx->force_local = false;
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
					   slice_ns, enq_flags);
			return;
		}
	}

	/*
	 * If qmap_select_cpu() told us to enqueue locally, dispatch directly
	 * to the local DSQ.
	 */
	if (tctx->force_local) {
		tctx->force_local = false;
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
		return;
	}

	/* see lowpri_timerfn() */
	if (__COMPAT_has_generic_reenq() &&
	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
		return;
	}

	/* if select_cpu() wasn't called, try direct dispatch */
	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
		return;
	}

	/*
	 * If the task was re-enqueued due to the CPU being preempted by a
	 * higher priority scheduling class, just re-enqueue the task directly
	 * on the shared DSQ. As we want another CPU to pick it up, find and
	 * kick an idle CPU.
	 */
	if (enq_flags & SCX_ENQ_REENQ) {
		s32 cpu;

		scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
		if (cpu >= 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		return;
	}

	ring = bpf_map_lookup_elem(&queue_arr, &idx);
	if (!ring) {
		scx_bpf_error("failed to find ring %d", idx);
		return;
	}

	/* Queue on the selected FIFO. If the FIFO overflows, punt to the shared DSQ. */
	if (bpf_map_push_elem(ring, &pid, 0)) {
		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
		return;
	}

	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
		tctx->highpri = true;
		__sync_fetch_and_add(&nr_highpri_queued, 1);
	}
	__sync_fetch_and_add(&nr_enqueued, 1);
}

/*
 * The BPF queue map doesn't support removal and sched_ext can handle spurious
 * dispatches. qmap_dequeue() is only used to collect statistics.
 */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
	__sync_fetch_and_add(&nr_dequeued, 1);
	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
		__sync_fetch_and_add(&nr_core_sched_execed, 1);
}

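/*
 * Called when @p is dispatched. Recording its seq as the head seq of its
 * queue lets task_qdist() measure how far a task is from the most recently
 * dispatched one.
 */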
static void update_core_sched_head_seq(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;

	if ((tctx = lookup_task_ctx(p)))
		core_sched_head_seqs[idx] = tctx->core_sched_seq;
}

/*
 * To demonstrate the use of scx_bpf_dsq_move(), implement a silly selective
 * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 * moving them to HIGHPRI_DSQ and then consuming them first. This makes a minor
 * difference only when dsp_batch is larger than 1.
 *
 * scx_bpf_dsq_move[_vtime]() can be called both from ops.dispatch() and from
 * BPF programs that don't hold the rq lock. As a demonstration, this function
 * is called from both qmap_dispatch() and monitor_timerfn().
 */
static bool dispatch_highpri(bool from_timer)
{
	struct task_struct *p;
	s32 this_cpu = bpf_get_smp_processor_id();

	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		static u64 highpri_seq;
		struct task_ctx *tctx;

		if (!(tctx = lookup_task_ctx(p)))
			return false;

		if (tctx->highpri) {
			/* exercise the set_*() and vtime interface too */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
		}
	}

	/*
	 * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
	 * is found.
	 */
	bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
		bool dispatched = false;
		s32 cpu;

		if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
			cpu = this_cpu;
		else
			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);

		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
				     SCX_ENQ_PREEMPT)) {
			if (cpu == this_cpu) {
				dispatched = true;
				__sync_fetch_and_add(&nr_expedited_local, 1);
			} else {
				__sync_fetch_and_add(&nr_expedited_remote, 1);
			}
			if (from_timer)
				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
		} else {
			__sync_fetch_and_add(&nr_expedited_lost, 1);
		}

		if (dispatched)
			return true;
	}

	return false;
}

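/*
 * Dispatch path: expedite highpri tasks first, then try SHARED_DSQ, then
 * round robin through the five FIFOs moving up to 2^idx tasks from FIFO idx
 * (capped by dsp_batch and the dispatch buffer) into SHARED_DSQ. Falls back
 * to attached sub-schedulers and finally keeps @prev running with an updated
 * core_sched_seq.
 */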
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;
	struct cpu_ctx *cpuc;
	struct task_ctx *tctx;
	u32 zero = 0, batch = dsp_batch ?: 1;
	void *fifo;
	s32 i, pid;

	if (dispatch_highpri(false))
		return;

	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
		return;

	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
		/*
		 * PID 2 should be kthreadd which should mostly be idle and off
		 * the scheduler. Let's keep dispatching it to force the kernel
		 * to call this function over and over again.
		 */
		p = bpf_task_from_pid(2);
		if (p) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
			bpf_task_release(p);
			return;
		}
	}

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	for (i = 0; i < 5; i++) {
		/* Advance the dispatch cursor and pick the fifo. */
		if (!cpuc->dsp_cnt) {
			cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
		}

		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
		if (!fifo) {
			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
			return;
		}

		/* Dispatch or advance. */
		bpf_repeat(BPF_MAX_LOOPS) {
			struct task_ctx *tctx;

			if (bpf_map_pop_elem(fifo, &pid))
				break;

			p = bpf_task_from_pid(pid);
			if (!p)
				continue;

			if (!(tctx = lookup_task_ctx(p))) {
				bpf_task_release(p);
				return;
			}

			if (tctx->highpri)
				__sync_fetch_and_sub(&nr_highpri_queued, 1);

			update_core_sched_head_seq(p);
			__sync_fetch_and_add(&nr_dispatched, 1);

			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);

			/*
			 * scx_qmap uses a global BPF queue that any CPU's
			 * dispatch can pop from. If this CPU popped a task that
			 * can't run here, it gets stranded on SHARED_DSQ after
			 * consume_dispatch_q() skips it. Kick the task's home
			 * CPU so it drains SHARED_DSQ.
			 *
			 * There's a race between the pop and the flush of the
			 * buffered dsq_insert:
			 *
			 *   CPU 0 (dispatching)     CPU 1 (home, idle)
			 *   ~~~~~~~~~~~~~~~~~~~     ~~~~~~~~~~~~~~~~~~
			 *   pop from BPF queue
			 *   dsq_insert(buffered)
			 *                           balance:
			 *                             SHARED_DSQ empty
			 *                             BPF queue empty
			 *                             -> goes idle
			 *   flush -> on SHARED
			 *   kick CPU 1
			 *                           wakes, drains task
			 *
			 * The kick prevents indefinite stalls but a per-CPU
			 * kthread like ksoftirqd can be briefly stranded when
			 * its home CPU enters idle with softirq pending,
			 * triggering:
			 *
			 *   "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
			 *
			 * from report_idle_softirq(). The kick lands shortly
			 * after and the home CPU drains the task. This could be
			 * avoided by e.g. dispatching pinned tasks to local or
			 * global DSQs, but the current code is left as-is to
			 * document this class of issue -- other schedulers
			 * seeing similar warnings can use this as a reference.
			 */
			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);

			bpf_task_release(p);

			batch--;
			cpuc->dsp_cnt--;
			if (!batch || !scx_bpf_dispatch_nr_slots()) {
				if (dispatch_highpri(false))
					return;
				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
				return;
			}
			if (!cpuc->dsp_cnt)
				break;
		}

		cpuc->dsp_cnt = 0;
	}

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] &&
		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
			return;
	}

	/*
	 * No other tasks. @prev will keep running. Update its core_sched_seq as
	 * if the task were enqueued and dispatched immediately.
	 */
	if (prev) {
		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
		if (tctx)
			tctx->core_sched_seq =
				core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
	}
}

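/*
 * Called on every tick while @p is running; regulates the CPU's performance
 * level from a running average of the weights it has been executing.
 */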
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc;
	u32 zero = 0;
	int idx;

	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
		scx_bpf_error("failed to look up cpu_ctx");
		return;
	}

	/*
	 * Use the running avg of weights to select the target cpuperf level.
	 * This is a demonstration of the cpuperf feature rather than a
	 * practical strategy to regulate CPU frequency.
	 */
	cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
	idx = weight_to_idx(cpuc->avg_weight);
	cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];

	scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
}

/*
 * The distance from the head of the queue scaled by the weight of the queue.
 * The lower the number, the older the task and the higher the priority.
 */
static s64 task_qdist(struct task_struct *p)
{
	int idx = weight_to_idx(p->scx.weight);
	struct task_ctx *tctx;
	s64 qdist;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
	if (!tctx)
		return 0;

	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];

	/*
	 * As queue index increments, the priority doubles. The queue with
	 * index 3 is dispatched twice as frequently as the one with index 2.
	 * Reflect the difference by scaling qdists accordingly. Note that the
	 * shift amount needs to be flipped depending on the sign to avoid
	 * flipping the priority direction.
	 */
	if (qdist >= 0)
		return qdist << (4 - idx);
	else
		return qdist << idx;
}

/*
 * This is called to determine the task ordering when core-sched is picking
 * tasks to execute on SMT siblings and should encode about the same ordering as
 * the regular scheduling path. Use the priority-scaled distances from the head
 * of the queues to compare the two tasks which should be consistent with the
 * dispatch path behavior.
 */
bool BPF_STRUCT_OPS(qmap_core_sched_before,
		    struct task_struct *a, struct task_struct *b)
{
	return task_qdist(a) > task_qdist(b);
}

/*
 * The sched_switch tracepoint and cpu_release handlers are no longer needed.
 * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
 * tasks when a higher-priority scheduling class takes the CPU.
 */

s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (p->tgid == disallow_tgid)
		p->scx.disallow = true;

	/*
	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
	 * in this function and the following will automatically use GFP_KERNEL.
	 */
	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE))
		return 0;
	else
		return -ENOMEM;
}

void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
	s32 i, pid;

	if (suppress_dump)
		return;

	bpf_for(i, 0, 5) {
		void *fifo;

		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
			return;

		scx_bpf_dump("QMAP FIFO[%d]:", i);

		/*
		 * Dump can be invoked anytime and there is no way to iterate in
		 * a non-destructive way. Pop and store in dump_store and then
		 * restore afterwards. If racing against new enqueues, ordering
		 * can get mixed up.
		 */
		bpf_repeat(4096) {
			if (bpf_map_pop_elem(fifo, &pid))
				break;
			bpf_map_push_elem(&dump_store, &pid, 0);
			scx_bpf_dump(" %d", pid);
		}

		bpf_repeat(4096) {
			if (bpf_map_pop_elem(&dump_store, &pid))
				break;
			bpf_map_push_elem(fifo, &pid, 0);
		}

		scx_bpf_dump("\n");
	}
}

void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
	u32 zero = 0;
	struct cpu_ctx *cpuc;

	if (suppress_dump || idle)
		return;
	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
		return;

	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
		     cpuc->cpuperf_target);
}

void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
	struct task_ctx *taskc;

	if (suppress_dump)
		return;
	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
		return;

	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
		     taskc->force_local, taskc->core_sched_seq);
}

s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
	if (print_msgs)
		bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, args->weight, args->bw_period_us,
			   args->bw_quota_us, args->bw_burst_us);
	return 0;
}

void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}

void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	if (print_msgs)
		bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
			   cgrp->kn->id, period_us, quota_us, burst_us);
}

/*
 * Print out the online and possible CPU map using bpf_printk() as a
 * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 */
static void print_cpus(void)
{
	const struct cpumask *possible, *online;
	s32 cpu;
	char buf[128] = "", *p;
	int idx;

	possible = scx_bpf_get_possible_cpumask();
	online = scx_bpf_get_online_cpumask();

	idx = 0;
	bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
		if (!(p = MEMBER_VPTR(buf, [idx++])))
			break;
		if (bpf_cpumask_test_cpu(cpu, online))
			*p++ = 'O';
		else if (bpf_cpumask_test_cpu(cpu, possible))
			*p++ = 'X';
		else
			*p++ = ' ';

		if ((cpu & 7) == 7) {
			if (!(p = MEMBER_VPTR(buf, [idx++])))
				break;
			*p++ = '|';
		}
	}
	buf[sizeof(buf) - 1] = '\0';

	scx_bpf_put_cpumask(online);
	scx_bpf_put_cpumask(possible);

	bpf_printk("CPUS: |%s", buf);
}

void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d coming online", cpu);
		/* @cpu is already online at this point */
		print_cpus();
	}
}

void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
	if (print_msgs) {
		bpf_printk("CPU %d going offline", cpu);
		/* @cpu is still online at this point */
		print_cpus();
	}
}

struct monitor_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct monitor_timer);
} monitor_timer SEC(".maps");

/*
 * Print out the min, avg and max performance levels of CPUs every second to
 * demonstrate the cpuperf interface.
 */
static void monitor_cpuperf(void)
{
	u32 zero = 0, nr_cpu_ids;
	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
	const struct cpumask *online;
	int i, nr_online_cpus = 0;

	nr_cpu_ids = scx_bpf_nr_cpu_ids();
	online = scx_bpf_get_online_cpumask();

	bpf_for(i, 0, nr_cpu_ids) {
		struct cpu_ctx *cpuc;
		u32 cap, cur;

		if (!bpf_cpumask_test_cpu(i, online))
			continue;
		nr_online_cpus++;

		/* collect the capacity and current cpuperf */
		cap = scx_bpf_cpuperf_cap(i);
		cur = scx_bpf_cpuperf_cur(i);

		cur_min = cur < cur_min ? cur : cur_min;
		cur_max = cur > cur_max ? cur : cur_max;

		/*
		 * $cur is relative to $cap. Scale it down accordingly so that
		 * it's in the same scale as other CPUs and $cur_sum/$cap_sum
		 * makes sense.
		 */
		cur_sum += cur * cap / SCX_CPUPERF_ONE;
		cap_sum += cap;

		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
			scx_bpf_error("failed to look up cpu_ctx");
			goto out;
		}

		/* collect target */
		cur = cpuc->cpuperf_target;
		target_sum += cur;
		target_min = cur < target_min ? cur : target_min;
		target_max = cur > target_max ? cur : target_max;
	}

	cpuperf_min = cur_min;
	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
	cpuperf_max = cur_max;

	cpuperf_target_min = target_min;
	cpuperf_target_avg = target_sum / nr_online_cpus;
	cpuperf_target_max = target_max;
out:
	scx_bpf_put_cpumask(online);
}

/*
 * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 * see meaningful dumps in the trace pipe.
 */
static void dump_shared_dsq(void)
{
	struct task_struct *p;
	s32 nr;

	if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
		return;

	bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);

	bpf_rcu_read_lock();
	bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
		bpf_printk("%s[%d]", p->comm, p->pid);
	bpf_rcu_read_unlock();
}

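/*
 * Runs once a second: expedite any queued highpri tasks, refresh the cpuperf
 * statistics and, when requested, dump the shared DSQ and the event counters.
 */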
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	bpf_rcu_read_lock();
	dispatch_highpri(true);
	bpf_rcu_read_unlock();

	monitor_cpuperf();

	if (print_dsqs_and_events) {
		struct scx_event_stats events;

		dump_shared_dsq();

		__COMPAT_scx_bpf_events(&events, sizeof(events));

		bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
			   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
			   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
		bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
			   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
		bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
			   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
		bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
			   scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
			   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
			   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
		bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
			   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
	}

	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	return 0;
}

struct lowpri_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct lowpri_timer);
} lowpri_timer SEC(".maps");

/*
 * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, re-enqueueing is
 * triggered and the tasks are transferred to SHARED_DSQ.
 */
static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
	return 0;
}

s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
	u32 key = 0;
	struct bpf_timer *timer;
	s32 ret;

	if (print_msgs && !sub_cgroup_id)
		print_cpus();

	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
	if (ret) {
		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
		return ret;
	}

	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
	if (ret)
		return ret;

	timer = bpf_map_lookup_elem(&monitor_timer, &key);
	if (!timer)
		return -ESRCH;
	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
	bpf_timer_set_callback(timer, monitor_timerfn);
	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
	if (ret)
		return ret;

	if (__COMPAT_has_generic_reenq()) {
		/* see lowpri_timerfn() */
		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
		if (!timer)
			return -ESRCH;
		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
		bpf_timer_set_callback(timer, lowpri_timerfn);
		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
		if (ret)
			return ret;
	}

	return 0;
}

void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

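/*
 * Record the attaching sub-scheduler's cgroup ID in the first free slot so
 * that qmap_dispatch() can delegate to it. Fails with -ENOSPC once all
 * MAX_SUB_SCHEDS slots are taken.
 */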
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (!sub_sched_cgroup_ids[i]) {
			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
			bpf_printk("attaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			return 0;
		}
	}

	return -ENOSPC;
}

void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
{
	s32 i;

	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
			sub_sched_cgroup_ids[i] = 0;
			bpf_printk("detaching sub-sched[%d] on %s",
				   i, args->cgroup_path);
			break;
		}
	}
}

SCX_OPS_DEFINE(qmap_ops,
	       .select_cpu = (void *)qmap_select_cpu,
	       .enqueue = (void *)qmap_enqueue,
	       .dequeue = (void *)qmap_dequeue,
	       .dispatch = (void *)qmap_dispatch,
	       .tick = (void *)qmap_tick,
	       .core_sched_before = (void *)qmap_core_sched_before,
	       .init_task = (void *)qmap_init_task,
	       .dump = (void *)qmap_dump,
	       .dump_cpu = (void *)qmap_dump_cpu,
	       .dump_task = (void *)qmap_dump_task,
	       .cgroup_init = (void *)qmap_cgroup_init,
	       .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
	       .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
	       .sub_attach = (void *)qmap_sub_attach,
	       .sub_detach = (void *)qmap_sub_detach,
	       .cpu_online = (void *)qmap_cpu_online,
	       .cpu_offline = (void *)qmap_cpu_offline,
	       .init = (void *)qmap_init,
	       .exit = (void *)qmap_exit,
	       .timeout_ms = 5000U,
	       .name = "qmap");