Merge tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+36

Documentation/scheduler/sched-ext.rst

··· 294 294 the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` 295 295 for more information. 296 296 297 + Task Lifecycle 298 + -------------- 299 + 300 + The following pseudo-code summarizes the entire lifecycle of a task managed 301 + by a sched_ext scheduler: 302 + 303 + .. code-block:: c 304 + 305 + ops.init_task(); /* A new task is created */ 306 + ops.enable(); /* Enable BPF scheduling for the task */ 307 + 308 + while (task in SCHED_EXT) { 309 + if (task can migrate) 310 + ops.select_cpu(); /* Called on wakeup (optimization) */ 311 + 312 + ops.runnable(); /* Task becomes ready to run */ 313 + 314 + while (task is runnable) { 315 + if (task is not in a DSQ) { 316 + ops.enqueue(); /* Task can be added to a DSQ */ 317 + 318 + /* A CPU becomes available */ 319 + 320 + ops.dispatch(); /* Task is moved to a local DSQ */ 321 + } 322 + ops.running(); /* Task starts running on its assigned CPU */ 323 + ops.tick(); /* Called every 1/HZ seconds */ 324 + ops.stopping(); /* Task stops running (time slice expires or wait) */ 325 + } 326 + 327 + ops.quiescent(); /* Task releases its assigned CPU (wait) */ 328 + } 329 + 330 + ops.disable(); /* Disable BPF scheduling for the task */ 331 + ops.exit_task(); /* Task is destroyed */ 332 + 297 333 Where to Look 298 334 ============= 299 335

+1 -2

MAINTAINERS

··· 21196 21196 W: https://github.com/sched-ext/scx 21197 21197 T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git 21198 21198 F: include/linux/sched/ext.h 21199 - F: kernel/sched/ext.h 21200 - F: kernel/sched/ext.c 21199 + F: kernel/sched/ext* 21201 21200 F: tools/sched_ext/ 21202 21201 F: tools/testing/selftests/sched_ext 21203 21202

+7 -1

include/linux/nodemask.h

··· 94 94 #include <linux/bitmap.h> 95 95 #include <linux/minmax.h> 96 96 #include <linux/nodemask_types.h> 97 - #include <linux/numa.h> 98 97 #include <linux/random.h> 99 98 100 99 extern nodemask_t _unused_nodemask_arg_; ··· 188 189 const nodemask_t *src2p, unsigned int nbits) 189 190 { 190 191 bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); 192 + } 193 + 194 + #define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) 195 + static __always_inline void __nodes_copy(nodemask_t *dstp, 196 + const nodemask_t *srcp, unsigned int nbits) 197 + { 198 + bitmap_copy(dstp->bits, srcp->bits, nbits); 191 199 } 192 200 193 201 #define nodes_complement(dst, src) \

+10 -1

include/linux/nodemask_types.h

··· 3 3 #define __LINUX_NODEMASK_TYPES_H 4 4 5 5 #include <linux/bitops.h> 6 - #include <linux/numa.h> 6 + 7 + #ifdef CONFIG_NODES_SHIFT 8 + #define NODES_SHIFT CONFIG_NODES_SHIFT 9 + #else 10 + #define NODES_SHIFT 0 11 + #endif 12 + 13 + #define MAX_NUMNODES (1 << NODES_SHIFT) 14 + 15 + #define NUMA_NO_NODE (-1) 7 16 8 17 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; 9 18

+8 -9

include/linux/numa.h

··· 3 3 #define _LINUX_NUMA_H 4 4 #include <linux/init.h> 5 5 #include <linux/types.h> 6 + #include <linux/nodemask.h> 6 7 7 - #ifdef CONFIG_NODES_SHIFT 8 - #define NODES_SHIFT CONFIG_NODES_SHIFT 9 - #else 10 - #define NODES_SHIFT 0 11 - #endif 12 - 13 - #define MAX_NUMNODES (1 << NODES_SHIFT) 14 - 15 - #define NUMA_NO_NODE (-1) 16 8 #define NUMA_NO_MEMBLK (-1) 17 9 18 10 static inline bool numa_valid_node(int nid) ··· 31 39 /* Generic implementation available */ 32 40 int numa_nearest_node(int node, unsigned int state); 33 41 42 + int nearest_node_nodemask(int node, nodemask_t *mask); 43 + 34 44 #ifndef memory_add_physaddr_to_nid 35 45 int memory_add_physaddr_to_nid(u64 start); 36 46 #endif ··· 45 51 46 52 #else /* !CONFIG_NUMA */ 47 53 static inline int numa_nearest_node(int node, unsigned int state) 54 + { 55 + return NUMA_NO_NODE; 56 + } 57 + 58 + static inline int nearest_node_nodemask(int node, nodemask_t *mask) 48 59 { 49 60 return NUMA_NO_NODE; 50 61 }

+1

include/linux/sched/ext.h

··· 146 146 u32 weight; 147 147 s32 sticky_cpu; 148 148 s32 holding_cpu; 149 + s32 selected_cpu; 149 150 u32 kf_mask; /* see scx_kf_mask above */ 150 151 struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ 151 152 atomic_long_t ops_state;

+30

include/linux/topology.h

··· 262 262 #endif /* CONFIG_NUMA */ 263 263 264 264 /** 265 + * for_each_node_numadist() - iterate over nodes in increasing distance 266 + * order, starting from a given node 267 + * @node: the iteration variable and the starting node. 268 + * @unvisited: a nodemask to keep track of the unvisited nodes. 269 + * 270 + * This macro iterates over NUMA node IDs in increasing distance from the 271 + * starting @node and yields MAX_NUMNODES when all the nodes have been 272 + * visited. 273 + * 274 + * Note that by the time the loop completes, the @unvisited nodemask will 275 + * be fully cleared, unless the loop exits early. 276 + * 277 + * The difference between for_each_node() and for_each_node_numadist() is 278 + * that the former iterates over nodes in numerical order, whereas 279 + * the latter iterates over nodes in increasing order of distance. 280 + * 281 + * The complexity of this iterator is O(N^2), where N represents the 282 + * number of nodes, as each iteration involves scanning all nodes to 283 + * find the one with the shortest distance. 284 + * 285 + * Requires rcu_lock to be held. 286 + */ 287 + #define for_each_node_numadist(node, unvisited) \ 288 + for (int __start = (node), \ 289 + (node) = nearest_node_nodemask((__start), &(unvisited)); \ 290 + (node) < MAX_NUMNODES; \ 291 + node_clear((node), (unvisited)), \ 292 + (node) = nearest_node_nodemask((__start), &(unvisited))) 293 + 294 + /** 265 295 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance 266 296 * from a given node. 267 297 * @mask: the iteration variable.

+19

include/trace/events/sched_ext.h

··· 26 26 ) 27 27 ); 28 28 29 + TRACE_EVENT(sched_ext_event, 30 + TP_PROTO(const char *name, __s64 delta), 31 + TP_ARGS(name, delta), 32 + 33 + TP_STRUCT__entry( 34 + __string(name, name) 35 + __field( __s64, delta ) 36 + ), 37 + 38 + TP_fast_assign( 39 + __assign_str(name); 40 + __entry->delta = delta; 41 + ), 42 + 43 + TP_printk("name %s delta %lld", 44 + __get_str(name), __entry->delta 45 + ) 46 + ); 47 + 29 48 #endif /* _TRACE_SCHED_EXT_H */ 30 49 31 50 /* This part must be outside protection */

+1

kernel/sched/build_policy.c

··· 61 61 62 62 #ifdef CONFIG_SCHED_CLASS_EXT 63 63 # include "ext.c" 64 + # include "ext_idle.c" 64 65 #endif 65 66 66 67 #include "syscalls.c"

+2 -7

kernel/sched/core.c

··· 3922 3922 3923 3923 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) 3924 3924 { 3925 - /* 3926 - * The BPF scheduler may depend on select_task_rq() being invoked during 3927 - * wakeups. In addition, @p may end up executing on a different CPU 3928 - * regardless of what happens in the wakeup path making the ttwu_queue 3929 - * optimization less meaningful. Skip if on SCX. 3930 - */ 3931 - if (task_on_scx(p)) 3925 + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ 3926 + if (!scx_allow_ttwu_queue(p)) 3932 3927 return false; 3933 3928 3934 3929 /*

+328 -757

kernel/sched/ext.c

··· 6 6 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 7 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 8 */ 9 + #include <linux/btf_ids.h> 10 + #include "ext_idle.h" 11 + 9 12 #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 10 13 11 14 enum scx_consts { ··· 96 93 /* 97 94 * Keep built-in idle tracking even if ops.update_idle() is implemented. 98 95 */ 99 - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, 96 + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, 100 97 101 98 /* 102 99 * By default, if there are no other task to run on the CPU, ext core ··· 104 101 * flag is specified, such tasks are passed to ops.enqueue() with 105 102 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. 106 103 */ 107 - SCX_OPS_ENQ_LAST = 1LLU << 1, 104 + SCX_OPS_ENQ_LAST = 1LLU << 1, 108 105 109 106 /* 110 107 * An exiting task may schedule after PF_EXITING is set. In such cases, ··· 117 114 * depend on pid lookups and wants to handle these tasks directly, the 118 115 * following flag can be used. 119 116 */ 120 - SCX_OPS_ENQ_EXITING = 1LLU << 2, 117 + SCX_OPS_ENQ_EXITING = 1LLU << 2, 121 118 122 119 /* 123 120 * If set, only tasks with policy set to SCHED_EXT are attached to 124 121 * sched_ext. If clear, SCHED_NORMAL tasks are also included. 125 122 */ 126 - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 123 + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 127 124 128 125 /* 129 126 * A migration disabled task can only execute on its current CPU. By ··· 136 133 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr 137 134 * and thus may disagree with cpumask_weight(p->cpus_ptr). 138 135 */ 139 - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, 136 + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, 137 + 138 + /* 139 + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes 140 + * ops.enqueue() on the ops.select_cpu() selected or the wakee's 141 + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline 142 + * transfers. 
When this optimization is enabled, ops.select_cpu() is 143 + * skipped in some cases (when racing against the wakee switching out). 144 + * As the BPF scheduler may depend on ops.select_cpu() being invoked 145 + * during wakeups, queued wakeup is disabled by default. 146 + * 147 + * If this ops flag is set, queued wakeup optimization is enabled and 148 + * the BPF scheduler must be able to handle ops.enqueue() invoked on the 149 + * wakee's CPU without preceding ops.select_cpu() even for tasks which 150 + * may be executed on multiple CPUs. 151 + */ 152 + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, 153 + 154 + /* 155 + * If set, enable per-node idle cpumasks. If clear, use a single global 156 + * flat idle cpumask. 157 + */ 158 + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, 140 159 141 160 /* 142 161 * CPU cgroup support flags ··· 169 144 SCX_OPS_ENQ_LAST | 170 145 SCX_OPS_ENQ_EXITING | 171 146 SCX_OPS_ENQ_MIGRATION_DISABLED | 147 + SCX_OPS_ALLOW_QUEUED_WAKEUP | 172 148 SCX_OPS_SWITCH_PARTIAL | 149 + SCX_OPS_BUILTIN_IDLE_PER_NODE | 173 150 SCX_OPS_HAS_CGROUP_WEIGHT, 174 151 }; 175 152 ··· 806 779 807 780 enum scx_pick_idle_cpu_flags { 808 781 SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ 782 + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ 809 783 }; 810 784 811 785 enum scx_kick_flags { ··· 922 894 static struct sched_ext_ops scx_ops; 923 895 static bool scx_warned_zero_slice; 924 896 897 + DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); 925 898 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); 926 899 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); 927 900 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); 928 901 static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); 929 - static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); 930 - 931 - #ifdef CONFIG_SMP 932 - static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); 933 - static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); 
934 - #endif 935 902 936 903 static struct static_key_false scx_has_op[SCX_OPI_END] = 937 904 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; ··· 960 937 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; 961 938 962 939 static struct delayed_work scx_watchdog_work; 963 - 964 - /* idle tracking */ 965 - #ifdef CONFIG_SMP 966 - #ifdef CONFIG_CPUMASK_OFFSTACK 967 - #define CL_ALIGNED_IF_ONSTACK 968 - #else 969 - #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp 970 - #endif 971 - 972 - static struct { 973 - cpumask_var_t cpu; 974 - cpumask_var_t smt; 975 - } idle_masks CL_ALIGNED_IF_ONSTACK; 976 - 977 - #endif /* CONFIG_SMP */ 978 940 979 941 /* for %SCX_KICK_WAIT */ 980 942 static unsigned long __percpu *scx_kick_cpus_pnt_seqs; ··· 1480 1472 1481 1473 return p; 1482 1474 } 1475 + 1476 + /* 1477 + * Collection of event counters. Event types are placed in descending order. 1478 + */ 1479 + struct scx_event_stats { 1480 + /* 1481 + * If ops.select_cpu() returns a CPU which can't be used by the task, 1482 + * the core scheduler code silently picks a fallback CPU. 1483 + */ 1484 + s64 SCX_EV_SELECT_CPU_FALLBACK; 1485 + 1486 + /* 1487 + * When dispatching to a local DSQ, the CPU may have gone offline in 1488 + * the meantime. In this case, the task is bounced to the global DSQ. 1489 + */ 1490 + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; 1491 + 1492 + /* 1493 + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task 1494 + * continued to run because there were no other tasks on the CPU. 1495 + */ 1496 + s64 SCX_EV_DISPATCH_KEEP_LAST; 1497 + 1498 + /* 1499 + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task 1500 + * is dispatched to a local DSQ when exiting. 1501 + */ 1502 + s64 SCX_EV_ENQ_SKIP_EXITING; 1503 + 1504 + /* 1505 + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a 1506 + * migration disabled task skips ops.enqueue() and is dispatched to its 1507 + * local DSQ. 
1508 + */ 1509 + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; 1510 + 1511 + /* 1512 + * The total number of tasks enqueued (or pick_task-ed) with a 1513 + * default time slice (SCX_SLICE_DFL). 1514 + */ 1515 + s64 SCX_EV_ENQ_SLICE_DFL; 1516 + 1517 + /* 1518 + * The total duration of bypass modes in nanoseconds. 1519 + */ 1520 + s64 SCX_EV_BYPASS_DURATION; 1521 + 1522 + /* 1523 + * The number of tasks dispatched in the bypassing mode. 1524 + */ 1525 + s64 SCX_EV_BYPASS_DISPATCH; 1526 + 1527 + /* 1528 + * The number of times the bypassing mode has been activated. 1529 + */ 1530 + s64 SCX_EV_BYPASS_ACTIVATE; 1531 + }; 1532 + 1533 + /* 1534 + * The event counter is organized by a per-CPU variable to minimize the 1535 + * accounting overhead without synchronization. A system-wide view on the 1536 + * event counter is constructed when requested by scx_bpf_events(). 1537 + */ 1538 + static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); 1539 + 1540 + /** 1541 + * scx_add_event - Increase an event counter for 'name' by 'cnt' 1542 + * @name: an event name defined in struct scx_event_stats 1543 + * @cnt: the number of times the event occurred 1544 + * 1545 + * This can be used when preemption is not disabled. 1546 + */ 1547 + #define scx_add_event(name, cnt) do { \ 1548 + this_cpu_add(event_stats_cpu.name, cnt); \ 1549 + trace_sched_ext_event(#name, cnt); \ 1550 + } while(0) 1551 + 1552 + /** 1553 + * __scx_add_event - Increase an event counter for 'name' by 'cnt' 1554 + * @name: an event name defined in struct scx_event_stats 1555 + * @cnt: the number of times the event occurred 1556 + * 1557 + * This should be used only when preemption is disabled. 
1558 + */ 1559 + #define __scx_add_event(name, cnt) do { \ 1560 + __this_cpu_add(event_stats_cpu.name, cnt); \ 1561 + trace_sched_ext_event(#name, cnt); \ 1562 + } while(0) 1563 + 1564 + /** 1565 + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' 1566 + * @dst_e: destination event stats 1567 + * @src_e: source event stats 1568 + * @kind: a kind of event to be aggregated 1569 + */ 1570 + #define scx_agg_event(dst_e, src_e, kind) do { \ 1571 + (dst_e)->kind += READ_ONCE((src_e)->kind); \ 1572 + } while(0) 1573 + 1574 + /** 1575 + * scx_dump_event - Dump an event 'kind' in 'events' to 's' 1576 + * @s: output seq_buf 1577 + * @events: event stats 1578 + * @kind: a kind of event to dump 1579 + */ 1580 + #define scx_dump_event(s, events, kind) do { \ 1581 + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ 1582 + } while (0) 1583 + 1584 + 1585 + static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); 1483 1586 1484 1587 static enum scx_ops_enable_state scx_ops_enable_state(void) 1485 1588 { ··· 2137 2018 if (!scx_rq_online(rq)) 2138 2019 goto local; 2139 2020 2140 - if (scx_rq_bypassing(rq)) 2021 + if (scx_rq_bypassing(rq)) { 2022 + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); 2141 2023 goto global; 2024 + } 2142 2025 2143 2026 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) 2144 2027 goto direct; 2145 2028 2146 2029 /* see %SCX_OPS_ENQ_EXITING */ 2147 2030 if (!static_branch_unlikely(&scx_ops_enq_exiting) && 2148 - unlikely(p->flags & PF_EXITING)) 2031 + unlikely(p->flags & PF_EXITING)) { 2032 + __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); 2149 2033 goto local; 2034 + } 2150 2035 2151 2036 /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ 2152 2037 if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && 2153 - is_migration_disabled(p)) 2038 + is_migration_disabled(p)) { 2039 + __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); 2154 2040 goto local; 2041 + } 2155 2042 2156 2043 if (!SCX_HAS_OP(enqueue)) 2157 2044 
goto global; ··· 2197 2072 */ 2198 2073 touch_core_sched(rq, p); 2199 2074 p->scx.slice = SCX_SLICE_DFL; 2075 + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); 2200 2076 local_norefill: 2201 2077 dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); 2202 2078 return; ··· 2205 2079 global: 2206 2080 touch_core_sched(rq, p); /* see the comment in local: */ 2207 2081 p->scx.slice = SCX_SLICE_DFL; 2082 + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); 2208 2083 dispatch_enqueue(find_global_dsq(p), p, enq_flags); 2209 2084 } 2210 2085 ··· 2277 2150 do_enqueue_task(rq, p, enq_flags, sticky_cpu); 2278 2151 out: 2279 2152 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; 2153 + 2154 + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && 2155 + unlikely(cpu_of(rq) != p->scx.selected_cpu)) 2156 + __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); 2280 2157 } 2281 2158 2282 2159 static void ops_dequeue(struct task_struct *p, u64 deq_flags) ··· 2468 2337 * The caller must ensure that @p and @rq are on different CPUs. 2469 2338 */ 2470 2339 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, 2471 - bool trigger_error) 2340 + bool enforce) 2472 2341 { 2473 2342 int cpu = cpu_of(rq); 2474 2343 ··· 2487 2356 * easily be masked if task_allowed_on_cpu() is done first. 2488 2357 */ 2489 2358 if (unlikely(is_migration_disabled(p))) { 2490 - if (trigger_error) 2359 + if (enforce) 2491 2360 scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", 2492 2361 p->comm, p->pid, task_cpu(p), cpu); 2493 2362 return false; ··· 2500 2369 * picked CPU is outside the allowed mask. 
2501 2370 */ 2502 2371 if (!task_allowed_on_cpu(p, cpu)) { 2503 - if (trigger_error) 2372 + if (enforce) 2504 2373 scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", 2505 2374 cpu, p->comm, p->pid); 2506 2375 return false; 2507 2376 } 2508 2377 2509 - if (!scx_rq_online(rq)) 2378 + if (!scx_rq_online(rq)) { 2379 + if (enforce) 2380 + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); 2510 2381 return false; 2382 + } 2511 2383 2512 2384 return true; 2513 2385 } ··· 2580 2446 } 2581 2447 #else /* CONFIG_SMP */ 2582 2448 static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } 2583 - static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } 2449 + static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } 2584 2450 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } 2585 2451 #endif /* CONFIG_SMP */ 2586 2452 ··· 3027 2893 if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || 3028 2894 scx_rq_bypassing(rq))) { 3029 2895 rq->scx.flags |= SCX_RQ_BAL_KEEP; 2896 + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); 3030 2897 goto has_tasks; 3031 2898 } 3032 2899 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; ··· 3294 3159 */ 3295 3160 if (keep_prev) { 3296 3161 p = prev; 3297 - if (!p->scx.slice) 3162 + if (!p->scx.slice) { 3298 3163 p->scx.slice = SCX_SLICE_DFL; 3164 + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); 3165 + } 3299 3166 } else { 3300 3167 p = first_local_task(rq); 3301 3168 if (!p) { ··· 3313 3176 scx_warned_zero_slice = true; 3314 3177 } 3315 3178 p->scx.slice = SCX_SLICE_DFL; 3179 + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); 3316 3180 } 3317 3181 } 3318 3182 ··· 3358 3220 3359 3221 #ifdef CONFIG_SMP 3360 3222 3361 - static bool 
test_and_clear_cpu_idle(int cpu) 3362 - { 3363 - #ifdef CONFIG_SCHED_SMT 3364 - /* 3365 - * SMT mask should be cleared whether we can claim @cpu or not. The SMT 3366 - * cluster is not wholly idle either way. This also prevents 3367 - * scx_pick_idle_cpu() from getting caught in an infinite loop. 3368 - */ 3369 - if (sched_smt_active()) { 3370 - const struct cpumask *smt = cpu_smt_mask(cpu); 3371 - 3372 - /* 3373 - * If offline, @cpu is not its own sibling and 3374 - * scx_pick_idle_cpu() can get caught in an infinite loop as 3375 - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu 3376 - * is eventually cleared. 3377 - * 3378 - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to 3379 - * reduce memory writes, which may help alleviate cache 3380 - * coherence pressure. 3381 - */ 3382 - if (cpumask_intersects(smt, idle_masks.smt)) 3383 - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 3384 - else if (cpumask_test_cpu(cpu, idle_masks.smt)) 3385 - __cpumask_clear_cpu(cpu, idle_masks.smt); 3386 - } 3387 - #endif 3388 - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); 3389 - } 3390 - 3391 - static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) 3392 - { 3393 - int cpu; 3394 - 3395 - retry: 3396 - if (sched_smt_active()) { 3397 - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); 3398 - if (cpu < nr_cpu_ids) 3399 - goto found; 3400 - 3401 - if (flags & SCX_PICK_IDLE_CORE) 3402 - return -EBUSY; 3403 - } 3404 - 3405 - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); 3406 - if (cpu >= nr_cpu_ids) 3407 - return -EBUSY; 3408 - 3409 - found: 3410 - if (test_and_clear_cpu_idle(cpu)) 3411 - return cpu; 3412 - else 3413 - goto retry; 3414 - } 3415 - 3416 - /* 3417 - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC 3418 - * domain is not defined). 
3419 - */ 3420 - static unsigned int llc_weight(s32 cpu) 3421 - { 3422 - struct sched_domain *sd; 3423 - 3424 - sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3425 - if (!sd) 3426 - return 0; 3427 - 3428 - return sd->span_weight; 3429 - } 3430 - 3431 - /* 3432 - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC 3433 - * domain is not defined). 3434 - */ 3435 - static struct cpumask *llc_span(s32 cpu) 3436 - { 3437 - struct sched_domain *sd; 3438 - 3439 - sd = rcu_dereference(per_cpu(sd_llc, cpu)); 3440 - if (!sd) 3441 - return 0; 3442 - 3443 - return sched_domain_span(sd); 3444 - } 3445 - 3446 - /* 3447 - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the 3448 - * NUMA domain is not defined). 3449 - */ 3450 - static unsigned int numa_weight(s32 cpu) 3451 - { 3452 - struct sched_domain *sd; 3453 - struct sched_group *sg; 3454 - 3455 - sd = rcu_dereference(per_cpu(sd_numa, cpu)); 3456 - if (!sd) 3457 - return 0; 3458 - sg = sd->groups; 3459 - if (!sg) 3460 - return 0; 3461 - 3462 - return sg->group_weight; 3463 - } 3464 - 3465 - /* 3466 - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA 3467 - * domain is not defined). 3468 - */ 3469 - static struct cpumask *numa_span(s32 cpu) 3470 - { 3471 - struct sched_domain *sd; 3472 - struct sched_group *sg; 3473 - 3474 - sd = rcu_dereference(per_cpu(sd_numa, cpu)); 3475 - if (!sd) 3476 - return NULL; 3477 - sg = sd->groups; 3478 - if (!sg) 3479 - return NULL; 3480 - 3481 - return sched_group_span(sg); 3482 - } 3483 - 3484 - /* 3485 - * Return true if the LLC domains do not perfectly overlap with the NUMA 3486 - * domains, false otherwise. 3487 - */ 3488 - static bool llc_numa_mismatch(void) 3489 - { 3490 - int cpu; 3491 - 3492 - /* 3493 - * We need to scan all online CPUs to verify whether their scheduling 3494 - * domains overlap. 
3495 - * 3496 - * While it is rare to encounter architectures with asymmetric NUMA 3497 - * topologies, CPU hotplugging or virtualized environments can result 3498 - * in asymmetric configurations. 3499 - * 3500 - * For example: 3501 - * 3502 - * NUMA 0: 3503 - * - LLC 0: cpu0..cpu7 3504 - * - LLC 1: cpu8..cpu15 [offline] 3505 - * 3506 - * NUMA 1: 3507 - * - LLC 0: cpu16..cpu23 3508 - * - LLC 1: cpu24..cpu31 3509 - * 3510 - * In this case, if we only check the first online CPU (cpu0), we might 3511 - * incorrectly assume that the LLC and NUMA domains are fully 3512 - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC 3513 - * domains). 3514 - */ 3515 - for_each_online_cpu(cpu) 3516 - if (llc_weight(cpu) != numa_weight(cpu)) 3517 - return true; 3518 - 3519 - return false; 3520 - } 3521 - 3522 - /* 3523 - * Initialize topology-aware scheduling. 3524 - * 3525 - * Detect if the system has multiple LLC or multiple NUMA domains and enable 3526 - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle 3527 - * selection policy. 3528 - * 3529 - * Assumption: the kernel's internal topology representation assumes that each 3530 - * CPU belongs to a single LLC domain, and that each LLC domain is entirely 3531 - * contained within a single NUMA node. 3532 - */ 3533 - static void update_selcpu_topology(void) 3534 - { 3535 - bool enable_llc = false, enable_numa = false; 3536 - unsigned int nr_cpus; 3537 - s32 cpu = cpumask_first(cpu_online_mask); 3538 - 3539 - /* 3540 - * Enable LLC domain optimization only when there are multiple LLC 3541 - * domains among the online CPUs. If all online CPUs are part of a 3542 - * single LLC domain, the idle CPU selection logic can choose any 3543 - * online CPU without bias. 3544 - * 3545 - * Note that it is sufficient to check the LLC domain of the first 3546 - * online CPU to determine whether a single LLC domain includes all 3547 - * CPUs. 
3548 - */ 3549 - rcu_read_lock(); 3550 - nr_cpus = llc_weight(cpu); 3551 - if (nr_cpus > 0) { 3552 - if (nr_cpus < num_online_cpus()) 3553 - enable_llc = true; 3554 - pr_debug("sched_ext: LLC=%*pb weight=%u\n", 3555 - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); 3556 - } 3557 - 3558 - /* 3559 - * Enable NUMA optimization only when there are multiple NUMA domains 3560 - * among the online CPUs and the NUMA domains don't perfectly overlaps 3561 - * with the LLC domains. 3562 - * 3563 - * If all CPUs belong to the same NUMA node and the same LLC domain, 3564 - * enabling both NUMA and LLC optimizations is unnecessary, as checking 3565 - * for an idle CPU in the same domain twice is redundant. 3566 - */ 3567 - nr_cpus = numa_weight(cpu); 3568 - if (nr_cpus > 0) { 3569 - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) 3570 - enable_numa = true; 3571 - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", 3572 - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); 3573 - } 3574 - rcu_read_unlock(); 3575 - 3576 - pr_debug("sched_ext: LLC idle selection %s\n", 3577 - str_enabled_disabled(enable_llc)); 3578 - pr_debug("sched_ext: NUMA idle selection %s\n", 3579 - str_enabled_disabled(enable_numa)); 3580 - 3581 - if (enable_llc) 3582 - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); 3583 - else 3584 - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); 3585 - if (enable_numa) 3586 - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); 3587 - else 3588 - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); 3589 - } 3590 - 3591 - /* 3592 - * Built-in CPU idle selection policy: 3593 - * 3594 - * 1. Prioritize full-idle cores: 3595 - * - always prioritize CPUs from fully idle cores (both logical CPUs are 3596 - * idle) to avoid interference caused by SMT. 3597 - * 3598 - * 2. Reuse the same CPU: 3599 - * - prefer the last used CPU to take advantage of cached data (L1, L2) and 3600 - * branch prediction optimizations. 3601 - * 3602 - * 3. 
Pick a CPU within the same LLC (Last-Level Cache): 3603 - * - if the above conditions aren't met, pick a CPU that shares the same LLC 3604 - * to maintain cache locality. 3605 - * 3606 - * 4. Pick a CPU within the same NUMA node, if enabled: 3607 - * - choose a CPU from the same NUMA node to reduce memory access latency. 3608 - * 3609 - * 5. Pick any idle CPU usable by the task. 3610 - * 3611 - * Step 3 and 4 are performed only if the system has, respectively, multiple 3612 - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and 3613 - * scx_selcpu_topo_numa). 3614 - * 3615 - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because 3616 - * we never call ops.select_cpu() for them, see select_task_rq(). 3617 - */ 3618 - static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 3619 - u64 wake_flags, bool *found) 3620 - { 3621 - const struct cpumask *llc_cpus = NULL; 3622 - const struct cpumask *numa_cpus = NULL; 3623 - s32 cpu; 3624 - 3625 - *found = false; 3626 - 3627 - /* 3628 - * This is necessary to protect llc_cpus. 3629 - */ 3630 - rcu_read_lock(); 3631 - 3632 - /* 3633 - * Determine the scheduling domain only if the task is allowed to run 3634 - * on all CPUs. 3635 - * 3636 - * This is done primarily for efficiency, as it avoids the overhead of 3637 - * updating a cpumask every time we need to select an idle CPU (which 3638 - * can be costly in large SMP systems), but it also aligns logically: 3639 - * if a task's scheduling domain is restricted by user-space (through 3640 - * CPU affinity), the task will simply use the flat scheduling domain 3641 - * defined by user-space. 
3642 - */ 3643 - if (p->nr_cpus_allowed >= num_possible_cpus()) { 3644 - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) 3645 - numa_cpus = numa_span(prev_cpu); 3646 - 3647 - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) 3648 - llc_cpus = llc_span(prev_cpu); 3649 - } 3650 - 3651 - /* 3652 - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. 3653 - */ 3654 - if (wake_flags & SCX_WAKE_SYNC) { 3655 - cpu = smp_processor_id(); 3656 - 3657 - /* 3658 - * If the waker's CPU is cache affine and prev_cpu is idle, 3659 - * then avoid a migration. 3660 - */ 3661 - if (cpus_share_cache(cpu, prev_cpu) && 3662 - test_and_clear_cpu_idle(prev_cpu)) { 3663 - cpu = prev_cpu; 3664 - goto cpu_found; 3665 - } 3666 - 3667 - /* 3668 - * If the waker's local DSQ is empty, and the system is under 3669 - * utilized, try to wake up @p to the local DSQ of the waker. 3670 - * 3671 - * Checking only for an empty local DSQ is insufficient as it 3672 - * could give the wakee an unfair advantage when the system is 3673 - * oversaturated. 3674 - * 3675 - * Checking only for the presence of idle CPUs is also 3676 - * insufficient as the local DSQ of the waker could have tasks 3677 - * piled up on it even if there is an idle core elsewhere on 3678 - * the system. 3679 - */ 3680 - if (!cpumask_empty(idle_masks.cpu) && 3681 - !(current->flags & PF_EXITING) && 3682 - cpu_rq(cpu)->scx.local_dsq.nr == 0) { 3683 - if (cpumask_test_cpu(cpu, p->cpus_ptr)) 3684 - goto cpu_found; 3685 - } 3686 - } 3687 - 3688 - /* 3689 - * If CPU has SMT, any wholly idle CPU is likely a better pick than 3690 - * partially idle @prev_cpu. 3691 - */ 3692 - if (sched_smt_active()) { 3693 - /* 3694 - * Keep using @prev_cpu if it's part of a fully idle core. 
3695 - */ 3696 - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && 3697 - test_and_clear_cpu_idle(prev_cpu)) { 3698 - cpu = prev_cpu; 3699 - goto cpu_found; 3700 - } 3701 - 3702 - /* 3703 - * Search for any fully idle core in the same LLC domain. 3704 - */ 3705 - if (llc_cpus) { 3706 - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); 3707 - if (cpu >= 0) 3708 - goto cpu_found; 3709 - } 3710 - 3711 - /* 3712 - * Search for any fully idle core in the same NUMA node. 3713 - */ 3714 - if (numa_cpus) { 3715 - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); 3716 - if (cpu >= 0) 3717 - goto cpu_found; 3718 - } 3719 - 3720 - /* 3721 - * Search for any full idle core usable by the task. 3722 - */ 3723 - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); 3724 - if (cpu >= 0) 3725 - goto cpu_found; 3726 - } 3727 - 3728 - /* 3729 - * Use @prev_cpu if it's idle. 3730 - */ 3731 - if (test_and_clear_cpu_idle(prev_cpu)) { 3732 - cpu = prev_cpu; 3733 - goto cpu_found; 3734 - } 3735 - 3736 - /* 3737 - * Search for any idle CPU in the same LLC domain. 3738 - */ 3739 - if (llc_cpus) { 3740 - cpu = scx_pick_idle_cpu(llc_cpus, 0); 3741 - if (cpu >= 0) 3742 - goto cpu_found; 3743 - } 3744 - 3745 - /* 3746 - * Search for any idle CPU in the same NUMA node. 3747 - */ 3748 - if (numa_cpus) { 3749 - cpu = scx_pick_idle_cpu(numa_cpus, 0); 3750 - if (cpu >= 0) 3751 - goto cpu_found; 3752 - } 3753 - 3754 - /* 3755 - * Search for any idle CPU usable by the task. 
3756 - */ 3757 - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); 3758 - if (cpu >= 0) 3759 - goto cpu_found; 3760 - 3761 - rcu_read_unlock(); 3762 - return prev_cpu; 3763 - 3764 - cpu_found: 3765 - rcu_read_unlock(); 3766 - 3767 - *found = true; 3768 - return cpu; 3769 - } 3770 - 3771 3223 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) 3772 3224 { 3225 + bool rq_bypass; 3226 + 3773 3227 /* 3774 3228 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it 3775 3229 * can be a good migration opportunity with low cache and memory ··· 3375 3645 if (unlikely(wake_flags & WF_EXEC)) 3376 3646 return prev_cpu; 3377 3647 3378 - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { 3648 + rq_bypass = scx_rq_bypassing(task_rq(p)); 3649 + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { 3379 3650 s32 cpu; 3380 3651 struct task_struct **ddsp_taskp; 3381 3652 ··· 3386 3655 3387 3656 cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, 3388 3657 select_cpu, p, prev_cpu, wake_flags); 3658 + p->scx.selected_cpu = cpu; 3389 3659 *ddsp_taskp = NULL; 3390 3660 if (ops_cpu_valid(cpu, "from ops.select_cpu()")) 3391 3661 return cpu; 3392 3662 else 3393 3663 return prev_cpu; 3394 3664 } else { 3395 - bool found; 3396 3665 s32 cpu; 3397 3666 3398 - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); 3399 - if (found) { 3667 + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); 3668 + if (cpu >= 0) { 3400 3669 p->scx.slice = SCX_SLICE_DFL; 3401 3670 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; 3671 + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); 3672 + } else { 3673 + cpu = prev_cpu; 3402 3674 } 3675 + p->scx.selected_cpu = cpu; 3676 + 3677 + if (rq_bypass) 3678 + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); 3403 3679 return cpu; 3404 3680 } 3405 3681 } ··· 3434 3696 (struct cpumask *)p->cpus_ptr); 3435 3697 } 3436 3698 3437 - static void reset_idle_masks(void) 3438 - { 3439 - /* 3440 - * Consider all online cpus idle. 
Should converge to the actual state 3441 - * quickly. 3442 - */ 3443 - cpumask_copy(idle_masks.cpu, cpu_online_mask); 3444 - cpumask_copy(idle_masks.smt, cpu_online_mask); 3445 - } 3446 - 3447 - static void update_builtin_idle(int cpu, bool idle) 3448 - { 3449 - assign_cpu(cpu, idle_masks.cpu, idle); 3450 - 3451 - #ifdef CONFIG_SCHED_SMT 3452 - if (sched_smt_active()) { 3453 - const struct cpumask *smt = cpu_smt_mask(cpu); 3454 - 3455 - if (idle) { 3456 - /* 3457 - * idle_masks.smt handling is racy but that's fine as 3458 - * it's only for optimization and self-correcting. 3459 - */ 3460 - if (!cpumask_subset(smt, idle_masks.cpu)) 3461 - return; 3462 - cpumask_or(idle_masks.smt, idle_masks.smt, smt); 3463 - } else { 3464 - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); 3465 - } 3466 - } 3467 - #endif 3468 - } 3469 - 3470 - /* 3471 - * Update the idle state of a CPU to @idle. 3472 - * 3473 - * If @do_notify is true, ops.update_idle() is invoked to notify the scx 3474 - * scheduler of an actual idle state transition (idle to busy or vice 3475 - * versa). If @do_notify is false, only the idle state in the idle masks is 3476 - * refreshed without invoking ops.update_idle(). 3477 - * 3478 - * This distinction is necessary, because an idle CPU can be "reserved" and 3479 - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as 3480 - * busy even if no tasks are dispatched. In this case, the CPU may return 3481 - * to idle without a true state transition. Refreshing the idle masks 3482 - * without invoking ops.update_idle() ensures accurate idle state tracking 3483 - * while avoiding unnecessary updates and maintaining balanced state 3484 - * transitions. 3485 - */ 3486 - void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) 3487 - { 3488 - int cpu = cpu_of(rq); 3489 - 3490 - lockdep_assert_rq_held(rq); 3491 - 3492 - /* 3493 - * Trigger ops.update_idle() only when transitioning from a task to 3494 - * the idle thread and vice versa. 
3495 - * 3496 - * Idle transitions are indicated by do_notify being set to true, 3497 - * managed by put_prev_task_idle()/set_next_task_idle(). 3498 - */ 3499 - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) 3500 - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); 3501 - 3502 - /* 3503 - * Update the idle masks: 3504 - * - for real idle transitions (do_notify == true) 3505 - * - for idle-to-idle transitions (indicated by the previous task 3506 - * being the idle thread, managed by pick_task_idle()) 3507 - * 3508 - * Skip updating idle masks if the previous task is not the idle 3509 - * thread, since set_next_task_idle() has already handled it when 3510 - * transitioning from a task to the idle thread (calling this 3511 - * function with do_notify == true). 3512 - * 3513 - * In this way we can avoid updating the idle masks twice, 3514 - * unnecessarily. 3515 - */ 3516 - if (static_branch_likely(&scx_builtin_idle_enabled)) 3517 - if (do_notify || is_idle_task(rq->curr)) 3518 - update_builtin_idle(cpu, idle); 3519 - } 3520 - 3521 3699 static void handle_hotplug(struct rq *rq, bool online) 3522 3700 { 3523 3701 int cpu = cpu_of(rq); ··· 3441 3787 atomic_long_inc(&scx_hotplug_seq); 3442 3788 3443 3789 if (scx_enabled()) 3444 - update_selcpu_topology(); 3790 + scx_idle_update_selcpu_topology(&scx_ops); 3445 3791 3446 3792 if (online && SCX_HAS_OP(cpu_online)) 3447 3793 SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); ··· 3472 3818 { 3473 3819 rq->scx.flags &= ~SCX_RQ_ONLINE; 3474 3820 } 3475 - 3476 - #else /* CONFIG_SMP */ 3477 - 3478 - static bool test_and_clear_cpu_idle(int cpu) { return false; } 3479 - static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } 3480 - static void reset_idle_masks(void) {} 3481 3821 3482 3822 #endif /* CONFIG_SMP */ 3483 3823 ··· 4397 4749 } 4398 4750 SCX_ATTR(ops); 4399 4751 4752 + #define scx_attr_event_show(buf, at, events, kind) ({ \ 4753 + sysfs_emit_at(buf, at, "%s 
%llu\n", #kind, (events)->kind); \ 4754 + }) 4755 + 4756 + static ssize_t scx_attr_events_show(struct kobject *kobj, 4757 + struct kobj_attribute *ka, char *buf) 4758 + { 4759 + struct scx_event_stats events; 4760 + int at = 0; 4761 + 4762 + scx_bpf_events(&events, sizeof(events)); 4763 + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); 4764 + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 4765 + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); 4766 + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); 4767 + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 4768 + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); 4769 + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); 4770 + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); 4771 + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); 4772 + return at; 4773 + } 4774 + SCX_ATTR(events); 4775 + 4400 4776 static struct attribute *scx_sched_attrs[] = { 4401 4777 &scx_attr_ops.attr, 4778 + &scx_attr_events.attr, 4402 4779 NULL, 4403 4780 }; 4404 4781 ATTRIBUTE_GROUPS(scx_sched); ··· 4535 4862 static void scx_ops_bypass(bool bypass) 4536 4863 { 4537 4864 static DEFINE_RAW_SPINLOCK(bypass_lock); 4865 + static unsigned long bypass_timestamp; 4866 + 4538 4867 int cpu; 4539 4868 unsigned long flags; 4540 4869 ··· 4546 4871 WARN_ON_ONCE(scx_ops_bypass_depth <= 0); 4547 4872 if (scx_ops_bypass_depth != 1) 4548 4873 goto unlock; 4874 + bypass_timestamp = ktime_get_ns(); 4875 + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); 4549 4876 } else { 4550 4877 scx_ops_bypass_depth--; 4551 4878 WARN_ON_ONCE(scx_ops_bypass_depth < 0); 4552 4879 if (scx_ops_bypass_depth != 0) 4553 4880 goto unlock; 4881 + scx_add_event(SCX_EV_BYPASS_DURATION, 4882 + ktime_get_ns() - bypass_timestamp); 4554 4883 } 4555 4884 4556 4885 
atomic_inc(&scx_ops_breather_depth); ··· 4774 5095 static_branch_disable(&__scx_ops_enabled); 4775 5096 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 4776 5097 static_branch_disable(&scx_has_op[i]); 5098 + static_branch_disable(&scx_ops_allow_queued_wakeup); 4777 5099 static_branch_disable(&scx_ops_enq_last); 4778 5100 static_branch_disable(&scx_ops_enq_exiting); 4779 5101 static_branch_disable(&scx_ops_enq_migration_disabled); 4780 5102 static_branch_disable(&scx_ops_cpu_preempt); 4781 - static_branch_disable(&scx_builtin_idle_enabled); 5103 + scx_idle_disable(); 4782 5104 synchronize_rcu(); 4783 5105 4784 5106 if (ei->kind >= SCX_EXIT_ERROR) { ··· 5029 5349 .at_jiffies = jiffies, 5030 5350 }; 5031 5351 struct seq_buf s; 5352 + struct scx_event_stats events; 5032 5353 unsigned long flags; 5033 5354 char *buf; 5034 5355 int cpu; ··· 5138 5457 rq_unlock(rq, &rf); 5139 5458 } 5140 5459 5460 + dump_newline(&s); 5461 + dump_line(&s, "Event counters"); 5462 + dump_line(&s, "--------------"); 5463 + 5464 + scx_bpf_events(&events, sizeof(events)); 5465 + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); 5466 + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 5467 + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); 5468 + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); 5469 + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 5470 + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); 5471 + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); 5472 + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); 5473 + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); 5474 + 5141 5475 if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) 5142 5476 memcpy(ei->dump + dump_len - sizeof(trunc_marker), 5143 5477 trunc_marker, sizeof(trunc_marker)); ··· 5242 5546 return -EINVAL; 5243 5547 } 5244 5548 5549 + /* 5550 + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle 5551 + * selection policy to be enabled. 
5552 + */ 5553 + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && 5554 + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { 5555 + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); 5556 + return -EINVAL; 5557 + } 5558 + 5245 5559 return 0; 5246 5560 } 5247 5561 ··· 5269 5563 } 5270 5564 5271 5565 mutex_lock(&scx_ops_enable_mutex); 5566 + 5567 + /* 5568 + * Clear event counters so a new scx scheduler gets 5569 + * fresh event counter values. 5570 + */ 5571 + for_each_possible_cpu(cpu) { 5572 + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); 5573 + memset(e, 0, sizeof(*e)); 5574 + } 5272 5575 5273 5576 if (!scx_ops_helper) { 5274 5577 WRITE_ONCE(scx_ops_helper, ··· 5376 5661 static_branch_enable_cpuslocked(&scx_has_op[i]); 5377 5662 5378 5663 check_hotplug_seq(ops); 5379 - #ifdef CONFIG_SMP 5380 - update_selcpu_topology(); 5381 - #endif 5664 + scx_idle_update_selcpu_topology(ops); 5665 + 5382 5666 cpus_read_unlock(); 5383 5667 5384 5668 ret = validate_ops(ops); ··· 5416 5702 if (((void (**)(void))ops)[i]) 5417 5703 static_branch_enable(&scx_has_op[i]); 5418 5704 5705 + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) 5706 + static_branch_enable(&scx_ops_allow_queued_wakeup); 5419 5707 if (ops->flags & SCX_OPS_ENQ_LAST) 5420 5708 static_branch_enable(&scx_ops_enq_last); 5421 - 5422 5709 if (ops->flags & SCX_OPS_ENQ_EXITING) 5423 5710 static_branch_enable(&scx_ops_enq_exiting); 5424 5711 if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) ··· 5427 5712 if (scx_ops.cpu_acquire || scx_ops.cpu_release) 5428 5713 static_branch_enable(&scx_ops_cpu_preempt); 5429 5714 5430 - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 5431 - reset_idle_masks(); 5432 - static_branch_enable(&scx_builtin_idle_enabled); 5433 - } else { 5434 - static_branch_disable(&scx_builtin_idle_enabled); 5435 - } 5715 + scx_idle_enable(ops); 5436 5716 5437 5717 /* 5438 5718 * Lock out forks, cgroup on/offlining and moves 
before opening the ··· 6066 6356 SCX_TG_ONLINE); 6067 6357 6068 6358 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 6069 - #ifdef CONFIG_SMP 6070 - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 6071 - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); 6072 - #endif 6359 + scx_idle_init_masks(); 6360 + 6073 6361 scx_kick_cpus_pnt_seqs = 6074 6362 __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, 6075 6363 __alignof__(scx_kick_cpus_pnt_seqs[0])); ··· 6075 6367 6076 6368 for_each_possible_cpu(cpu) { 6077 6369 struct rq *rq = cpu_rq(cpu); 6370 + int n = cpu_to_node(cpu); 6078 6371 6079 6372 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); 6080 6373 INIT_LIST_HEAD(&rq->scx.runnable_list); 6081 6374 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); 6082 6375 6083 - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); 6084 - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); 6085 - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); 6086 - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); 6376 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); 6377 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 6378 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 6379 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 6087 6380 init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); 6088 6381 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); 6089 6382 ··· 6101 6392 /******************************************************************************** 6102 6393 * Helpers that can be called from the BPF scheduler. 
6103 6394 */ 6104 - #include <linux/btf_ids.h> 6105 - 6106 - __bpf_kfunc_start_defs(); 6107 - 6108 - static bool check_builtin_idle_enabled(void) 6109 - { 6110 - if (static_branch_likely(&scx_builtin_idle_enabled)) 6111 - return true; 6112 - 6113 - scx_ops_error("built-in idle tracking is disabled"); 6114 - return false; 6115 - } 6116 - 6117 - /** 6118 - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() 6119 - * @p: task_struct to select a CPU for 6120 - * @prev_cpu: CPU @p was on previously 6121 - * @wake_flags: %SCX_WAKE_* flags 6122 - * @is_idle: out parameter indicating whether the returned CPU is idle 6123 - * 6124 - * Can only be called from ops.select_cpu() if the built-in CPU selection is 6125 - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 6126 - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 6127 - * 6128 - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is 6129 - * currently idle and thus a good candidate for direct dispatching. 
6130 - */ 6131 - __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 6132 - u64 wake_flags, bool *is_idle) 6133 - { 6134 - if (!ops_cpu_valid(prev_cpu, NULL)) 6135 - goto prev_cpu; 6136 - 6137 - if (!check_builtin_idle_enabled()) 6138 - goto prev_cpu; 6139 - 6140 - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) 6141 - goto prev_cpu; 6142 - 6143 - #ifdef CONFIG_SMP 6144 - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); 6145 - #endif 6146 - 6147 - prev_cpu: 6148 - *is_idle = false; 6149 - return prev_cpu; 6150 - } 6151 - 6152 - __bpf_kfunc_end_defs(); 6153 - 6154 - BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) 6155 - BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) 6156 - BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) 6157 - 6158 - static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { 6159 - .owner = THIS_MODULE, 6160 - .set = &scx_kfunc_ids_select_cpu, 6161 - }; 6162 - 6163 6395 static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) 6164 6396 { 6165 6397 if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) ··· 6622 6972 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to 6623 6973 * the current local DSQ for running tasks and thus are not 6624 6974 * visible to the BPF scheduler. 6975 + * 6976 + * Also skip re-enqueueing tasks that can only run on this 6977 + * CPU, as they would just be re-added to the same local 6978 + * DSQ without any benefit. 6625 6979 */ 6626 - if (p->migration_pending) 6980 + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) 6627 6981 continue; 6628 6982 6629 6983 dispatch_dequeue(rq, p); ··· 7124 7470 } 7125 7471 7126 7472 /** 7473 + * scx_bpf_nr_node_ids - Return the number of possible node IDs 7474 + * 7475 + * All valid node IDs in the system are smaller than the returned value. 
7476 + */ 7477 + __bpf_kfunc u32 scx_bpf_nr_node_ids(void) 7478 + { 7479 + return nr_node_ids; 7480 + } 7481 + 7482 + /** 7127 7483 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs 7128 7484 * 7129 7485 * All valid CPU IDs in the system are smaller than the returned value. ··· 7171 7507 * is never released. The acquire / release semantics here are just used 7172 7508 * to make the cpumask is a trusted pointer in the caller. 7173 7509 */ 7174 - } 7175 - 7176 - /** 7177 - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking 7178 - * per-CPU cpumask. 7179 - * 7180 - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 7181 - */ 7182 - __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) 7183 - { 7184 - if (!check_builtin_idle_enabled()) 7185 - return cpu_none_mask; 7186 - 7187 - #ifdef CONFIG_SMP 7188 - return idle_masks.cpu; 7189 - #else 7190 - return cpu_none_mask; 7191 - #endif 7192 - } 7193 - 7194 - /** 7195 - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, 7196 - * per-physical-core cpumask. Can be used to determine if an entire physical 7197 - * core is free. 7198 - * 7199 - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 7200 - */ 7201 - __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) 7202 - { 7203 - if (!check_builtin_idle_enabled()) 7204 - return cpu_none_mask; 7205 - 7206 - #ifdef CONFIG_SMP 7207 - if (sched_smt_active()) 7208 - return idle_masks.smt; 7209 - else 7210 - return idle_masks.cpu; 7211 - #else 7212 - return cpu_none_mask; 7213 - #endif 7214 - } 7215 - 7216 - /** 7217 - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to 7218 - * either the percpu, or SMT idle-tracking cpumask. 
7219 - * @idle_mask: &cpumask to use 7220 - */ 7221 - __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) 7222 - { 7223 - /* 7224 - * Empty function body because we aren't actually acquiring or releasing 7225 - * a reference to a global idle cpumask, which is read-only in the 7226 - * caller and is never released. The acquire / release semantics here 7227 - * are just used to make the cpumask a trusted pointer in the caller. 7228 - */ 7229 - } 7230 - 7231 - /** 7232 - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state 7233 - * @cpu: cpu to test and clear idle for 7234 - * 7235 - * Returns %true if @cpu was idle and its idle state was successfully cleared. 7236 - * %false otherwise. 7237 - * 7238 - * Unavailable if ops.update_idle() is implemented and 7239 - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 7240 - */ 7241 - __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) 7242 - { 7243 - if (!check_builtin_idle_enabled()) 7244 - return false; 7245 - 7246 - if (ops_cpu_valid(cpu, NULL)) 7247 - return test_and_clear_cpu_idle(cpu); 7248 - else 7249 - return false; 7250 - } 7251 - 7252 - /** 7253 - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu 7254 - * @cpus_allowed: Allowed cpumask 7255 - * @flags: %SCX_PICK_IDLE_CPU_* flags 7256 - * 7257 - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu 7258 - * number on success. -%EBUSY if no matching cpu was found. 7259 - * 7260 - * Idle CPU tracking may race against CPU scheduling state transitions. For 7261 - * example, this function may return -%EBUSY as CPUs are transitioning into the 7262 - * idle state. If the caller then assumes that there will be dispatch events on 7263 - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs 7264 - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and 7265 - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch 7266 - * event in the near future. 
7267 - * 7268 - * Unavailable if ops.update_idle() is implemented and 7269 - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 7270 - */ 7271 - __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, 7272 - u64 flags) 7273 - { 7274 - if (!check_builtin_idle_enabled()) 7275 - return -EBUSY; 7276 - 7277 - return scx_pick_idle_cpu(cpus_allowed, flags); 7278 - } 7279 - 7280 - /** 7281 - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU 7282 - * @cpus_allowed: Allowed cpumask 7283 - * @flags: %SCX_PICK_IDLE_CPU_* flags 7284 - * 7285 - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any 7286 - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu 7287 - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is 7288 - * empty. 7289 - * 7290 - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not 7291 - * set, this function can't tell which CPUs are idle and will always pick any 7292 - * CPU. 
7293 - */ 7294 - __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, 7295 - u64 flags) 7296 - { 7297 - s32 cpu; 7298 - 7299 - if (static_branch_likely(&scx_builtin_idle_enabled)) { 7300 - cpu = scx_pick_idle_cpu(cpus_allowed, flags); 7301 - if (cpu >= 0) 7302 - return cpu; 7303 - } 7304 - 7305 - cpu = cpumask_any_distribute(cpus_allowed); 7306 - if (cpu < nr_cpu_ids) 7307 - return cpu; 7308 - else 7309 - return -EBUSY; 7310 7510 } 7311 7511 7312 7512 /** ··· 7293 7765 return clock; 7294 7766 } 7295 7767 7768 + /* 7769 + * scx_bpf_events - Get system-wide event counters 7770 + * @events: output buffer from a BPF program 7771 + * @events__sz: @events len, must end in '__sz' for the verifier 7772 + */ 7773 + __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, 7774 + size_t events__sz) 7775 + { 7776 + struct scx_event_stats e_sys, *e_cpu; 7777 + int cpu; 7778 + 7779 + /* Aggregate per-CPU event counters into the system-wide counters. */ 7780 + memset(&e_sys, 0, sizeof(e_sys)); 7781 + for_each_possible_cpu(cpu) { 7782 + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); 7783 + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); 7784 + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); 7785 + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); 7786 + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); 7787 + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); 7788 + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); 7789 + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); 7790 + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); 7791 + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); 7792 + } 7793 + 7794 + /* 7795 + * We cannot entirely trust a BPF-provided size since a BPF program 7796 + * might be compiled against a different vmlinux.h, of which 7797 + * scx_event_stats would be larger (a newer vmlinux.h) or smaller 7798 + * (an older vmlinux.h). 
Hence, we use the smaller size to avoid 7799 + * memory corruption. 7800 + */ 7801 + events__sz = min(events__sz, sizeof(*events)); 7802 + memcpy(events, &e_sys, events__sz); 7803 + } 7804 + 7296 7805 __bpf_kfunc_end_defs(); 7297 7806 7298 7807 BTF_KFUNCS_START(scx_kfunc_ids_any) ··· 7345 7780 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) 7346 7781 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) 7347 7782 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) 7783 + BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) 7348 7784 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) 7349 7785 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) 7350 7786 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) ··· 7363 7797 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) 7364 7798 #endif 7365 7799 BTF_ID_FLAGS(func, scx_bpf_now) 7800 + BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) 7366 7801 BTF_KFUNCS_END(scx_kfunc_ids_any) 7367 7802 7368 7803 static const struct btf_kfunc_id_set scx_kfunc_set_any = { ··· 7387 7820 * check using scx_kf_allowed(). 7388 7821 */ 7389 7822 if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 7390 - &scx_kfunc_set_select_cpu)) || 7391 - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 7392 7823 &scx_kfunc_set_enqueue_dispatch)) || 7393 7824 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 7394 7825 &scx_kfunc_set_dispatch)) || ··· 7403 7838 (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, 7404 7839 &scx_kfunc_set_any))) { 7405 7840 pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); 7841 + return ret; 7842 + } 7843 + 7844 + ret = scx_idle_init(); 7845 + if (ret) { 7846 + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); 7406 7847 return ret; 7407 7848 } 7408 7849

+10

kernel/sched/ext.h

··· 8 8 */ 9 9 #ifdef CONFIG_SCHED_CLASS_EXT 10 10 11 + DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); 12 + 11 13 void scx_tick(struct rq *rq); 12 14 void init_scx_entity(struct sched_ext_entity *scx); 13 15 void scx_pre_fork(struct task_struct *p); ··· 36 34 return scx_enabled() && p->sched_class == &ext_sched_class; 37 35 } 38 36 37 + static inline bool scx_allow_ttwu_queue(const struct task_struct *p) 38 + { 39 + return !scx_enabled() || 40 + static_branch_likely(&scx_ops_allow_queued_wakeup) || 41 + p->sched_class != &ext_sched_class; 42 + } 43 + 39 44 #ifdef CONFIG_SCHED_CORE 40 45 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, 41 46 bool in_fi); ··· 61 52 static inline void scx_rq_deactivate(struct rq *rq) {} 62 53 static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } 63 54 static inline bool task_on_scx(const struct task_struct *p) { return false; } 55 + static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } 64 56 static inline void init_sched_ext_class(void) {} 65 57 66 58 #endif /* CONFIG_SCHED_CLASS_EXT */

+1171

kernel/sched/ext_idle.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 + * 5 + * Built-in idle CPU tracking policy. 6 + * 7 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 8 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 9 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 10 + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> 11 + */ 12 + #include "ext_idle.h" 13 + 14 + /* Enable/disable built-in idle CPU selection policy */ 15 + static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); 16 + 17 + /* Enable/disable per-node idle cpumasks */ 18 + static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); 19 + 20 + #ifdef CONFIG_SMP 21 + /* Enable/disable LLC aware optimizations */ 22 + static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); 23 + 24 + /* Enable/disable NUMA aware optimizations */ 25 + static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); 26 + 27 + /* 28 + * cpumasks to track idle CPUs within each NUMA node. 29 + * 30 + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask 31 + * is used to track all the idle CPUs in the system. 32 + */ 33 + struct scx_idle_cpus { 34 + cpumask_var_t cpu; 35 + cpumask_var_t smt; 36 + }; 37 + 38 + /* 39 + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE 40 + * is not enabled). 41 + */ 42 + static struct scx_idle_cpus scx_idle_global_masks; 43 + 44 + /* 45 + * Per-node idle cpumasks. 46 + */ 47 + static struct scx_idle_cpus **scx_idle_node_masks; 48 + 49 + /* 50 + * Return the idle masks associated to a target @node. 51 + * 52 + * NUMA_NO_NODE identifies the global idle cpumask. 53 + */ 54 + static struct scx_idle_cpus *idle_cpumask(int node) 55 + { 56 + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; 57 + } 58 + 59 + /* 60 + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if 61 + * per-node idle cpumasks are disabled. 
62 + */ 63 + static int scx_cpu_node_if_enabled(int cpu) 64 + { 65 + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) 66 + return NUMA_NO_NODE; 67 + 68 + return cpu_to_node(cpu); 69 + } 70 + 71 + bool scx_idle_test_and_clear_cpu(int cpu) 72 + { 73 + int node = scx_cpu_node_if_enabled(cpu); 74 + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; 75 + 76 + #ifdef CONFIG_SCHED_SMT 77 + /* 78 + * SMT mask should be cleared whether we can claim @cpu or not. The SMT 79 + * cluster is not wholly idle either way. This also prevents 80 + * scx_pick_idle_cpu() from getting caught in an infinite loop. 81 + */ 82 + if (sched_smt_active()) { 83 + const struct cpumask *smt = cpu_smt_mask(cpu); 84 + struct cpumask *idle_smts = idle_cpumask(node)->smt; 85 + 86 + /* 87 + * If offline, @cpu is not its own sibling and 88 + * scx_pick_idle_cpu() can get caught in an infinite loop as 89 + * @cpu is never cleared from the idle SMT mask. Ensure that 90 + * @cpu is eventually cleared. 91 + * 92 + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to 93 + * reduce memory writes, which may help alleviate cache 94 + * coherence pressure. 95 + */ 96 + if (cpumask_intersects(smt, idle_smts)) 97 + cpumask_andnot(idle_smts, idle_smts, smt); 98 + else if (cpumask_test_cpu(cpu, idle_smts)) 99 + __cpumask_clear_cpu(cpu, idle_smts); 100 + } 101 + #endif 102 + 103 + return cpumask_test_and_clear_cpu(cpu, idle_cpus); 104 + } 105 + 106 + /* 107 + * Pick an idle CPU in a specific NUMA node. 
108 + */ 109 + static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) 110 + { 111 + int cpu; 112 + 113 + retry: 114 + if (sched_smt_active()) { 115 + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); 116 + if (cpu < nr_cpu_ids) 117 + goto found; 118 + 119 + if (flags & SCX_PICK_IDLE_CORE) 120 + return -EBUSY; 121 + } 122 + 123 + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); 124 + if (cpu >= nr_cpu_ids) 125 + return -EBUSY; 126 + 127 + found: 128 + if (scx_idle_test_and_clear_cpu(cpu)) 129 + return cpu; 130 + else 131 + goto retry; 132 + } 133 + 134 + /* 135 + * Tracks nodes that have not yet been visited when searching for an idle 136 + * CPU across all available nodes. 137 + */ 138 + static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); 139 + 140 + /* 141 + * Search for an idle CPU across all nodes, excluding @node. 142 + */ 143 + static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) 144 + { 145 + nodemask_t *unvisited; 146 + s32 cpu = -EBUSY; 147 + 148 + preempt_disable(); 149 + unvisited = this_cpu_ptr(&per_cpu_unvisited); 150 + 151 + /* 152 + * Restrict the search to the online nodes (excluding the current 153 + * node that has been visited already). 154 + */ 155 + nodes_copy(*unvisited, node_states[N_ONLINE]); 156 + node_clear(node, *unvisited); 157 + 158 + /* 159 + * Traverse all nodes in order of increasing distance, starting 160 + * from @node. 161 + * 162 + * This loop is O(N^2), with N being the amount of NUMA nodes, 163 + * which might be quite expensive in large NUMA systems. However, 164 + * this complexity comes into play only when a scheduler enables 165 + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU 166 + * without specifying a target NUMA node, so it shouldn't be a 167 + * bottleneck in most cases. 
168 + * 169 + * As a future optimization we may want to cache the list of nodes 170 + * in a per-node array, instead of actually traversing them every 171 + * time. 172 + */ 173 + for_each_node_numadist(node, *unvisited) { 174 + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); 175 + if (cpu >= 0) 176 + break; 177 + } 178 + preempt_enable(); 179 + 180 + return cpu; 181 + } 182 + 183 + /* 184 + * Find an idle CPU in the system, starting from @node. 185 + */ 186 + s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) 187 + { 188 + s32 cpu; 189 + 190 + /* 191 + * Always search in the starting node first (this is an 192 + * optimization that can save some cycles even when the search is 193 + * not limited to a single node). 194 + */ 195 + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); 196 + if (cpu >= 0) 197 + return cpu; 198 + 199 + /* 200 + * Stop the search if we are using only a single global cpumask 201 + * (NUMA_NO_NODE) or if the search is restricted to the first node 202 + * only. 203 + */ 204 + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) 205 + return -EBUSY; 206 + 207 + /* 208 + * Extend the search to the other online nodes. 209 + */ 210 + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); 211 + } 212 + 213 + /* 214 + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC 215 + * domain is not defined). 216 + */ 217 + static unsigned int llc_weight(s32 cpu) 218 + { 219 + struct sched_domain *sd; 220 + 221 + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 222 + if (!sd) 223 + return 0; 224 + 225 + return sd->span_weight; 226 + } 227 + 228 + /* 229 + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC 230 + * domain is not defined). 
231 + */ 232 + static struct cpumask *llc_span(s32 cpu) 233 + { 234 + struct sched_domain *sd; 235 + 236 + sd = rcu_dereference(per_cpu(sd_llc, cpu)); 237 + if (!sd) 238 + return NULL; 239 + 240 + return sched_domain_span(sd); 241 + } 242 + 243 + /* 244 + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the 245 + * NUMA domain is not defined). 246 + */ 247 + static unsigned int numa_weight(s32 cpu) 248 + { 249 + struct sched_domain *sd; 250 + struct sched_group *sg; 251 + 252 + sd = rcu_dereference(per_cpu(sd_numa, cpu)); 253 + if (!sd) 254 + return 0; 255 + sg = sd->groups; 256 + if (!sg) 257 + return 0; 258 + 259 + return sg->group_weight; 260 + } 261 + 262 + /* 263 + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA 264 + * domain is not defined). 265 + */ 266 + static struct cpumask *numa_span(s32 cpu) 267 + { 268 + struct sched_domain *sd; 269 + struct sched_group *sg; 270 + 271 + sd = rcu_dereference(per_cpu(sd_numa, cpu)); 272 + if (!sd) 273 + return NULL; 274 + sg = sd->groups; 275 + if (!sg) 276 + return NULL; 277 + 278 + return sched_group_span(sg); 279 + } 280 + 281 + /* 282 + * Return true if the LLC domains do not perfectly overlap with the NUMA 283 + * domains, false otherwise. 284 + */ 285 + static bool llc_numa_mismatch(void) 286 + { 287 + int cpu; 288 + 289 + /* 290 + * We need to scan all online CPUs to verify whether their scheduling 291 + * domains overlap. 292 + * 293 + * While it is rare to encounter architectures with asymmetric NUMA 294 + * topologies, CPU hotplugging or virtualized environments can result 295 + * in asymmetric configurations. 
296 + * 297 + * For example: 298 + * 299 + * NUMA 0: 300 + * - LLC 0: cpu0..cpu7 301 + * - LLC 1: cpu8..cpu15 [offline] 302 + * 303 + * NUMA 1: 304 + * - LLC 0: cpu16..cpu23 305 + * - LLC 1: cpu24..cpu31 306 + * 307 + * In this case, if we only check the first online CPU (cpu0), we might 308 + * incorrectly assume that the LLC and NUMA domains are fully 309 + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC 310 + * domains). 311 + */ 312 + for_each_online_cpu(cpu) 313 + if (llc_weight(cpu) != numa_weight(cpu)) 314 + return true; 315 + 316 + return false; 317 + } 318 + 319 + /* 320 + * Initialize topology-aware scheduling. 321 + * 322 + * Detect if the system has multiple LLC or multiple NUMA domains and enable 323 + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle 324 + * selection policy. 325 + * 326 + * Assumption: the kernel's internal topology representation assumes that each 327 + * CPU belongs to a single LLC domain, and that each LLC domain is entirely 328 + * contained within a single NUMA node. 329 + */ 330 + void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) 331 + { 332 + bool enable_llc = false, enable_numa = false; 333 + unsigned int nr_cpus; 334 + s32 cpu = cpumask_first(cpu_online_mask); 335 + 336 + /* 337 + * Enable LLC domain optimization only when there are multiple LLC 338 + * domains among the online CPUs. If all online CPUs are part of a 339 + * single LLC domain, the idle CPU selection logic can choose any 340 + * online CPU without bias. 341 + * 342 + * Note that it is sufficient to check the LLC domain of the first 343 + * online CPU to determine whether a single LLC domain includes all 344 + * CPUs. 
345 + */ 346 + rcu_read_lock(); 347 + nr_cpus = llc_weight(cpu); 348 + if (nr_cpus > 0) { 349 + if (nr_cpus < num_online_cpus()) 350 + enable_llc = true; 351 + pr_debug("sched_ext: LLC=%*pb weight=%u\n", 352 + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); 353 + } 354 + 355 + /* 356 + * Enable NUMA optimization only when there are multiple NUMA domains 357 + * among the online CPUs and the NUMA domains don't perfectly overlap 358 + * with the LLC domains. 359 + * 360 + * If all CPUs belong to the same NUMA node and the same LLC domain, 361 + * enabling both NUMA and LLC optimizations is unnecessary, as checking 362 + * for an idle CPU in the same domain twice is redundant. 363 + * 364 + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, ignore the NUMA 365 + * optimization, as we would naturally select idle CPUs within 366 + * specific NUMA nodes querying the corresponding per-node cpumask. 367 + */ 368 + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { 369 + nr_cpus = numa_weight(cpu); 370 + if (nr_cpus > 0) { 371 + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) 372 + enable_numa = true; 373 + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", 374 + cpumask_pr_args(numa_span(cpu)), nr_cpus); 375 + } 376 + } 377 + rcu_read_unlock(); 378 + 379 + pr_debug("sched_ext: LLC idle selection %s\n", 380 + str_enabled_disabled(enable_llc)); 381 + pr_debug("sched_ext: NUMA idle selection %s\n", 382 + str_enabled_disabled(enable_numa)); 383 + 384 + if (enable_llc) 385 + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); 386 + else 387 + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); 388 + if (enable_numa) 389 + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); 390 + else 391 + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); 392 + } 393 + 394 + /* 395 + * Built-in CPU idle selection policy: 396 + * 397 + * 1. 
Prioritize full-idle cores: 398 + * - always prioritize CPUs from fully idle cores (both logical CPUs are 399 + * idle) to avoid interference caused by SMT. 400 + * 401 + * 2. Reuse the same CPU: 402 + * - prefer the last used CPU to take advantage of cached data (L1, L2) and 403 + * branch prediction optimizations. 404 + * 405 + * 3. Pick a CPU within the same LLC (Last-Level Cache): 406 + * - if the above conditions aren't met, pick a CPU that shares the same LLC 407 + * to maintain cache locality. 408 + * 409 + * 4. Pick a CPU within the same NUMA node, if enabled: 410 + * - choose a CPU from the same NUMA node to reduce memory access latency. 411 + * 412 + * 5. Pick any idle CPU usable by the task. 413 + * 414 + * Step 3 and 4 are performed only if the system has, respectively, 415 + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and 416 + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. 417 + * 418 + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always 419 + * begin in @prev_cpu's node and proceed to other nodes in order of 420 + * increasing distance. 421 + * 422 + * Return the picked CPU if idle, or a negative value otherwise. 423 + * 424 + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because 425 + * we never call ops.select_cpu() for them, see select_task_rq(). 426 + */ 427 + s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) 428 + { 429 + const struct cpumask *llc_cpus = NULL; 430 + const struct cpumask *numa_cpus = NULL; 431 + int node = scx_cpu_node_if_enabled(prev_cpu); 432 + s32 cpu; 433 + 434 + /* 435 + * This is necessary to protect llc_cpus. 436 + */ 437 + rcu_read_lock(); 438 + 439 + /* 440 + * Determine the scheduling domain only if the task is allowed to run 441 + * on all CPUs. 
442 + * 443 + * This is done primarily for efficiency, as it avoids the overhead of 444 + * updating a cpumask every time we need to select an idle CPU (which 445 + * can be costly in large SMP systems), but it also aligns logically: 446 + * if a task's scheduling domain is restricted by user-space (through 447 + * CPU affinity), the task will simply use the flat scheduling domain 448 + * defined by user-space. 449 + */ 450 + if (p->nr_cpus_allowed >= num_possible_cpus()) { 451 + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) 452 + numa_cpus = numa_span(prev_cpu); 453 + 454 + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) 455 + llc_cpus = llc_span(prev_cpu); 456 + } 457 + 458 + /* 459 + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. 460 + */ 461 + if (wake_flags & SCX_WAKE_SYNC) { 462 + int waker_node; 463 + 464 + /* 465 + * If the waker's CPU is cache affine and prev_cpu is idle, 466 + * then avoid a migration. 467 + */ 468 + cpu = smp_processor_id(); 469 + if (cpus_share_cache(cpu, prev_cpu) && 470 + scx_idle_test_and_clear_cpu(prev_cpu)) { 471 + cpu = prev_cpu; 472 + goto out_unlock; 473 + } 474 + 475 + /* 476 + * If the waker's local DSQ is empty, and the system is under 477 + * utilized, try to wake up @p to the local DSQ of the waker. 478 + * 479 + * Checking only for an empty local DSQ is insufficient as it 480 + * could give the wakee an unfair advantage when the system is 481 + * oversaturated. 482 + * 483 + * Checking only for the presence of idle CPUs is also 484 + * insufficient as the local DSQ of the waker could have tasks 485 + * piled up on it even if there is an idle core elsewhere on 486 + * the system. 
487 + */ 488 + waker_node = cpu_to_node(cpu); 489 + if (!(current->flags & PF_EXITING) && 490 + cpu_rq(cpu)->scx.local_dsq.nr == 0 && 491 + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && 492 + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { 493 + if (cpumask_test_cpu(cpu, p->cpus_ptr)) 494 + goto out_unlock; 495 + } 496 + } 497 + 498 + /* 499 + * If CPU has SMT, any wholly idle CPU is likely a better pick than 500 + * partially idle @prev_cpu. 501 + */ 502 + if (sched_smt_active()) { 503 + /* 504 + * Keep using @prev_cpu if it's part of a fully idle core. 505 + */ 506 + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && 507 + scx_idle_test_and_clear_cpu(prev_cpu)) { 508 + cpu = prev_cpu; 509 + goto out_unlock; 510 + } 511 + 512 + /* 513 + * Search for any fully idle core in the same LLC domain. 514 + */ 515 + if (llc_cpus) { 516 + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); 517 + if (cpu >= 0) 518 + goto out_unlock; 519 + } 520 + 521 + /* 522 + * Search for any fully idle core in the same NUMA node. 523 + */ 524 + if (numa_cpus) { 525 + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); 526 + if (cpu >= 0) 527 + goto out_unlock; 528 + } 529 + 530 + /* 531 + * Search for any full-idle core usable by the task. 532 + * 533 + * If the node-aware idle CPU selection policy is enabled 534 + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always 535 + * begin in prev_cpu's node and proceed to other nodes in 536 + * order of increasing distance. 537 + */ 538 + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); 539 + if (cpu >= 0) 540 + goto out_unlock; 541 + 542 + /* 543 + * Give up if we're strictly looking for a full-idle SMT 544 + * core. No CPU has been claimed at this point, so report failure with -EBUSY instead of returning a non-idle @prev_cpu (callers treat any non-negative return as an idle CPU). 545 + */ 546 + if (flags & SCX_PICK_IDLE_CORE) { 547 + cpu = -EBUSY; 548 + goto out_unlock; 549 + } 550 + } 551 + 552 + /* 553 + * Use @prev_cpu if it's idle. 
554 + */ 555 + if (scx_idle_test_and_clear_cpu(prev_cpu)) { 556 + cpu = prev_cpu; 557 + goto out_unlock; 558 + } 559 + 560 + /* 561 + * Search for any idle CPU in the same LLC domain. 562 + */ 563 + if (llc_cpus) { 564 + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); 565 + if (cpu >= 0) 566 + goto out_unlock; 567 + } 568 + 569 + /* 570 + * Search for any idle CPU in the same NUMA node. 571 + */ 572 + if (numa_cpus) { 573 + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); 574 + if (cpu >= 0) 575 + goto out_unlock; 576 + } 577 + 578 + /* 579 + * Search for any idle CPU usable by the task. 580 + * 581 + * If the node-aware idle CPU selection policy is enabled 582 + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin 583 + * in prev_cpu's node and proceed to other nodes in order of 584 + * increasing distance. 585 + */ 586 + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); 587 + if (cpu >= 0) 588 + goto out_unlock; 589 + 590 + out_unlock: 591 + rcu_read_unlock(); 592 + 593 + return cpu; 594 + } 595 + 596 + /* 597 + * Initialize global and per-node idle cpumasks. 
598 + */ 599 + void scx_idle_init_masks(void) 600 + { 601 + int node; 602 + 603 + /* Allocate global idle cpumasks */ 604 + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); 605 + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); 606 + 607 + /* Allocate per-node idle cpumasks */ 608 + scx_idle_node_masks = kcalloc(num_possible_nodes(), 609 + sizeof(*scx_idle_node_masks), GFP_KERNEL); 610 + BUG_ON(!scx_idle_node_masks); 611 + 612 + for_each_node(node) { 613 + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), 614 + GFP_KERNEL, node); 615 + BUG_ON(!scx_idle_node_masks[node]); 616 + 617 + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); 618 + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); 619 + } 620 + } 621 + 622 + static void update_builtin_idle(int cpu, bool idle) 623 + { 624 + int node = scx_cpu_node_if_enabled(cpu); 625 + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; 626 + 627 + assign_cpu(cpu, idle_cpus, idle); 628 + 629 + #ifdef CONFIG_SCHED_SMT 630 + if (sched_smt_active()) { 631 + const struct cpumask *smt = cpu_smt_mask(cpu); 632 + struct cpumask *idle_smts = idle_cpumask(node)->smt; 633 + 634 + if (idle) { 635 + /* 636 + * idle_smt handling is racy but that's fine as it's 637 + * only for optimization and self-correcting. 638 + */ 639 + if (!cpumask_subset(smt, idle_cpus)) 640 + return; 641 + cpumask_or(idle_smts, idle_smts, smt); 642 + } else { 643 + cpumask_andnot(idle_smts, idle_smts, smt); 644 + } 645 + } 646 + #endif 647 + } 648 + 649 + /* 650 + * Update the idle state of a CPU to @idle. 651 + * 652 + * If @do_notify is true, ops.update_idle() is invoked to notify the scx 653 + * scheduler of an actual idle state transition (idle to busy or vice 654 + * versa). If @do_notify is false, only the idle state in the idle masks is 655 + * refreshed without invoking ops.update_idle(). 
656 + * 657 + * This distinction is necessary, because an idle CPU can be "reserved" and 658 + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as 659 + * busy even if no tasks are dispatched. In this case, the CPU may return 660 + * to idle without a true state transition. Refreshing the idle masks 661 + * without invoking ops.update_idle() ensures accurate idle state tracking 662 + * while avoiding unnecessary updates and maintaining balanced state 663 + * transitions. 664 + */ 665 + void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) 666 + { 667 + int cpu = cpu_of(rq); 668 + 669 + lockdep_assert_rq_held(rq); 670 + 671 + /* 672 + * Trigger ops.update_idle() only when transitioning from a task to 673 + * the idle thread and vice versa. 674 + * 675 + * Idle transitions are indicated by do_notify being set to true, 676 + * managed by put_prev_task_idle()/set_next_task_idle(). 677 + */ 678 + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) 679 + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); 680 + 681 + /* 682 + * Update the idle masks: 683 + * - for real idle transitions (do_notify == true) 684 + * - for idle-to-idle transitions (indicated by the previous task 685 + * being the idle thread, managed by pick_task_idle()) 686 + * 687 + * Skip updating idle masks if the previous task is not the idle 688 + * thread, since set_next_task_idle() has already handled it when 689 + * transitioning from a task to the idle thread (calling this 690 + * function with do_notify == true). 691 + * 692 + * In this way we can avoid updating the idle masks twice, 693 + * unnecessarily. 694 + */ 695 + if (static_branch_likely(&scx_builtin_idle_enabled)) 696 + if (do_notify || is_idle_task(rq->curr)) 697 + update_builtin_idle(cpu, idle); 698 + } 699 + 700 + static void reset_idle_masks(struct sched_ext_ops *ops) 701 + { 702 + int node; 703 + 704 + /* 705 + * Consider all online cpus idle. 
Should converge to the actual state 706 + * quickly. 707 + */ 708 + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { 709 + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); 710 + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); 711 + return; 712 + } 713 + 714 + for_each_node(node) { 715 + const struct cpumask *node_mask = cpumask_of_node(node); 716 + 717 + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); 718 + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); 719 + } 720 + } 721 + #endif /* CONFIG_SMP */ 722 + 723 + void scx_idle_enable(struct sched_ext_ops *ops) 724 + { 725 + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) 726 + static_branch_enable(&scx_builtin_idle_enabled); 727 + else 728 + static_branch_disable(&scx_builtin_idle_enabled); 729 + 730 + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) 731 + static_branch_enable(&scx_builtin_idle_per_node); 732 + else 733 + static_branch_disable(&scx_builtin_idle_per_node); 734 + 735 + #ifdef CONFIG_SMP 736 + reset_idle_masks(ops); 737 + #endif 738 + } 739 + 740 + void scx_idle_disable(void) 741 + { 742 + static_branch_disable(&scx_builtin_idle_enabled); 743 + static_branch_disable(&scx_builtin_idle_per_node); 744 + } 745 + 746 + /******************************************************************************** 747 + * Helpers that can be called from the BPF scheduler. 
748 + */ 749 + 750 + static int validate_node(int node) 751 + { 752 + if (!static_branch_likely(&scx_builtin_idle_per_node)) { 753 + scx_ops_error("per-node idle tracking is disabled"); 754 + return -EOPNOTSUPP; 755 + } 756 + 757 + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ 758 + if (node == NUMA_NO_NODE) 759 + return -ENOENT; 760 + 761 + /* Make sure node is in a valid range */ 762 + if (node < 0 || node >= nr_node_ids) { 763 + scx_ops_error("invalid node %d", node); 764 + return -EINVAL; 765 + } 766 + 767 + /* Make sure the node is part of the set of possible nodes */ 768 + if (!node_possible(node)) { 769 + scx_ops_error("unavailable node %d", node); 770 + return -EINVAL; 771 + } 772 + 773 + return node; 774 + } 775 + 776 + __bpf_kfunc_start_defs(); 777 + 778 + static bool check_builtin_idle_enabled(void) 779 + { 780 + if (static_branch_likely(&scx_builtin_idle_enabled)) 781 + return true; 782 + 783 + scx_ops_error("built-in idle tracking is disabled"); 784 + return false; 785 + } 786 + 787 + /** 788 + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or 789 + * trigger an error if @cpu is invalid 790 + * @cpu: target CPU 791 + */ 792 + __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) 793 + { 794 + #ifdef CONFIG_NUMA 795 + if (!ops_cpu_valid(cpu, NULL)) 796 + return NUMA_NO_NODE; 797 + 798 + return cpu_to_node(cpu); 799 + #else 800 + return 0; 801 + #endif 802 + } 803 + 804 + /** 805 + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() 806 + * @p: task_struct to select a CPU for 807 + * @prev_cpu: CPU @p was on previously 808 + * @wake_flags: %SCX_WAKE_* flags 809 + * @is_idle: out parameter indicating whether the returned CPU is idle 810 + * 811 + * Can only be called from ops.select_cpu() if the built-in CPU selection is 812 + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. 813 + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). 
814 + * 815 + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is 816 + * currently idle and thus a good candidate for direct dispatching. 817 + */ 818 + __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, 819 + u64 wake_flags, bool *is_idle) 820 + { 821 + #ifdef CONFIG_SMP 822 + s32 cpu; 823 + #endif 824 + if (!ops_cpu_valid(prev_cpu, NULL)) 825 + goto prev_cpu; 826 + 827 + if (!check_builtin_idle_enabled()) 828 + goto prev_cpu; 829 + 830 + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) 831 + goto prev_cpu; 832 + 833 + #ifdef CONFIG_SMP 834 + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); 835 + if (cpu >= 0) { 836 + *is_idle = true; 837 + return cpu; 838 + } 839 + #endif 840 + 841 + prev_cpu: 842 + *is_idle = false; 843 + return prev_cpu; 844 + } 845 + 846 + /** 847 + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the 848 + * idle-tracking per-CPU cpumask of a target NUMA node. 849 + * @node: target NUMA node 850 + * 851 + * Returns an empty cpumask if idle tracking is not enabled, if @node is 852 + * not valid, or running on a UP kernel. In this case the actual error will 853 + * be reported to the BPF scheduler via scx_ops_error(). 854 + */ 855 + __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) 856 + { 857 + node = validate_node(node); 858 + if (node < 0) 859 + return cpu_none_mask; 860 + 861 + #ifdef CONFIG_SMP 862 + return idle_cpumask(node)->cpu; 863 + #else 864 + return cpu_none_mask; 865 + #endif 866 + } 867 + 868 + /** 869 + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking 870 + * per-CPU cpumask. 871 + * 872 + * Returns an empty mask if idle tracking is not enabled, or running on a 873 + * UP kernel. 
874 + */ 875 + __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) 876 + { 877 + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { 878 + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 879 + return cpu_none_mask; 880 + } 881 + 882 + if (!check_builtin_idle_enabled()) 883 + return cpu_none_mask; 884 + 885 + #ifdef CONFIG_SMP 886 + return idle_cpumask(NUMA_NO_NODE)->cpu; 887 + #else 888 + return cpu_none_mask; 889 + #endif 890 + } 891 + 892 + /** 893 + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the 894 + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be 895 + * used to determine if an entire physical core is free. 896 + * @node: target NUMA node 897 + * 898 + * Returns an empty cpumask if idle tracking is not enabled, if @node is 899 + * not valid, or running on a UP kernel. In this case the actual error will 900 + * be reported to the BPF scheduler via scx_ops_error(). 901 + */ 902 + __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) 903 + { 904 + node = validate_node(node); 905 + if (node < 0) 906 + return cpu_none_mask; 907 + 908 + #ifdef CONFIG_SMP 909 + if (sched_smt_active()) 910 + return idle_cpumask(node)->smt; 911 + else 912 + return idle_cpumask(node)->cpu; 913 + #else 914 + return cpu_none_mask; 915 + #endif 916 + } 917 + 918 + /** 919 + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, 920 + * per-physical-core cpumask. Can be used to determine if an entire physical 921 + * core is free. 922 + * 923 + * Returns an empty mask if idle tracking is not enabled, or running on a 924 + * UP kernel. 
925 + */ 926 + __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) 927 + { 928 + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { 929 + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); 930 + return cpu_none_mask; 931 + } 932 + 933 + if (!check_builtin_idle_enabled()) 934 + return cpu_none_mask; 935 + 936 + #ifdef CONFIG_SMP 937 + if (sched_smt_active()) 938 + return idle_cpumask(NUMA_NO_NODE)->smt; 939 + else 940 + return idle_cpumask(NUMA_NO_NODE)->cpu; 941 + #else 942 + return cpu_none_mask; 943 + #endif 944 + } 945 + 946 + /** 947 + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to 948 + * either the percpu, or SMT idle-tracking cpumask. 949 + * @idle_mask: &cpumask to use 950 + */ 951 + __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) 952 + { 953 + /* 954 + * Empty function body because we aren't actually acquiring or releasing 955 + * a reference to a global idle cpumask, which is read-only in the 956 + * caller and is never released. The acquire / release semantics here 957 + * are just used to make the cpumask a trusted pointer in the caller. 958 + */ 959 + } 960 + 961 + /** 962 + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state 963 + * @cpu: cpu to test and clear idle for 964 + * 965 + * Returns %true if @cpu was idle and its idle state was successfully cleared. 966 + * %false otherwise. 967 + * 968 + * Unavailable if ops.update_idle() is implemented and 969 + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
970 + */ 971 + __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) 972 + { 973 + if (!check_builtin_idle_enabled()) 974 + return false; 975 + 976 + if (ops_cpu_valid(cpu, NULL)) 977 + return scx_idle_test_and_clear_cpu(cpu); 978 + else 979 + return false; 980 + } 981 + 982 + /** 983 + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node 984 + * @cpus_allowed: Allowed cpumask 985 + * @node: target NUMA node 986 + * @flags: %SCX_PICK_IDLE_* flags 987 + * 988 + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. 989 + * 990 + * Returns the picked idle cpu number on success, or -%EBUSY if no matching 991 + * cpu was found. 992 + * 993 + * The search starts from @node and proceeds to other online NUMA nodes in 994 + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, 995 + * in which case the search is limited to the target @node). 996 + * 997 + * Always returns an error if ops.update_idle() is implemented and 998 + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if 999 + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. 1000 + */ 1001 + __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, 1002 + int node, u64 flags) 1003 + { 1004 + node = validate_node(node); 1005 + if (node < 0) 1006 + return node; 1007 + 1008 + return scx_pick_idle_cpu(cpus_allowed, node, flags); 1009 + } 1010 + 1011 + /** 1012 + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu 1013 + * @cpus_allowed: Allowed cpumask 1014 + * @flags: %SCX_PICK_IDLE_* flags 1015 + * 1016 + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu 1017 + * number on success, or -%EBUSY if no matching cpu was found. 1018 + * 1019 + * Idle CPU tracking may race against CPU scheduling state transitions. For 1020 + * example, this function may return -%EBUSY as CPUs are transitioning into the 1021 + * idle state. 
If the caller then assumes that there will be dispatch events on 1022 + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs 1023 + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and 1024 + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch 1025 + * event in the near future. 1026 + * 1027 + * Unavailable if ops.update_idle() is implemented and 1028 + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 1029 + * 1030 + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use 1031 + * scx_bpf_pick_idle_cpu_node() instead. 1032 + */ 1033 + __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, 1034 + u64 flags) 1035 + { 1036 + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { 1037 + scx_ops_error("per-node idle tracking is enabled"); 1038 + return -EBUSY; 1039 + } 1040 + 1041 + if (!check_builtin_idle_enabled()) 1042 + return -EBUSY; 1043 + 1044 + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); 1045 + } 1046 + 1047 + /** 1048 + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available 1049 + * or pick any CPU from @node 1050 + * @cpus_allowed: Allowed cpumask 1051 + * @node: target NUMA node 1052 + * @flags: %SCX_PICK_IDLE_* flags 1053 + * 1054 + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any 1055 + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu 1056 + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is 1057 + * empty. 1058 + * 1059 + * The search starts from @node and proceeds to other online NUMA nodes in 1060 + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, 1061 + * in which case the search is limited to the target @node, regardless of 1062 + * the CPU idle state). 
1063 + * 1064 + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not 1065 + * set, this function can't tell which CPUs are idle and will always pick any 1066 + * CPU. 1067 + */ 1068 + __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, 1069 + int node, u64 flags) 1070 + { 1071 + s32 cpu; 1072 + 1073 + node = validate_node(node); 1074 + if (node < 0) 1075 + return node; 1076 + 1077 + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); 1078 + if (cpu >= 0) 1079 + return cpu; 1080 + 1081 + if (flags & SCX_PICK_IDLE_IN_NODE) 1082 + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); 1083 + else 1084 + cpu = cpumask_any_distribute(cpus_allowed); 1085 + if (cpu < nr_cpu_ids) 1086 + return cpu; 1087 + else 1088 + return -EBUSY; 1089 + } 1090 + 1091 + /** 1092 + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU 1093 + * @cpus_allowed: Allowed cpumask 1094 + * @flags: %SCX_PICK_IDLE_* flags 1095 + * 1096 + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any 1097 + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu 1098 + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is 1099 + * empty. 1100 + * 1101 + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not 1102 + * set, this function can't tell which CPUs are idle and will always pick any 1103 + * CPU. 1104 + * 1105 + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use 1106 + * scx_bpf_pick_any_cpu_node() instead. 
1107 + */ 1108 + __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, 1109 + u64 flags) 1110 + { 1111 + s32 cpu; 1112 + 1113 + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { 1114 + scx_ops_error("per-node idle tracking is enabled"); 1115 + return -EBUSY; 1116 + } 1117 + 1118 + if (static_branch_likely(&scx_builtin_idle_enabled)) { 1119 + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); 1120 + if (cpu >= 0) 1121 + return cpu; 1122 + } 1123 + 1124 + cpu = cpumask_any_distribute(cpus_allowed); 1125 + if (cpu < nr_cpu_ids) 1126 + return cpu; 1127 + else 1128 + return -EBUSY; 1129 + } 1130 + 1131 + __bpf_kfunc_end_defs(); 1132 + 1133 + BTF_KFUNCS_START(scx_kfunc_ids_idle) 1134 + BTF_ID_FLAGS(func, scx_bpf_cpu_node) 1135 + BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) 1136 + BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) 1137 + BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) 1138 + BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) 1139 + BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) 1140 + BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) 1141 + BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) 1142 + BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) 1143 + BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) 1144 + BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) 1145 + BTF_KFUNCS_END(scx_kfunc_ids_idle) 1146 + 1147 + static const struct btf_kfunc_id_set scx_kfunc_set_idle = { 1148 + .owner = THIS_MODULE, 1149 + .set = &scx_kfunc_ids_idle, 1150 + }; 1151 + 1152 + BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) 1153 + BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) 1154 + BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) 1155 + 1156 + static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { 1157 + .owner = THIS_MODULE, 1158 + .set = &scx_kfunc_ids_select_cpu, 1159 + }; 1160 + 1161 + int scx_idle_init(void) 1162 + { 1163 + int ret; 1164 + 1165 + ret = 
register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || 1166 + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || 1167 + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || 1168 + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); 1169 + 1170 + return ret; 1171 + }

+35

kernel/sched/ext_idle.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4 + * 5 + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 6 + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 7 + * Copyright (c) 2022 David Vernet <dvernet@meta.com> 8 + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> 9 + */ 10 + #ifndef _KERNEL_SCHED_EXT_IDLE_H 11 + #define _KERNEL_SCHED_EXT_IDLE_H 12 + 13 + struct sched_ext_ops; 14 + 15 + #ifdef CONFIG_SMP 16 + void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); 17 + void scx_idle_init_masks(void); 18 + bool scx_idle_test_and_clear_cpu(int cpu); 19 + s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); 20 + #else /* !CONFIG_SMP */ 21 + static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} 22 + static inline void scx_idle_init_masks(void) {} 23 + static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } 24 + static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) 25 + { 26 + return -EBUSY; 27 + } 28 + #endif /* CONFIG_SMP */ 29 + 30 + s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); 31 + void scx_idle_enable(struct sched_ext_ops *ops); 32 + void scx_idle_disable(void); 33 + int scx_idle_init(void); 34 + 35 + #endif /* _KERNEL_SCHED_EXT_IDLE_H */

+31

mm/mempolicy.c

··· 196 196 } 197 197 EXPORT_SYMBOL_GPL(numa_nearest_node); 198 198 199 + /** 200 + * nearest_node_nodemask - Find the node in @mask at the nearest distance 201 + * from @node. 202 + * 203 + * @node: a valid node ID to start the search from. 204 + * @mask: a pointer to a nodemask representing the allowed nodes. 205 + * 206 + * This function iterates over all nodes in @mask and calculates the 207 + * distance from the starting @node, then it returns the node ID that is 208 + * the closest to @node, or MAX_NUMNODES if no node is found. 209 + * 210 + * Note that @node must be a valid node ID usable with node_distance(), 211 + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes 212 + * or unexpected behavior. 213 + */ 214 + int nearest_node_nodemask(int node, nodemask_t *mask) 215 + { 216 + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; 217 + 218 + for_each_node_mask(n, *mask) { 219 + dist = node_distance(node, n); 220 + if (dist < min_dist) { 221 + min_dist = dist; 222 + min_node = n; 223 + } 224 + } 225 + 226 + return min_node; 227 + } 228 + EXPORT_SYMBOL_GPL(nearest_node_nodemask); 229 + 199 230 struct mempolicy *get_task_policy(struct task_struct *p) 200 231 { 201 232 struct mempolicy *pol = p->mempolicy;

+34

tools/sched_ext/include/scx/common.bpf.h

··· 7 7 #ifndef __SCX_COMMON_BPF_H 8 8 #define __SCX_COMMON_BPF_H 9 9 10 + /* 11 + * The generated kfunc prototypes in vmlinux.h are missing address space 12 + * attributes which cause build failures. For now, suppress the generated 13 + * prototypes. See https://github.com/sched-ext/scx/issues/1111. 14 + */ 15 + #define BPF_NO_KFUNC_PROTOTYPES 16 + 10 17 #ifdef LSP 11 18 #define __bpf__ 12 19 #include "../vmlinux.h" ··· 25 18 #include <bpf/bpf_tracing.h> 26 19 #include <asm-generic/errno.h> 27 20 #include "user_exit_info.h" 21 + #include "enum_defs.autogen.h" 28 22 29 23 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 30 24 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ ··· 70 62 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; 71 63 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; 72 64 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; 65 + u32 scx_bpf_nr_node_ids(void) __ksym __weak; 73 66 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; 67 + int scx_bpf_cpu_node(s32 cpu) __ksym __weak; 74 68 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; 75 69 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; 76 70 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; 71 + const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak; 77 72 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; 73 + const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak; 78 74 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; 79 75 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; 80 76 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; 77 + s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; 81 78 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 79 + s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; 82 80 s32 
scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 83 81 bool scx_bpf_task_running(const struct task_struct *p) __ksym; 84 82 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 85 83 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 86 84 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; 87 85 u64 scx_bpf_now(void) __ksym __weak; 86 + void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; 88 87 89 88 /* 90 89 * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from 91 90 * within bpf_for_each() loops. 92 91 */ 93 92 #define BPF_FOR_EACH_ITER (&___it) 93 + 94 + #define scx_read_event(e, name) \ 95 + (bpf_core_field_exists((e)->name) ? (e)->name : 0) 94 96 95 97 static inline __attribute__((format(printf, 1, 2))) 96 98 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ··· 600 582 { .__val = (val) }; \ 601 583 __write_once_size(&(x), __u.__c, sizeof(x)); \ 602 584 __u.__val; \ 585 + }) 586 + 587 + #define READ_ONCE_ARENA(type, x) \ 588 + ({ \ 589 + union { type __val; char __c[1]; } __u = \ 590 + { .__c = { 0 } }; \ 591 + __read_once_size((void *)&(x), __u.__c, sizeof(x)); \ 592 + __u.__val; \ 593 + }) 594 + 595 + #define WRITE_ONCE_ARENA(type, x, val) \ 596 + ({ \ 597 + union { type __val; char __c[1]; } __u = \ 598 + { .__val = (val) }; \ 599 + __write_once_size((void *)&(x), __u.__c, sizeof(x)); \ 600 + __u.__val; \ 603 601 }) 604 602 605 603 /*

+1

tools/sched_ext/include/scx/common.h

··· 16 16 #include <stdlib.h> 17 17 #include <stdint.h> 18 18 #include <errno.h> 19 + #include "enum_defs.autogen.h" 19 20 20 21 typedef uint8_t u8; 21 22 typedef uint16_t u16;

+95

tools/sched_ext/include/scx/compat.bpf.h

··· 125 125 false; \ 126 126 }) 127 127 128 + /** 129 + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on 130 + * in a compatible way. We will preserve this __COMPAT helper until v6.16. 131 + * 132 + * @enq_flags: enqueue flags from ops.enqueue() 133 + * 134 + * Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags 135 + */ 136 + static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) 137 + { 138 + #ifdef HAVE_SCX_ENQ_CPU_SELECTED 139 + /* 140 + * This is the case that a BPF code compiled against vmlinux.h 141 + * where the enum SCX_ENQ_CPU_SELECTED exists. 142 + */ 143 + 144 + /* 145 + * We should temporarily suspend the macro expansion of 146 + * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being 147 + * rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED' 148 + * is defined in 'scripts/gen_enums.py'. 149 + */ 150 + #pragma push_macro("SCX_ENQ_CPU_SELECTED") 151 + #undef SCX_ENQ_CPU_SELECTED 152 + u64 flag; 153 + 154 + /* 155 + * When the kernel did not have SCX_ENQ_CPU_SELECTED, 156 + * select_task_rq_scx() has never been skipped. Thus, this case 157 + * should be considered that the CPU has already been selected. 158 + */ 159 + if (!bpf_core_enum_value_exists(enum scx_enq_flags, 160 + SCX_ENQ_CPU_SELECTED)) 161 + return true; 162 + 163 + flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED); 164 + return enq_flags & flag; 165 + 166 + /* 167 + * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'. 168 + */ 169 + #pragma pop_macro("SCX_ENQ_CPU_SELECTED") 170 + #else 171 + /* 172 + * This is the case that a BPF code compiled against vmlinux.h 173 + * where the enum SCX_ENQ_CPU_SELECTED does NOT exist. 174 + */ 175 + return true; 176 + #endif /* HAVE_SCX_ENQ_CPU_SELECTED */ 177 + } 178 + 179 + 128 180 #define scx_bpf_now() \ 129 181 (bpf_ksym_exists(scx_bpf_now) ? \ 130 182 scx_bpf_now() : \ 131 183 bpf_ktime_get_ns()) 184 + 185 + /* 186 + * v6.15: Introduce event counters. 
187 + * 188 + * Preserve the following macro until v6.17. 189 + */ 190 + #define __COMPAT_scx_bpf_events(events, size) \ 191 + (bpf_ksym_exists(scx_bpf_events) ? \ 192 + scx_bpf_events(events, size) : ({})) 193 + 194 + /* 195 + * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle 196 + * cpumasks. 197 + * 198 + * Preserve the following __COMPAT_scx_*_node macros until v6.17. 199 + */ 200 + #define __COMPAT_scx_bpf_nr_node_ids() \ 201 + (bpf_ksym_exists(scx_bpf_nr_node_ids) ? \ 202 + scx_bpf_nr_node_ids() : 1U) 203 + 204 + #define __COMPAT_scx_bpf_cpu_node(cpu) \ 205 + (bpf_ksym_exists(scx_bpf_cpu_node) ? \ 206 + scx_bpf_cpu_node(cpu) : 0) 207 + 208 + #define __COMPAT_scx_bpf_get_idle_cpumask_node(node) \ 209 + (bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ? \ 210 + scx_bpf_get_idle_cpumask_node(node) : \ 211 + scx_bpf_get_idle_cpumask()) \ 212 + 213 + #define __COMPAT_scx_bpf_get_idle_smtmask_node(node) \ 214 + (bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ? \ 215 + scx_bpf_get_idle_smtmask_node(node) : \ 216 + scx_bpf_get_idle_smtmask()) 217 + 218 + #define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) \ 219 + (bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ? \ 220 + scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) : \ 221 + scx_bpf_pick_idle_cpu(cpus_allowed, flags)) 222 + 223 + #define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) \ 224 + (bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \ 225 + scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ 226 + scx_bpf_pick_any_cpu(cpus_allowed, flags)) 132 227 133 228 /* 134 229 * Define sched_ext_ops. This may be expanded to define multiple variants for

+14 -2

tools/sched_ext/include/scx/compat.h

··· 106 106 return false; 107 107 } 108 108 109 - #define SCX_OPS_SWITCH_PARTIAL \ 110 - __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") 109 + #define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name) 110 + 111 + #define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE) 112 + #define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST) 113 + #define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING) 114 + #define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL) 115 + #define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED) 116 + #define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP) 117 + #define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE) 118 + 119 + #define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name) 120 + 121 + #define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE) 122 + #define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE) 111 123 112 124 static inline long scx_hotplug_seq(void) 113 125 {

+120

tools/sched_ext/include/scx/enum_defs.autogen.h

··· 1 + /* 2 + * WARNING: This file is autogenerated from gen_enum_defs.py [1]. 3 + * 4 + * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py 5 + */ 6 + 7 + #ifndef __ENUM_DEFS_AUTOGEN_H__ 8 + #define __ENUM_DEFS_AUTOGEN_H__ 9 + 10 + #define HAVE_SCX_DSP_DFL_MAX_BATCH 11 + #define HAVE_SCX_DSP_MAX_LOOPS 12 + #define HAVE_SCX_WATCHDOG_MAX_TIMEOUT 13 + #define HAVE_SCX_EXIT_BT_LEN 14 + #define HAVE_SCX_EXIT_MSG_LEN 15 + #define HAVE_SCX_EXIT_DUMP_DFL_LEN 16 + #define HAVE_SCX_CPUPERF_ONE 17 + #define HAVE_SCX_OPS_TASK_ITER_BATCH 18 + #define HAVE_SCX_CPU_PREEMPT_RT 19 + #define HAVE_SCX_CPU_PREEMPT_DL 20 + #define HAVE_SCX_CPU_PREEMPT_STOP 21 + #define HAVE_SCX_CPU_PREEMPT_UNKNOWN 22 + #define HAVE_SCX_DEQ_SLEEP 23 + #define HAVE_SCX_DEQ_CORE_SCHED_EXEC 24 + #define HAVE_SCX_DSQ_FLAG_BUILTIN 25 + #define HAVE_SCX_DSQ_FLAG_LOCAL_ON 26 + #define HAVE_SCX_DSQ_INVALID 27 + #define HAVE_SCX_DSQ_GLOBAL 28 + #define HAVE_SCX_DSQ_LOCAL 29 + #define HAVE_SCX_DSQ_LOCAL_ON 30 + #define HAVE_SCX_DSQ_LOCAL_CPU_MASK 31 + #define HAVE_SCX_DSQ_ITER_REV 32 + #define HAVE___SCX_DSQ_ITER_HAS_SLICE 33 + #define HAVE___SCX_DSQ_ITER_HAS_VTIME 34 + #define HAVE___SCX_DSQ_ITER_USER_FLAGS 35 + #define HAVE___SCX_DSQ_ITER_ALL_FLAGS 36 + #define HAVE_SCX_DSQ_LNODE_ITER_CURSOR 37 + #define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT 38 + #define HAVE_SCX_ENQ_WAKEUP 39 + #define HAVE_SCX_ENQ_HEAD 40 + #define HAVE_SCX_ENQ_CPU_SELECTED 41 + #define HAVE_SCX_ENQ_PREEMPT 42 + #define HAVE_SCX_ENQ_REENQ 43 + #define HAVE_SCX_ENQ_LAST 44 + #define HAVE___SCX_ENQ_INTERNAL_MASK 45 + #define HAVE_SCX_ENQ_CLEAR_OPSS 46 + #define HAVE_SCX_ENQ_DSQ_PRIQ 47 + #define HAVE_SCX_TASK_DSQ_ON_PRIQ 48 + #define HAVE_SCX_TASK_QUEUED 49 + #define HAVE_SCX_TASK_RESET_RUNNABLE_AT 50 + #define HAVE_SCX_TASK_DEQD_FOR_SLEEP 51 + #define HAVE_SCX_TASK_STATE_SHIFT 52 + #define HAVE_SCX_TASK_STATE_BITS 53 + #define HAVE_SCX_TASK_STATE_MASK 54 + #define HAVE_SCX_TASK_CURSOR 55 + #define HAVE_SCX_ECODE_RSN_HOTPLUG 
56 + #define HAVE_SCX_ECODE_ACT_RESTART 57 + #define HAVE_SCX_EXIT_NONE 58 + #define HAVE_SCX_EXIT_DONE 59 + #define HAVE_SCX_EXIT_UNREG 60 + #define HAVE_SCX_EXIT_UNREG_BPF 61 + #define HAVE_SCX_EXIT_UNREG_KERN 62 + #define HAVE_SCX_EXIT_SYSRQ 63 + #define HAVE_SCX_EXIT_ERROR 64 + #define HAVE_SCX_EXIT_ERROR_BPF 65 + #define HAVE_SCX_EXIT_ERROR_STALL 66 + #define HAVE_SCX_KF_UNLOCKED 67 + #define HAVE_SCX_KF_CPU_RELEASE 68 + #define HAVE_SCX_KF_DISPATCH 69 + #define HAVE_SCX_KF_ENQUEUE 70 + #define HAVE_SCX_KF_SELECT_CPU 71 + #define HAVE_SCX_KF_REST 72 + #define HAVE___SCX_KF_RQ_LOCKED 73 + #define HAVE___SCX_KF_TERMINAL 74 + #define HAVE_SCX_KICK_IDLE 75 + #define HAVE_SCX_KICK_PREEMPT 76 + #define HAVE_SCX_KICK_WAIT 77 + #define HAVE_SCX_OPI_BEGIN 78 + #define HAVE_SCX_OPI_NORMAL_BEGIN 79 + #define HAVE_SCX_OPI_NORMAL_END 80 + #define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN 81 + #define HAVE_SCX_OPI_CPU_HOTPLUG_END 82 + #define HAVE_SCX_OPI_END 83 + #define HAVE_SCX_OPS_ENABLING 84 + #define HAVE_SCX_OPS_ENABLED 85 + #define HAVE_SCX_OPS_DISABLING 86 + #define HAVE_SCX_OPS_DISABLED 87 + #define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE 88 + #define HAVE_SCX_OPS_ENQ_LAST 89 + #define HAVE_SCX_OPS_ENQ_EXITING 90 + #define HAVE_SCX_OPS_SWITCH_PARTIAL 91 + #define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT 92 + #define HAVE_SCX_OPS_ALL_FLAGS 93 + #define HAVE_SCX_OPSS_NONE 94 + #define HAVE_SCX_OPSS_QUEUEING 95 + #define HAVE_SCX_OPSS_QUEUED 96 + #define HAVE_SCX_OPSS_DISPATCHING 97 + #define HAVE_SCX_OPSS_QSEQ_SHIFT 98 + #define HAVE_SCX_PICK_IDLE_CORE 99 + #define HAVE_SCX_OPS_NAME_LEN 100 + #define HAVE_SCX_SLICE_DFL 101 + #define HAVE_SCX_SLICE_INF 102 + #define HAVE_SCX_RQ_ONLINE 103 + #define HAVE_SCX_RQ_CAN_STOP_TICK 104 + #define HAVE_SCX_RQ_BAL_PENDING 105 + #define HAVE_SCX_RQ_BAL_KEEP 106 + #define HAVE_SCX_RQ_BYPASSING 107 + #define HAVE_SCX_RQ_IN_WAKEUP 108 + #define HAVE_SCX_RQ_IN_BALANCE 109 + #define HAVE_SCX_TASK_NONE 110 + #define HAVE_SCX_TASK_INIT 111 + #define 
HAVE_SCX_TASK_READY 112 + #define HAVE_SCX_TASK_ENABLED 113 + #define HAVE_SCX_TASK_NR_STATES 114 + #define HAVE_SCX_TG_ONLINE 115 + #define HAVE_SCX_TG_INITED 116 + #define HAVE_SCX_WAKE_FORK 117 + #define HAVE_SCX_WAKE_TTWU 118 + #define HAVE_SCX_WAKE_SYNC 119 + 120 + #endif /* __ENUM_DEFS_AUTOGEN_H__ */

+12 -3

tools/sched_ext/scx_central.c

··· 10 10 #include <unistd.h> 11 11 #include <inttypes.h> 12 12 #include <signal.h> 13 + #include <assert.h> 13 14 #include <libgen.h> 14 15 #include <bpf/bpf.h> 15 16 #include <scx/common.h> ··· 61 60 skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); 62 61 skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); 63 62 63 + assert(skel->rodata->nr_cpu_ids <= INT32_MAX); 64 + 64 65 while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { 65 66 switch (opt) { 66 67 case 's': 67 68 skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; 68 69 break; 69 - case 'c': 70 - skel->rodata->central_cpu = strtoul(optarg, NULL, 0); 70 + case 'c': { 71 + u32 central_cpu = strtoul(optarg, NULL, 0); 72 + if (central_cpu >= skel->rodata->nr_cpu_ids) { 73 + fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids); 74 + return -1; 75 + } 76 + skel->rodata->central_cpu = (s32)central_cpu; 71 77 break; 78 + } 72 79 case 'v': 73 80 verbose = true; 74 81 break; ··· 105 96 */ 106 97 cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); 107 98 SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); 108 - CPU_ZERO(cpuset); 99 + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); 109 100 CPU_SET(skel->rodata->central_cpu, cpuset); 110 101 SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), 111 102 "Failed to affinitize to central CPU %d (max %d)",

+22 -1

tools/sched_ext/scx_qmap.bpf.c

··· 231 231 } 232 232 233 233 /* if select_cpu() wasn't called, try direct dispatch */ 234 - if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && 234 + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && 235 235 (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { 236 236 __sync_fetch_and_add(&nr_ddsp_from_enq, 1); 237 237 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); ··· 763 763 764 764 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) 765 765 { 766 + struct scx_event_stats events; 767 + 766 768 bpf_rcu_read_lock(); 767 769 dispatch_highpri(true); 768 770 bpf_rcu_read_unlock(); ··· 773 771 774 772 if (print_shared_dsq) 775 773 dump_shared_dsq(); 774 + 775 + __COMPAT_scx_bpf_events(&events, sizeof(events)); 776 + 777 + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", 778 + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); 779 + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", 780 + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); 781 + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", 782 + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); 783 + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", 784 + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); 785 + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL", 786 + scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); 787 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", 788 + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); 789 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", 790 + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); 791 + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", 792 + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); 776 793 777 794 bpf_timer_start(timer, ONE_SEC_IN_NS, 0); 778 795 return 0;

+1

tools/testing/selftests/sched_ext/Makefile

··· 172 172 maximal \ 173 173 maybe_null \ 174 174 minimal \ 175 + numa \ 175 176 prog_run \ 176 177 reload_loop \ 177 178 select_cpu_dfl \

+100

tools/testing/selftests/sched_ext/numa.bpf.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * A scheduler that validates the behavior of the NUMA-aware 4 + * functionalities. 5 + * 6 + * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks 7 + * are exclusively processed by CPUs within their respective nodes. Idle 8 + * CPUs are selected only within the same node, so task migration can only 9 + * occurs between CPUs belonging to the same node. 10 + * 11 + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> 12 + */ 13 + 14 + #include <scx/common.bpf.h> 15 + 16 + char _license[] SEC("license") = "GPL"; 17 + 18 + UEI_DEFINE(uei); 19 + 20 + const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE; 21 + 22 + static bool is_cpu_idle(s32 cpu, int node) 23 + { 24 + const struct cpumask *idle_cpumask; 25 + bool idle; 26 + 27 + idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node); 28 + idle = bpf_cpumask_test_cpu(cpu, idle_cpumask); 29 + scx_bpf_put_cpumask(idle_cpumask); 30 + 31 + return idle; 32 + } 33 + 34 + s32 BPF_STRUCT_OPS(numa_select_cpu, 35 + struct task_struct *p, s32 prev_cpu, u64 wake_flags) 36 + { 37 + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); 38 + s32 cpu; 39 + 40 + /* 41 + * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here, 42 + * since it already tries to pick an idle CPU within the node 43 + * first, but let's use both functions for better testing coverage. 
44 + */ 45 + cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, 46 + __COMPAT_SCX_PICK_IDLE_IN_NODE); 47 + if (cpu < 0) 48 + cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, 49 + __COMPAT_SCX_PICK_IDLE_IN_NODE); 50 + 51 + if (is_cpu_idle(cpu, node)) 52 + scx_bpf_error("CPU %d should be marked as busy", cpu); 53 + 54 + if (__COMPAT_scx_bpf_cpu_node(cpu) != node) 55 + scx_bpf_error("CPU %d should be in node %d", cpu, node); 56 + 57 + return cpu; 58 + } 59 + 60 + void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags) 61 + { 62 + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); 63 + 64 + scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags); 65 + } 66 + 67 + void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev) 68 + { 69 + int node = __COMPAT_scx_bpf_cpu_node(cpu); 70 + 71 + scx_bpf_dsq_move_to_local(node); 72 + } 73 + 74 + s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init) 75 + { 76 + int node, err; 77 + 78 + bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) { 79 + err = scx_bpf_create_dsq(node, node); 80 + if (err) 81 + return err; 82 + } 83 + 84 + return 0; 85 + } 86 + 87 + void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei) 88 + { 89 + UEI_RECORD(uei, ei); 90 + } 91 + 92 + SEC(".struct_ops.link") 93 + struct sched_ext_ops numa_ops = { 94 + .select_cpu = (void *)numa_select_cpu, 95 + .enqueue = (void *)numa_enqueue, 96 + .dispatch = (void *)numa_dispatch, 97 + .init = (void *)numa_init, 98 + .exit = (void *)numa_exit, 99 + .name = "numa", 100 + };

+59

tools/testing/selftests/sched_ext/numa.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> 4 + */ 5 + #include <bpf/bpf.h> 6 + #include <scx/common.h> 7 + #include <sys/wait.h> 8 + #include <unistd.h> 9 + #include "numa.bpf.skel.h" 10 + #include "scx_test.h" 11 + 12 + static enum scx_test_status setup(void **ctx) 13 + { 14 + struct numa *skel; 15 + 16 + skel = numa__open(); 17 + SCX_FAIL_IF(!skel, "Failed to open"); 18 + SCX_ENUM_INIT(skel); 19 + skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE; 20 + skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE; 21 + SCX_FAIL_IF(numa__load(skel), "Failed to load skel"); 22 + 23 + *ctx = skel; 24 + 25 + return SCX_TEST_PASS; 26 + } 27 + 28 + static enum scx_test_status run(void *ctx) 29 + { 30 + struct numa *skel = ctx; 31 + struct bpf_link *link; 32 + 33 + link = bpf_map__attach_struct_ops(skel->maps.numa_ops); 34 + SCX_FAIL_IF(!link, "Failed to attach scheduler"); 35 + 36 + /* Just sleeping is fine, plenty of scheduling events happening */ 37 + sleep(1); 38 + 39 + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); 40 + bpf_link__destroy(link); 41 + 42 + return SCX_TEST_PASS; 43 + } 44 + 45 + static void cleanup(void *ctx) 46 + { 47 + struct numa *skel = ctx; 48 + 49 + numa__destroy(skel); 50 + } 51 + 52 + struct scx_test numa = { 53 + .name = "numa", 54 + .description = "Verify NUMA-aware functionalities", 55 + .setup = setup, 56 + .run = run, 57 + .cleanup = cleanup, 58 + }; 59 + REGISTER_SCX_TEST(&numa)

Configure Feed

Configure Feed