tools/sched_ext/include: Add missing helpers to common.bpf.h

+277

1 changed file

expand all

tools

sched_ext

include

scx

common.bpf.h

+277

tools/sched_ext/include/scx/common.bpf.h

··· 292 292 }) 293 293 #endif /* ARRAY_ELEM_PTR */ 294 294 295 + /** 296 + * __sink - Hide @expr's value from the compiler and BPF verifier 297 + * @expr: The expression whose value should be opacified 298 + * 299 + * No-op at runtime. The empty inline assembly with a read-write constraint 300 + * ("+g") has two effects at compile/verify time: 301 + * 302 + * 1. Compiler: treats @expr as both read and written, preventing dead-code 303 + * elimination and keeping @expr (and any side effects that produced it) 304 + * alive. 305 + * 306 + * 2. BPF verifier: forgets the precise value/range of @expr ("makes it 307 + * imprecise"). The verifier normally tracks exact ranges for every register 308 + * and stack slot. While useful, precision means each distinct value creates a 309 + * separate verifier state. Inside loops this leads to state explosion - each 310 + * iteration carries different precise values so states never merge and the 311 + * verifier explores every iteration individually. 312 + * 313 + * Example - preventing loop state explosion:: 314 + * 315 + * u32 nr_intersects = 0, nr_covered = 0; 316 + * __sink(nr_intersects); 317 + * __sink(nr_covered); 318 + * bpf_for(i, 0, nr_nodes) { 319 + * if (intersects(cpumask, node_mask[i])) 320 + * nr_intersects++; 321 + * if (covers(cpumask, node_mask[i])) 322 + * nr_covered++; 323 + * } 324 + * 325 + * Without __sink(), the verifier tracks every possible (nr_intersects, 326 + * nr_covered) pair across iterations, causing "BPF program is too large". With 327 + * __sink(), the values become unknown scalars so all iterations collapse into 328 + * one reusable state. 329 + * 330 + * Example - keeping a reference alive:: 331 + * 332 + * struct task_struct *t = bpf_task_acquire(task); 333 + * __sink(t); 334 + * 335 + * Follows the convention from BPF selftests (bpf_misc.h). 336 + */ 337 + #define __sink(expr) asm volatile ("" : "+g"(expr)) 338 + 295 339 /* 296 340 * BPF declarations and helpers 297 341 */ ··· 381 337 382 338 /* cgroup */ 383 339 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; 340 + struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) __ksym; 384 341 void bpf_cgroup_release(struct cgroup *cgrp) __ksym; 385 342 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; 386 343 ··· 788 743 } 789 744 790 745 /* 746 + * ctzll -- Counts trailing zeros in an unsigned long long. If the input value 747 + * is zero, the return value is undefined. 748 + */ 749 + static inline int ctzll(u64 v) 750 + { 751 + #if (!defined(__BPF__) && defined(__SCX_TARGET_ARCH_x86)) || \ 752 + (defined(__BPF__) && defined(__clang_major__) && __clang_major__ >= 19) 753 + /* 754 + * Use the ctz builtin when: (1) building for native x86, or 755 + * (2) building for BPF with clang >= 19 (BPF backend supports 756 + * the intrinsic from clang 19 onward; earlier versions hit 757 + * "unimplemented opcode" in the backend). 758 + */ 759 + return __builtin_ctzll(v); 760 + #else 761 + /* 762 + * If neither the target architecture nor the toolchains support ctzll, 763 + * use software-based emulation. Let's use the De Bruijn sequence-based 764 + * approach to find LSB fastly. See the details of De Bruijn sequence: 765 + * 766 + * https://en.wikipedia.org/wiki/De_Bruijn_sequence 767 + * https://www.chessprogramming.org/BitScan#De_Bruijn_Multiplication 768 + */ 769 + const int lookup_table[64] = { 770 + 0, 1, 48, 2, 57, 49, 28, 3, 61, 58, 50, 42, 38, 29, 17, 4, 771 + 62, 55, 59, 36, 53, 51, 43, 22, 45, 39, 33, 30, 24, 18, 12, 5, 772 + 63, 47, 56, 27, 60, 41, 37, 16, 54, 35, 52, 21, 44, 32, 23, 11, 773 + 46, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, 774 + }; 775 + const u64 DEBRUIJN_CONSTANT = 0x03f79d71b4cb0a89ULL; 776 + unsigned int index; 777 + u64 lowest_bit; 778 + const int *lt; 779 + 780 + if (v == 0) 781 + return -1; 782 + 783 + /* 784 + * Isolate the least significant bit (LSB). 785 + * For example, if v = 0b...10100, then v & -v = 0b...00100 786 + */ 787 + lowest_bit = v & -v; 788 + 789 + /* 790 + * Each isolated bit produces a unique 6-bit value, guaranteed by the 791 + * De Bruijn property. Calculate a unique index into the lookup table 792 + * using the magic constant and a right shift. 793 + * 794 + * Multiplying by the 64-bit constant "spreads out" that 1-bit into a 795 + * unique pattern in the top 6 bits. This uniqueness property is 796 + * exactly what a De Bruijn sequence guarantees: Every possible 6-bit 797 + * pattern (in top bits) occurs exactly once for each LSB position. So, 798 + * the constant 0x03f79d71b4cb0a89ULL is carefully chosen to be a 799 + * De Bruijn sequence, ensuring no collisions in the table index. 800 + */ 801 + index = (lowest_bit * DEBRUIJN_CONSTANT) >> 58; 802 + 803 + /* 804 + * Lookup in a precomputed table. No collision is guaranteed by the 805 + * De Bruijn property. 806 + */ 807 + lt = MEMBER_VPTR(lookup_table, [index]); 808 + return (lt)? *lt : -1; 809 + #endif 810 + } 811 + 812 + /* 791 813 * Return a value proportionally scaled to the task's weight. 792 814 */ 793 815 static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value) ··· 870 758 return value * 100 / p->scx.weight; 871 759 } 872 760 761 + 762 + /* 763 + * Get a random u64 from the kernel's pseudo-random generator. 764 + */ 765 + static inline u64 get_prandom_u64() 766 + { 767 + return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32(); 768 + } 769 + 770 + /* 771 + * Define the shadow structure to avoid a compilation error when 772 + * vmlinux.h does not enable necessary kernel configs. The ___local 773 + * suffix is a CO-RE convention that tells the loader to match this 774 + * against the base struct rq in the kernel. The attribute 775 + * preserve_access_index tells the compiler to generate a CO-RE 776 + * relocation for these fields. 777 + */ 778 + struct rq___local { 779 + /* 780 + * A monotonically increasing clock per CPU. It is rq->clock minus 781 + * cumulative IRQ time and hypervisor steal time. Unlike rq->clock, 782 + * it does not advance during IRQ processing or hypervisor preemption. 783 + * It does advance during idle (the idle task counts as a running task 784 + * for this purpose). 785 + */ 786 + u64 clock_task; 787 + /* 788 + * Invariant version of clock_task scaled by CPU capacity and 789 + * frequency. For example, clock_pelt advances 2x slower on a CPU 790 + * with half the capacity. 791 + * 792 + * At idle exit, rq->clock_pelt jumps forward to resync with 793 + * clock_task. The kernel's rq_clock_pelt() corrects for this jump 794 + * by subtracting lost_idle_time, yielding a clock that appears 795 + * continuous across idle transitions. scx_clock_pelt() mirrors 796 + * rq_clock_pelt() by performing the same subtraction. 797 + */ 798 + u64 clock_pelt; 799 + /* 800 + * Accumulates the magnitude of each clock_pelt jump at idle exit. 801 + * Subtracting this from clock_pelt gives rq_clock_pelt(): a 802 + * continuous, capacity-invariant clock suitable for both task 803 + * execution time stamping and cross-idle measurements. 804 + */ 805 + unsigned long lost_idle_time; 806 + /* 807 + * Shadow of paravirt_steal_clock() (the hypervisor's cumulative 808 + * stolen time counter). Stays frozen while the hypervisor preempts 809 + * the vCPU; catches up the next time update_rq_clock_task() is 810 + * called. The delta is the stolen time not yet subtracted from 811 + * clock_task. 812 + * 813 + * Unlike irqtime->total (a plain kernel-side field), the live stolen 814 + * time counter lives in hypervisor-specific shared memory and has no 815 + * kernel-side equivalent readable from BPF in a hypervisor-agnostic 816 + * way. This field is therefore the only portable BPF-accessible 817 + * approximation of cumulative steal time. 818 + * 819 + * Available only when CONFIG_PARAVIRT_TIME_ACCOUNTING is on. 820 + */ 821 + u64 prev_steal_time_rq; 822 + } __attribute__((preserve_access_index)); 823 + 824 + extern struct rq runqueues __ksym; 825 + 826 + /* 827 + * Define the shadow structure to avoid a compilation error when 828 + * vmlinux.h does not enable necessary kernel configs. 829 + */ 830 + struct irqtime___local { 831 + /* 832 + * Cumulative IRQ time counter for this CPU, in nanoseconds. Advances 833 + * immediately at the exit of every hardirq and non-ksoftirqd softirq 834 + * via irqtime_account_irq(). ksoftirqd time is counted as normal 835 + * task time and is NOT included. NMI time is also NOT included. 836 + * 837 + * The companion field irqtime->sync (struct u64_stats_sync) protects 838 + * against 64-bit tearing on 32-bit architectures. On 64-bit kernels, 839 + * u64_stats_sync is an empty struct and all seqcount operations are 840 + * no-ops, so a plain BPF_CORE_READ of this field is safe. 841 + * 842 + * Available only when CONFIG_IRQ_TIME_ACCOUNTING is on. 843 + */ 844 + u64 total; 845 + } __attribute__((preserve_access_index)); 846 + 847 + /* 848 + * cpu_irqtime is a per-CPU variable defined only when 849 + * CONFIG_IRQ_TIME_ACCOUNTING is on. Declare it as __weak so the BPF 850 + * loader sets its address to 0 (rather than failing) when the symbol 851 + * is absent from the running kernel. 852 + */ 853 + extern struct irqtime___local cpu_irqtime __ksym __weak; 854 + 855 + static inline struct rq___local *get_current_rq(u32 cpu) 856 + { 857 + /* 858 + * This is a workaround to get an rq pointer since we decided to 859 + * deprecate scx_bpf_cpu_rq(). 860 + * 861 + * WARNING: The caller must hold the rq lock for @cpu. This is 862 + * guaranteed when called from scheduling callbacks (ops.running, 863 + * ops.stopping, ops.enqueue, ops.dequeue, ops.dispatch, etc.). 864 + * There is no runtime check available in BPF for kernel spinlock 865 + * state — correctness is enforced by calling context only. 866 + */ 867 + return (void *)bpf_per_cpu_ptr(&runqueues, cpu); 868 + } 869 + 870 + static inline u64 scx_clock_task(u32 cpu) 871 + { 872 + struct rq___local *rq = get_current_rq(cpu); 873 + 874 + /* Equivalent to the kernel's rq_clock_task(). */ 875 + return rq ? rq->clock_task : 0; 876 + } 877 + 878 + static inline u64 scx_clock_pelt(u32 cpu) 879 + { 880 + struct rq___local *rq = get_current_rq(cpu); 881 + 882 + /* 883 + * Equivalent to the kernel's rq_clock_pelt(): subtracts 884 + * lost_idle_time from clock_pelt to absorb the jump that occurs 885 + * when clock_pelt resyncs with clock_task at idle exit. The result 886 + * is a continuous, capacity-invariant clock safe for both task 887 + * execution time stamping and cross-idle measurements. 888 + */ 889 + return rq ? (rq->clock_pelt - rq->lost_idle_time) : 0; 890 + } 891 + 892 + static inline u64 scx_clock_virt(u32 cpu) 893 + { 894 + struct rq___local *rq; 895 + 896 + /* 897 + * Check field existence before calling get_current_rq() so we avoid 898 + * the per_cpu lookup entirely on kernels built without 899 + * CONFIG_PARAVIRT_TIME_ACCOUNTING. 900 + */ 901 + if (!bpf_core_field_exists(((struct rq___local *)0)->prev_steal_time_rq)) 902 + return 0; 903 + 904 + /* Lagging shadow of the kernel's paravirt_steal_clock(). */ 905 + rq = get_current_rq(cpu); 906 + return rq ? BPF_CORE_READ(rq, prev_steal_time_rq) : 0; 907 + } 908 + 909 + static inline u64 scx_clock_irq(u32 cpu) 910 + { 911 + struct irqtime___local *irqt; 912 + 913 + /* 914 + * bpf_core_type_exists() resolves at load time: if struct irqtime is 915 + * absent from kernel BTF (CONFIG_IRQ_TIME_ACCOUNTING off), the loader 916 + * patches this into an unconditional return 0, making the 917 + * bpf_per_cpu_ptr() call below dead code that the verifier never sees. 918 + */ 919 + if (!bpf_core_type_exists(struct irqtime___local)) 920 + return 0; 921 + 922 + /* Equivalent to the kernel's irq_time_read(). */ 923 + irqt = bpf_per_cpu_ptr(&cpu_irqtime, cpu); 924 + return irqt ? BPF_CORE_READ(irqt, total) : 0; 925 + } 873 926 874 927 #include "compat.bpf.h" 875 928 #include "enums.bpf.h"

Configure Feed

Configure Feed