Merge branch 'bpf-relax-8-frame-limitation-for-global-subprogs'

Emil Tsalapatis says:

====================
bpf: Relax 8 frame limitation for global subprogs

The BPF verifier currently limits the maximum runtime call stack to
8 frames. Larger BPF programs like sched-ext schedulers routinely
fail verification because they exceed this limit, even though they use
very little actual stack space for each frame.

Relax the verifier to permit call stacks > 8 frames deep when the
call stacks include global subprogs. The old 8 stack frame limit now
only applies to runs of consecutive static function calls in a call stack.
This works because global functions are each verified in isolation, so
the verifier does not need to cross-reference verification state across
the function call boundary, which has been the reason for limiting the
call stack size in the first place.

This patch does not change the verification-time limit of 8 stack
frames. Static functions that are inlined for verification purposes
still only go 8 frames deep to avoid changing the verifier's internal
data structures used for verification. These data structures only
support holding information on up to 8 stack frames.

This patch also does not adjust the actual maximum stack size of 512 bytes.

CHANGELOG
=========

v5 -> v6 (https://lore.kernel.org/bpf/20260311182831.91219-1-emil@etsalapatis.com/)
- Make bpf_subprog_call_depth_info internal to verifier.c (Alexei)

v4 -> v5 (https://lore.kernel.org/bpf/20260309204430.201219-1-emil@etsalapatis.com/)
- Move depth tracking state to verifier (Eduard) and free it after verification (Alexei)
- Fix selftest patch title and formatting errors (Yonghong)

v3 -> v4 (https://lore.kernel.org/bpf/20260303043106.406099-1-emil@etsalapatis.com/)
- Factor out temp call depth tracking info into its own struct (Eduard)
- Bring depth calculation loop in line with the other instances (Mykyta)
- Add comment on why selftest call stack is 16 bytes/frame (Eduard)
- Rename "cidx" to "caller" for clarity (Mykyta, Eduard)

v2 -> v3 (https://lore.kernel.org/bpf/20260210213606.475415-1-emil@etsalapatis.com/)
- Change logic to remove arbitrary limit on call depth (Eduard)
- Add additional selftests (Eduard)

v1 -> v2 (https://lore.kernel.org/bpf/20260202233716.835638-1-emil@etsalapatis.com)
- Adjust patch to only increase the runtime stack depth, leaving the
  verification-time stack depth unchanged (Alexei)

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
====================

Link: https://patch.msgid.link/20260316161225.128011-1-emil@etsalapatis.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Alexei Starovoitov 2 months ago 6c8e1a9e 202e42e4

+160 -31

4 changed files

expand all

kernel

bpf

verifier.c

tools

testing

selftests

bpf

prog_tests

test_global_funcs.c

progs

test_global_func3.c

test_global_func_deep_stack.c

+54 -22

kernel/bpf/verifier.c

··· 6724 6724 return round_up(max_t(u32, stack_depth, 1), 32); 6725 6725 } 6726 6726 6727 + /* temporary state used for call frame depth calculation */ 6728 + struct bpf_subprog_call_depth_info { 6729 + int ret_insn; /* caller instruction where we return to. */ 6730 + int caller; /* caller subprogram idx */ 6731 + int frame; /* # of consecutive static call stack frames on top of stack */ 6732 + }; 6733 + 6727 6734 /* starting from main bpf function walk all instructions of the function 6728 6735 * and recursively walk all callees that given function can call. 6729 6736 * Ignore jump and exit insns. 6730 - * Since recursion is prevented by check_cfg() this algorithm 6731 - * only needs a local stack of MAX_CALL_FRAMES to remember callsites 6732 6737 */ 6733 6738 static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, 6739 + struct bpf_subprog_call_depth_info *dinfo, 6734 6740 bool priv_stack_supported) 6735 6741 { 6736 6742 struct bpf_subprog_info *subprog = env->subprog_info; 6737 6743 struct bpf_insn *insn = env->prog->insnsi; 6738 6744 int depth = 0, frame = 0, i, subprog_end, subprog_depth; 6739 6745 bool tail_call_reachable = false; 6740 - int ret_insn[MAX_CALL_FRAMES]; 6741 - int ret_prog[MAX_CALL_FRAMES]; 6742 - int j; 6746 + int total; 6747 + int tmp; 6748 + 6749 + /* no caller idx */ 6750 + dinfo[idx].caller = -1; 6743 6751 6744 6752 i = subprog[idx].start; 6745 6753 if (!priv_stack_supported) ··· 6799 6791 } else { 6800 6792 depth += subprog_depth; 6801 6793 if (depth > MAX_BPF_STACK) { 6794 + total = 0; 6795 + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) 6796 + total++; 6797 + 6802 6798 verbose(env, "combined stack size of %d calls is %d. 
Too large\n", 6803 - frame + 1, depth); 6799 + total, depth); 6804 6800 return -EACCES; 6805 6801 } 6806 6802 } ··· 6818 6806 6819 6807 if (!is_bpf_throw_kfunc(insn + i)) 6820 6808 continue; 6821 - if (subprog[idx].is_cb) 6822 - err = true; 6823 - for (int c = 0; c < frame && !err; c++) { 6824 - if (subprog[ret_prog[c]].is_cb) { 6809 + for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { 6810 + if (subprog[tmp].is_cb) { 6825 6811 err = true; 6826 6812 break; 6827 6813 } ··· 6835 6825 if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) 6836 6826 continue; 6837 6827 /* remember insn and function to return to */ 6838 - ret_insn[frame] = i + 1; 6839 - ret_prog[frame] = idx; 6840 6828 6841 6829 /* find the callee */ 6842 6830 next_insn = i + insn[i].imm + 1; ··· 6854 6846 return -EINVAL; 6855 6847 } 6856 6848 } 6849 + 6850 + /* store caller info for after we return from callee */ 6851 + dinfo[idx].frame = frame; 6852 + dinfo[idx].ret_insn = i + 1; 6853 + 6854 + /* push caller idx into callee's dinfo */ 6855 + dinfo[sidx].caller = idx; 6856 + 6857 6857 i = next_insn; 6858 + 6858 6859 idx = sidx; 6859 6860 if (!priv_stack_supported) 6860 6861 subprog[idx].priv_stack_mode = NO_PRIV_STACK; ··· 6871 6854 if (subprog[idx].has_tail_call) 6872 6855 tail_call_reachable = true; 6873 6856 6874 - frame++; 6857 + frame = subprog_is_global(env, idx) ? 
0 : frame + 1; 6875 6858 if (frame >= MAX_CALL_FRAMES) { 6876 6859 verbose(env, "the call stack of %d frames is too deep !\n", 6877 6860 frame); ··· 6885 6868 * tail call counter throughout bpf2bpf calls combined with tailcalls 6886 6869 */ 6887 6870 if (tail_call_reachable) 6888 - for (j = 0; j < frame; j++) { 6889 - if (subprog[ret_prog[j]].is_exception_cb) { 6871 + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { 6872 + if (subprog[tmp].is_exception_cb) { 6890 6873 verbose(env, "cannot tail call within exception cb\n"); 6891 6874 return -EINVAL; 6892 6875 } 6893 - subprog[ret_prog[j]].tail_call_reachable = true; 6876 + subprog[tmp].tail_call_reachable = true; 6894 6877 } 6895 6878 if (subprog[0].tail_call_reachable) 6896 6879 env->prog->aux->tail_call_reachable = true; ··· 6898 6881 /* end of for() loop means the last insn of the 'subprog' 6899 6882 * was reached. Doesn't matter whether it was JA or EXIT 6900 6883 */ 6901 - if (frame == 0) 6884 + if (frame == 0 && dinfo[idx].caller < 0) 6902 6885 return 0; 6903 6886 if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) 6904 6887 depth -= round_up_stack_depth(env, subprog[idx].stack_depth); 6905 - frame--; 6906 - i = ret_insn[frame]; 6907 - idx = ret_prog[frame]; 6888 + 6889 + /* pop caller idx from callee */ 6890 + idx = dinfo[idx].caller; 6891 + 6892 + /* retrieve caller state from its frame */ 6893 + frame = dinfo[idx].frame; 6894 + i = dinfo[idx].ret_insn; 6895 + 6908 6896 goto continue_func; 6909 6897 } 6910 6898 6911 6899 static int check_max_stack_depth(struct bpf_verifier_env *env) 6912 6900 { 6913 6901 enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; 6902 + struct bpf_subprog_call_depth_info *dinfo; 6914 6903 struct bpf_subprog_info *si = env->subprog_info; 6915 6904 bool priv_stack_supported; 6916 6905 int ret; 6906 + 6907 + dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT); 6908 + if (!dinfo) 6909 + return -ENOMEM; 6917 6910 6918 6911 for (int i = 0; i < 
env->subprog_cnt; i++) { 6919 6912 if (si[i].has_tail_call) { ··· 6946 6919 for (int i = env->subprog_cnt - 1; i >= 0; i--) { 6947 6920 if (!i || si[i].is_async_cb) { 6948 6921 priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; 6949 - ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); 6950 - if (ret < 0) 6922 + ret = check_max_stack_depth_subprog(env, i, dinfo, 6923 + priv_stack_supported); 6924 + if (ret < 0) { 6925 + kvfree(dinfo); 6951 6926 return ret; 6927 + } 6952 6928 } 6953 6929 } 6954 6930 ··· 6961 6931 break; 6962 6932 } 6963 6933 } 6934 + 6935 + kvfree(dinfo); 6964 6936 6965 6937 return 0; 6966 6938 }

tools/testing/selftests/bpf/prog_tests/test_global_funcs.c

··· 18 18 #include "test_global_func15.skel.h" 19 19 #include "test_global_func16.skel.h" 20 20 #include "test_global_func17.skel.h" 21 + #include "test_global_func_deep_stack.skel.h" 21 22 #include "test_global_func_ctx_args.skel.h" 22 23 23 24 #include "bpf/libbpf_internal.h" ··· 156 155 RUN_TESTS(test_global_func15); 157 156 RUN_TESTS(test_global_func16); 158 157 RUN_TESTS(test_global_func17); 158 + RUN_TESTS(test_global_func_deep_stack); 159 159 RUN_TESTS(test_global_func_ctx_args); 160 160 161 161 if (test__start_subtest("ctx_arg_rewrite"))

+9 -9

tools/testing/selftests/bpf/progs/test_global_func3.c

··· 5 5 #include <bpf/bpf_helpers.h> 6 6 #include "bpf_misc.h" 7 7 8 - __attribute__ ((noinline)) 8 + static __attribute__ ((noinline)) 9 9 int f1(struct __sk_buff *skb) 10 10 { 11 11 return skb->len; 12 12 } 13 13 14 - __attribute__ ((noinline)) 14 + static __attribute__ ((noinline)) 15 15 int f2(int val, struct __sk_buff *skb) 16 16 { 17 17 return f1(skb) + val; 18 18 } 19 19 20 - __attribute__ ((noinline)) 20 + static __attribute__ ((noinline)) 21 21 int f3(int val, struct __sk_buff *skb, int var) 22 22 { 23 23 return f2(var, skb) + val; 24 24 } 25 25 26 - __attribute__ ((noinline)) 26 + static __attribute__ ((noinline)) 27 27 int f4(struct __sk_buff *skb) 28 28 { 29 29 return f3(1, skb, 2); 30 30 } 31 31 32 - __attribute__ ((noinline)) 32 + static __attribute__ ((noinline)) 33 33 int f5(struct __sk_buff *skb) 34 34 { 35 35 return f4(skb); 36 36 } 37 37 38 - __attribute__ ((noinline)) 38 + static __attribute__ ((noinline)) 39 39 int f6(struct __sk_buff *skb) 40 40 { 41 41 return f5(skb); 42 42 } 43 43 44 - __attribute__ ((noinline)) 44 + static __attribute__ ((noinline)) 45 45 int f7(struct __sk_buff *skb) 46 46 { 47 47 return f6(skb); 48 48 } 49 49 50 - __attribute__ ((noinline)) 50 + static __attribute__ ((noinline)) 51 51 int f8(struct __sk_buff *skb) 52 52 { 53 53 return f7(skb); 54 54 } 55 55 56 56 SEC("tc") 57 - __failure __msg("the call stack of 8 frames") 57 + __failure __msg("the call stack of 9 frames") 58 58 int global_func3(struct __sk_buff *skb) 59 59 { 60 60 return f8(skb);

+95

tools/testing/selftests/bpf/progs/test_global_func_deep_stack.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2026 Meta Platforms, Inc and affiliates. */ 3 + #include <linux/bpf.h> 4 + #include <bpf/bpf_helpers.h> 5 + #include "bpf_misc.h" 6 + 7 + /* 8 + * Macro tricks to tersely define for long non-recursive call chains. Add 9 + * computation to the functions prevent tail recursion from reducing the 10 + * stack size to 0. 11 + */ 12 + 13 + #define CAT(a, b) a ## b 14 + #define XCAT(a, b) CAT(a, b) 15 + 16 + #define F_0 \ 17 + __attribute__((noinline)) \ 18 + int f0(unsigned long a) \ 19 + { \ 20 + volatile long b = a + 16; \ 21 + if (a == 0) \ 22 + return 0; \ 23 + return b; \ 24 + } 25 + 26 + #define FN(n, prev) \ 27 + __attribute__((noinline)) \ 28 + int XCAT(f, n)(unsigned long a) \ 29 + { \ 30 + volatile long b = XCAT(f, prev)(a - 1); \ 31 + if (!b) \ 32 + return 0; \ 33 + return b + 1; \ 34 + } 35 + 36 + /* Call chain 33 levels deep. */ 37 + #define F_1 F_0 FN(1, 0) 38 + #define F_2 F_1 FN(2, 1) 39 + #define F_3 F_2 FN(3, 2) 40 + #define F_4 F_3 FN(4, 3) 41 + #define F_5 F_4 FN(5, 4) 42 + #define F_6 F_5 FN(6, 5) 43 + #define F_7 F_6 FN(7, 6) 44 + #define F_8 F_7 FN(8, 7) 45 + #define F_9 F_8 FN(9, 8) 46 + #define F_10 F_9 FN(10, 9) 47 + #define F_11 F_10 FN(11, 10) 48 + #define F_12 F_11 FN(12, 11) 49 + #define F_13 F_12 FN(13, 12) 50 + #define F_14 F_13 FN(14, 13) 51 + #define F_15 F_14 FN(15, 14) 52 + #define F_16 F_15 FN(16, 15) 53 + #define F_17 F_16 FN(17, 16) 54 + #define F_18 F_17 FN(18, 17) 55 + #define F_19 F_18 FN(19, 18) 56 + #define F_20 F_19 FN(20, 19) 57 + #define F_21 F_20 FN(21, 20) 58 + #define F_22 F_21 FN(22, 21) 59 + #define F_23 F_22 FN(23, 22) 60 + #define F_24 F_23 FN(24, 23) 61 + #define F_25 F_24 FN(25, 24) 62 + #define F_26 F_25 FN(26, 25) 63 + #define F_27 F_26 FN(27, 26) 64 + #define F_28 F_27 FN(28, 27) 65 + #define F_29 F_28 FN(29, 28) 66 + #define F_30 F_29 FN(30, 29) 67 + #define F_31 F_30 FN(31, 30) 68 + #define F_32 F_31 FN(32, 31) 69 + 70 + #define CAT2(a, b) a 
## b 71 + #define XCAT2(a, b) CAT2(a, b) 72 + 73 + #define F(n) XCAT2(F_, n) 74 + 75 + F(32) 76 + 77 + /* Ensure that even 32 levels deep, the function verifies. */ 78 + SEC("syscall") 79 + __success 80 + int global_func_deep_stack_success(struct __sk_buff *skb) 81 + { 82 + return f31(55); 83 + } 84 + 85 + /* 86 + * Check we actually honor stack limits (33 * 16 = 528 > 512 = MAX_STACK_DEPTH). 87 + * The stack depth is 16 because the verifier calls round_up_stack_depth() on 88 + * the size. 89 + */ 90 + SEC("syscall") 91 + __failure __msg("combined stack size of 34 calls") 92 + int global_func_deep_stack_fail(struct __sk_buff *skb) 93 + { 94 + return f32(123); 95 + }

Configure Feed

Configure Feed