Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'bpf-inline-helpers-in-arm64-and-riscv-jits'

Puranjay Mohan says:

====================
bpf: Inline helpers in arm64 and riscv JITs

Changes in v5 -> v6:
arm64 v5: https://lore.kernel.org/all/20240430234739.79185-1-puranjay@kernel.org/
riscv v2: https://lore.kernel.org/all/20240430175834.33152-1-puranjay@kernel.org/
- Combine riscv and arm64 changes in a single series
- Some coding style fixes

Changes in v4 -> v5:
v4: https://lore.kernel.org/all/20240429131647.50165-1-puranjay@kernel.org/
- Implement the inlining of the bpf_get_smp_processor_id() in the JIT.

NOTE: This needs to be based on:
https://lore.kernel.org/all/20240430175834.33152-1-puranjay@kernel.org/
to be built.

Manual run of bpf-ci with this series rebased on above:
https://github.com/kernel-patches/bpf/pull/6929

Changes in v3 -> v4:
v3: https://lore.kernel.org/all/20240426121349.97651-1-puranjay@kernel.org/
- Fix coding style issue related to C89 standards.

Changes in v2 -> v3:
v2: https://lore.kernel.org/all/20240424173550.16359-1-puranjay@kernel.org/
- Fixed the xlated dump of percpu mov to "r0 = &(void __percpu *)(r0)"
- Made ARM64 and x86-64 use the same code for inlining. The only difference
that remains is the per-cpu address of the cpu_number.

Changes in v1 -> v2:
v1: https://lore.kernel.org/all/20240405091707.66675-1-puranjay12@gmail.com/
- Add a patch to inline bpf_get_smp_processor_id()
- Fix an issue in MRS instruction encoding as pointed out by Will
- Remove CONFIG_SMP check because arm64 kernel always compiles with CONFIG_SMP

This series adds support for internal-only per-CPU instructions and inlines
the bpf_get_smp_processor_id() helper call in the ARM64 and RISC-V BPF JITs.

Here is an example of calls to bpf_get_smp_processor_id() and
percpu_array_map_lookup_elem() before and after this series on ARM64.

BPF
=====
              BEFORE                                          AFTER
             --------                                        -------

int cpu = bpf_get_smp_processor_id();           int cpu = bpf_get_smp_processor_id();
(85) call bpf_get_smp_processor_id#229032       (85) call bpf_get_smp_processor_id#8

p = bpf_map_lookup_elem(map, &zero);            p = bpf_map_lookup_elem(map, &zero);
(18) r1 = map[id:78]                            (18) r1 = map[id:153]
(18) r2 = map[id:82][0]+65536                   (18) r2 = map[id:157][0]+65536
(85) call percpu_array_map_lookup_elem#313512   (07) r1 += 496
                                                (61) r0 = *(u32 *)(r2 +0)
                                                (35) if r0 >= 0x1 goto pc+5
                                                (67) r0 <<= 3
                                                (0f) r0 += r1
                                                (79) r0 = *(u64 *)(r0 +0)
                                                (bf) r0 = &(void __percpu *)(r0)
                                                (05) goto pc+1
                                                (b7) r0 = 0

ARM64 JIT
===========

          BEFORE                                  AFTER
         --------                                -------

int cpu = bpf_get_smp_processor_id();   int cpu = bpf_get_smp_processor_id();
mov x10, #0xfffffffffffff4d0            mrs x10, sp_el0
movk x10, #0x802b, lsl #16              ldr w7, [x10, #24]
movk x10, #0x8000, lsl #32
blr x10
add x7, x0, #0x0

p = bpf_map_lookup_elem(map, &zero);    p = bpf_map_lookup_elem(map, &zero);
mov x0, #0xffff0003ffffffff             mov x0, #0xffff0003ffffffff
movk x0, #0xce5c, lsl #16               movk x0, #0xe0f3, lsl #16
movk x0, #0xca00                        movk x0, #0x7c00
mov x1, #0xffff8000ffffffff             mov x1, #0xffff8000ffffffff
movk x1, #0x8bdb, lsl #16               movk x1, #0xb0c7, lsl #16
movk x1, #0x6000                        movk x1, #0xe000
mov x10, #0xffffffffffff3ed0            add x0, x0, #0x1f0
movk x10, #0x802d, lsl #16              ldr w7, [x1]
movk x10, #0x8000, lsl #32              cmp x7, #0x1
blr x10                                 b.cs 0x0000000000000090
add x7, x0, #0x0                        lsl x7, x7, #3
                                        add x7, x7, x0
                                        ldr x7, [x7]
                                        mrs x10, tpidr_el1
                                        add x7, x7, x10
                                        b 0x0000000000000094
                                        mov x7, #0x0

Performance improvement found using benchmark[1]

./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc

+---------------+-------------------+-------------------+--------------+
| Name | Before | After | % change |
|---------------+-------------------+-------------------+--------------|
| glob-arr-inc | 23.380 ± 1.675M/s | 25.893 ± 0.026M/s | + 10.74% |
| arr-inc | 23.928 ± 0.034M/s | 25.213 ± 0.063M/s | + 5.37% |
| hash-inc | 12.352 ± 0.005M/s | 12.609 ± 0.013M/s | + 2.08% |
+---------------+-------------------+-------------------+--------------+

[1] https://github.com/anakryiko/linux/commit/8dec900975ef

RISCV64 JIT output for `call bpf_get_smp_processor_id`
=======================================================

    Before                  After
   --------                -------

auipc t1,0x848c         ld a5,32(tp)
jalr 604(t1)
mv a5,a0

Benchmark using [1] on Qemu.

./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc

+---------------+------------------+------------------+--------------+
| Name | Before | After | % change |
|---------------+------------------+------------------+--------------|
| glob-arr-inc | 1.077 ± 0.006M/s | 1.336 ± 0.010M/s | + 24.04% |
| arr-inc | 1.078 ± 0.002M/s | 1.332 ± 0.015M/s | + 23.56% |
| hash-inc | 0.494 ± 0.004M/s | 0.653 ± 0.001M/s | + 32.18% |
+---------------+------------------+------------------+--------------+
====================

Link: https://lore.kernel.org/r/20240502151854.9810-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+132
+8
arch/arm64/include/asm/insn.h
··· 135 135 AARCH64_INSN_SPCLREG_SP_EL2 = 0xF210 136 136 }; 137 137 138 + enum aarch64_insn_system_register { 139 + AARCH64_INSN_SYSREG_TPIDR_EL1 = 0x4684, 140 + AARCH64_INSN_SYSREG_TPIDR_EL2 = 0x6682, 141 + AARCH64_INSN_SYSREG_SP_EL0 = 0x4208, 142 + }; 143 + 138 144 enum aarch64_insn_variant { 139 145 AARCH64_INSN_VARIANT_32BIT, 140 146 AARCH64_INSN_VARIANT_64BIT ··· 692 686 } 693 687 #endif 694 688 u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); 689 + u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, 690 + enum aarch64_insn_system_register sysreg); 695 691 696 692 s32 aarch64_get_branch_offset(u32 insn); 697 693 u32 aarch64_set_branch_offset(u32 insn, s32 offset);
+11
arch/arm64/lib/insn.c
··· 1515 1515 1516 1516 return insn; 1517 1517 } 1518 + 1519 + u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result, 1520 + enum aarch64_insn_system_register sysreg) 1521 + { 1522 + u32 insn = aarch64_insn_get_mrs_value(); 1523 + 1524 + insn &= ~GENMASK(19, 0); 1525 + insn |= sysreg << 5; 1526 + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, 1527 + insn, result); 1528 + }
+8
arch/arm64/net/bpf_jit.h
··· 297 297 #define A64_ADR(Rd, offset) \ 298 298 aarch64_insn_gen_adr(0, offset, Rd, AARCH64_INSN_ADR_TYPE_ADR) 299 299 300 + /* MRS */ 301 + #define A64_MRS_TPIDR_EL1(Rt) \ 302 + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL1) 303 + #define A64_MRS_TPIDR_EL2(Rt) \ 304 + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_TPIDR_EL2) 305 + #define A64_MRS_SP_EL0(Rt) \ 306 + aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0) 307 + 300 308 #endif /* _BPF_JIT_H */
+39
arch/arm64/net/bpf_jit_comp.c
··· 890 890 emit(A64_ORR(1, tmp, dst, tmp), ctx); 891 891 emit(A64_MOV(1, dst, tmp), ctx); 892 892 break; 893 + } else if (insn_is_mov_percpu_addr(insn)) { 894 + if (dst != src) 895 + emit(A64_MOV(1, dst, src), ctx); 896 + if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN)) 897 + emit(A64_MRS_TPIDR_EL2(tmp), ctx); 898 + else 899 + emit(A64_MRS_TPIDR_EL1(tmp), ctx); 900 + emit(A64_ADD(1, dst, dst, tmp), ctx); 901 + break; 893 902 } 894 903 switch (insn->off) { 895 904 case 0: ··· 1228 1219 const u8 r0 = bpf2a64[BPF_REG_0]; 1229 1220 bool func_addr_fixed; 1230 1221 u64 func_addr; 1222 + u32 cpu_offset; 1223 + 1224 + /* Implement helper call to bpf_get_smp_processor_id() inline */ 1225 + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { 1226 + cpu_offset = offsetof(struct thread_info, cpu); 1227 + 1228 + emit(A64_MRS_SP_EL0(tmp), ctx); 1229 + if (is_lsi_offset(cpu_offset, 2)) { 1230 + emit(A64_LDR32I(r0, tmp, cpu_offset), ctx); 1231 + } else { 1232 + emit_a64_mov_i(1, tmp2, cpu_offset, ctx); 1233 + emit(A64_LDR32(r0, tmp, tmp2), ctx); 1234 + } 1235 + break; 1236 + } 1231 1237 1232 1238 ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, 1233 1239 &func_addr, &func_addr_fixed); ··· 2581 2557 return false; 2582 2558 } 2583 2559 return true; 2560 + } 2561 + 2562 + bool bpf_jit_supports_percpu_insn(void) 2563 + { 2564 + return true; 2565 + } 2566 + 2567 + bool bpf_jit_inlines_helper_call(s32 imm) 2568 + { 2569 + switch (imm) { 2570 + case BPF_FUNC_get_smp_processor_id: 2571 + return true; 2572 + default: 2573 + return false; 2574 + } 2584 2575 } 2585 2576 2586 2577 void bpf_jit_free(struct bpf_prog *prog)
+50
arch/riscv/net/bpf_jit_comp64.c
··· 12 12 #include <linux/stop_machine.h> 13 13 #include <asm/patch.h> 14 14 #include <asm/cfi.h> 15 + #include <asm/percpu.h> 15 16 #include "bpf_jit.h" 16 17 17 18 #define RV_FENTRY_NINSNS 2 ··· 1090 1089 emit_or(RV_REG_T1, rd, RV_REG_T1, ctx); 1091 1090 emit_mv(rd, RV_REG_T1, ctx); 1092 1091 break; 1092 + } else if (insn_is_mov_percpu_addr(insn)) { 1093 + if (rd != rs) 1094 + emit_mv(rd, rs, ctx); 1095 + #ifdef CONFIG_SMP 1096 + /* Load current CPU number in T1 */ 1097 + emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu), 1098 + RV_REG_TP, ctx); 1099 + /* << 3 because offsets are 8 bytes */ 1100 + emit_slli(RV_REG_T1, RV_REG_T1, 3, ctx); 1101 + /* Load address of __per_cpu_offset array in T2 */ 1102 + emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx); 1103 + /* Add offset of current CPU to __per_cpu_offset */ 1104 + emit_add(RV_REG_T1, RV_REG_T2, RV_REG_T1, ctx); 1105 + /* Load __per_cpu_offset[cpu] in T1 */ 1106 + emit_ld(RV_REG_T1, 0, RV_REG_T1, ctx); 1107 + /* Add the offset to Rd */ 1108 + emit_add(rd, rd, RV_REG_T1, ctx); 1109 + #endif 1093 1110 } 1094 1111 if (imm == 1) { 1095 1112 /* Special mov32 for zext */ ··· 1492 1473 { 1493 1474 bool fixed_addr; 1494 1475 u64 addr; 1476 + 1477 + /* Inline calls to bpf_get_smp_processor_id() 1478 + * 1479 + * RV_REG_TP holds the address of the current CPU's task_struct and thread_info is 1480 + * at offset 0 in task_struct. 
1481 + * Load cpu from thread_info: 1482 + * Set R0 to ((struct thread_info *)(RV_REG_TP))->cpu 1483 + * 1484 + * This replicates the implementation of raw_smp_processor_id() on RISCV 1485 + */ 1486 + if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) { 1487 + /* Load current CPU number in R0 */ 1488 + emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu), 1489 + RV_REG_TP, ctx); 1490 + break; 1491 + } 1495 1492 1496 1493 mark_call(ctx); 1497 1494 ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, ··· 2072 2037 bool bpf_jit_supports_arena(void) 2073 2038 { 2074 2039 return true; 2040 + } 2041 + 2042 + bool bpf_jit_supports_percpu_insn(void) 2043 + { 2044 + return true; 2045 + } 2046 + 2047 + bool bpf_jit_inlines_helper_call(s32 imm) 2048 + { 2049 + switch (imm) { 2050 + case BPF_FUNC_get_smp_processor_id: 2051 + return true; 2052 + default: 2053 + return false; 2054 + } 2075 2055 }
+1
include/linux/filter.h
··· 993 993 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); 994 994 void bpf_jit_compile(struct bpf_prog *prog); 995 995 bool bpf_jit_needs_zext(void); 996 + bool bpf_jit_inlines_helper_call(s32 imm); 996 997 bool bpf_jit_supports_subprog_tailcalls(void); 997 998 bool bpf_jit_supports_percpu_insn(void); 998 999 bool bpf_jit_supports_kfunc_call(void);
+11
kernel/bpf/core.c
··· 2941 2941 return false; 2942 2942 } 2943 2943 2944 + /* Return true if the JIT inlines the call to the helper corresponding to 2945 + * the imm. 2946 + * 2947 + * The verifier will not patch the insn->imm for the call to the helper if 2948 + * this returns true. 2949 + */ 2950 + bool __weak bpf_jit_inlines_helper_call(s32 imm) 2951 + { 2952 + return false; 2953 + } 2954 + 2944 2955 /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ 2945 2956 bool __weak bpf_jit_supports_subprog_tailcalls(void) 2946 2957 {
+4
kernel/bpf/verifier.c
··· 19996 19996 goto next_insn; 19997 19997 } 19998 19998 19999 + /* Skip inlining the helper call if the JIT does it. */ 20000 + if (bpf_jit_inlines_helper_call(insn->imm)) 20001 + goto next_insn; 20002 + 19999 20003 if (insn->imm == BPF_FUNC_get_route_realm) 20000 20004 prog->dst_needed = 1; 20001 20005 if (insn->imm == BPF_FUNC_get_prandom_u32)