Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

xor: avoid indirect calls for arm64-optimized ops

Remove the inner xor_block_templates, and instead have two separate actual
templates that call into the neon-enabled compilation unit.

Link: https://lkml.kernel.org/r/20260327061704.3707577-21-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Sterba <dsterba@suse.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason A. Donenfeld <jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Li Nan <linan122@huawei.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Christoph Hellwig and committed by Andrew Morton
352ebd06 77fd47e5

+114 -97
+8 -5
arch/arm64/include/asm/xor.h
··· 7 7 #include <asm-generic/xor.h> 8 8 #include <asm/simd.h> 9 9 10 - extern struct xor_block_template xor_block_arm64; 11 - void __init xor_neon_init(void); 10 + extern struct xor_block_template xor_block_neon; 11 + extern struct xor_block_template xor_block_eor3; 12 12 13 13 #define arch_xor_init arch_xor_init 14 14 static __always_inline void __init arch_xor_init(void) 15 15 { 16 - xor_neon_init(); 17 16 xor_register(&xor_block_8regs); 18 17 xor_register(&xor_block_32regs); 19 - if (cpu_has_neon()) 20 - xor_register(&xor_block_arm64); 18 + if (cpu_has_neon()) { 19 + if (cpu_have_named_feature(SHA3)) 20 + xor_register(&xor_block_eor3); 21 + else 22 + xor_register(&xor_block_neon); 23 + } 21 24 }
+49 -46
lib/raid/xor/arm64/xor-neon-glue.c
··· 7 7 #include <linux/raid/xor_impl.h> 8 8 #include <asm/simd.h> 9 9 #include <asm/xor.h> 10 + #include "xor-neon.h" 10 11 11 - extern struct xor_block_template const xor_block_inner_neon; 12 - 13 - static void 14 - xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, 15 - const unsigned long * __restrict p2) 16 - { 17 - scoped_ksimd() 18 - xor_block_inner_neon.do_2(bytes, p1, p2); 19 - } 20 - 21 - static void 22 - xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, 23 - const unsigned long * __restrict p2, 24 - const unsigned long * __restrict p3) 25 - { 26 - scoped_ksimd() 27 - xor_block_inner_neon.do_3(bytes, p1, p2, p3); 28 - } 29 - 30 - static void 31 - xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, 32 - const unsigned long * __restrict p2, 33 - const unsigned long * __restrict p3, 34 - const unsigned long * __restrict p4) 35 - { 36 - scoped_ksimd() 37 - xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); 38 - } 39 - 40 - static void 41 - xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, 42 - const unsigned long * __restrict p2, 43 - const unsigned long * __restrict p3, 44 - const unsigned long * __restrict p4, 45 - const unsigned long * __restrict p5) 46 - { 47 - scoped_ksimd() 48 - xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); 49 - } 50 - 51 - struct xor_block_template xor_block_arm64 = { 52 - .name = "arm64_neon", 53 - .do_2 = xor_neon_2, 54 - .do_3 = xor_neon_3, 55 - .do_4 = xor_neon_4, 56 - .do_5 = xor_neon_5 12 + #define XOR_TEMPLATE(_name) \ 13 + static void \ 14 + xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1, \ 15 + const unsigned long * __restrict p2) \ 16 + { \ 17 + scoped_ksimd() \ 18 + __xor_##_name##_2(bytes, p1, p2); \ 19 + } \ 20 + \ 21 + static void \ 22 + xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1, \ 23 + const unsigned long * __restrict p2, \ 24 + const unsigned long * __restrict p3) \ 25 + { \ 26 + scoped_ksimd() \ 27 + 
__xor_##_name##_3(bytes, p1, p2, p3); \ 28 + } \ 29 + \ 30 + static void \ 31 + xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1, \ 32 + const unsigned long * __restrict p2, \ 33 + const unsigned long * __restrict p3, \ 34 + const unsigned long * __restrict p4) \ 35 + { \ 36 + scoped_ksimd() \ 37 + __xor_##_name##_4(bytes, p1, p2, p3, p4); \ 38 + } \ 39 + \ 40 + static void \ 41 + xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1, \ 42 + const unsigned long * __restrict p2, \ 43 + const unsigned long * __restrict p3, \ 44 + const unsigned long * __restrict p4, \ 45 + const unsigned long * __restrict p5) \ 46 + { \ 47 + scoped_ksimd() \ 48 + __xor_##_name##_5(bytes, p1, p2, p3, p4, p5); \ 49 + } \ 50 + \ 51 + struct xor_block_template xor_block_##_name = { \ 52 + .name = __stringify(_name), \ 53 + .do_2 = xor_##_name##_2, \ 54 + .do_3 = xor_##_name##_3, \ 55 + .do_4 = xor_##_name##_4, \ 56 + .do_5 = xor_##_name##_5 \ 57 57 }; 58 + 59 + XOR_TEMPLATE(neon); 60 + XOR_TEMPLATE(eor3);
+27 -46
lib/raid/xor/arm64/xor-neon.c
··· 8 8 #include <linux/cache.h> 9 9 #include <asm/neon-intrinsics.h> 10 10 #include <asm/xor.h> 11 + #include "xor-neon.h" 11 12 12 - static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1, 13 - const unsigned long * __restrict p2) 13 + void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, 14 + const unsigned long * __restrict p2) 14 15 { 15 16 uint64_t *dp1 = (uint64_t *)p1; 16 17 uint64_t *dp2 = (uint64_t *)p2; ··· 37 36 } while (--lines > 0); 38 37 } 39 38 40 - static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1, 41 - const unsigned long * __restrict p2, 42 - const unsigned long * __restrict p3) 39 + void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, 40 + const unsigned long * __restrict p2, 41 + const unsigned long * __restrict p3) 43 42 { 44 43 uint64_t *dp1 = (uint64_t *)p1; 45 44 uint64_t *dp2 = (uint64_t *)p2; ··· 73 72 } while (--lines > 0); 74 73 } 75 74 76 - static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1, 77 - const unsigned long * __restrict p2, 78 - const unsigned long * __restrict p3, 79 - const unsigned long * __restrict p4) 75 + void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, 76 + const unsigned long * __restrict p2, 77 + const unsigned long * __restrict p3, 78 + const unsigned long * __restrict p4) 80 79 { 81 80 uint64_t *dp1 = (uint64_t *)p1; 82 81 uint64_t *dp2 = (uint64_t *)p2; ··· 118 117 } while (--lines > 0); 119 118 } 120 119 121 - static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1, 122 - const unsigned long * __restrict p2, 123 - const unsigned long * __restrict p3, 124 - const unsigned long * __restrict p4, 125 - const unsigned long * __restrict p5) 120 + void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, 121 + const unsigned long * __restrict p2, 122 + const unsigned long * __restrict p3, 123 + const unsigned long * __restrict p4, 124 + const 
unsigned long * __restrict p5) 126 125 { 127 126 uint64_t *dp1 = (uint64_t *)p1; 128 127 uint64_t *dp2 = (uint64_t *)p2; ··· 172 171 } while (--lines > 0); 173 172 } 174 173 175 - struct xor_block_template xor_block_inner_neon __ro_after_init = { 176 - .name = "__inner_neon__", 177 - .do_2 = xor_arm64_neon_2, 178 - .do_3 = xor_arm64_neon_3, 179 - .do_4 = xor_arm64_neon_4, 180 - .do_5 = xor_arm64_neon_5, 181 - }; 182 - 183 174 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) 184 175 { 185 176 uint64x2_t res; ··· 182 189 return res; 183 190 } 184 191 185 - static void xor_arm64_eor3_3(unsigned long bytes, 186 - unsigned long * __restrict p1, 187 - const unsigned long * __restrict p2, 188 - const unsigned long * __restrict p3) 192 + void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, 193 + const unsigned long * __restrict p2, 194 + const unsigned long * __restrict p3) 189 195 { 190 196 uint64_t *dp1 = (uint64_t *)p1; 191 197 uint64_t *dp2 = (uint64_t *)p2; ··· 216 224 } while (--lines > 0); 217 225 } 218 226 219 - static void xor_arm64_eor3_4(unsigned long bytes, 220 - unsigned long * __restrict p1, 221 - const unsigned long * __restrict p2, 222 - const unsigned long * __restrict p3, 223 - const unsigned long * __restrict p4) 227 + void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, 228 + const unsigned long * __restrict p2, 229 + const unsigned long * __restrict p3, 230 + const unsigned long * __restrict p4) 224 231 { 225 232 uint64_t *dp1 = (uint64_t *)p1; 226 233 uint64_t *dp2 = (uint64_t *)p2; ··· 259 268 } while (--lines > 0); 260 269 } 261 270 262 - static void xor_arm64_eor3_5(unsigned long bytes, 263 - unsigned long * __restrict p1, 264 - const unsigned long * __restrict p2, 265 - const unsigned long * __restrict p3, 266 - const unsigned long * __restrict p4, 267 - const unsigned long * __restrict p5) 271 + void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, 272 + const unsigned long 
* __restrict p2, 273 + const unsigned long * __restrict p3, 274 + const unsigned long * __restrict p4, 275 + const unsigned long * __restrict p5) 268 276 { 269 277 uint64_t *dp1 = (uint64_t *)p1; 270 278 uint64_t *dp2 = (uint64_t *)p2; ··· 303 313 dp4 += 8; 304 314 dp5 += 8; 305 315 } while (--lines > 0); 306 - } 307 - 308 - void __init xor_neon_init(void) 309 - { 310 - if (cpu_have_named_feature(SHA3)) { 311 - xor_block_inner_neon.do_3 = xor_arm64_eor3_3; 312 - xor_block_inner_neon.do_4 = xor_arm64_eor3_4; 313 - xor_block_inner_neon.do_5 = xor_arm64_eor3_5; 314 - } 315 316 }
+30
lib/raid/xor/arm64/xor-neon.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + 3 + void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, 4 + const unsigned long * __restrict p2); 5 + void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, 6 + const unsigned long * __restrict p2, 7 + const unsigned long * __restrict p3); 8 + void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, 9 + const unsigned long * __restrict p2, 10 + const unsigned long * __restrict p3, 11 + const unsigned long * __restrict p4); 12 + void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, 13 + const unsigned long * __restrict p2, 14 + const unsigned long * __restrict p3, 15 + const unsigned long * __restrict p4, 16 + const unsigned long * __restrict p5); 17 + 18 + #define __xor_eor3_2 __xor_neon_2 19 + void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, 20 + const unsigned long * __restrict p2, 21 + const unsigned long * __restrict p3); 22 + void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, 23 + const unsigned long * __restrict p2, 24 + const unsigned long * __restrict p3, 25 + const unsigned long * __restrict p4); 26 + void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, 27 + const unsigned long * __restrict p2, 28 + const unsigned long * __restrict p3, 29 + const unsigned long * __restrict p4, 30 + const unsigned long * __restrict p5);