Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

x86/crc32: implement crc32_le using new template

Instantiate crc-pclmul-template.S for crc32_le, and delete the original
PCLMULQDQ optimized implementation. This has the following advantages:

- Less CRC-variant-specific code.
- VPCLMULQDQ support, greatly improving performance on sufficiently long
  messages on newer CPUs.
- A faster reduction from 128 bits to the final CRC.
- Support for lengths not a multiple of 16 bytes, improving performance
  for such lengths.
- Support for misaligned buffers, improving performance in such cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

Length        Before         After
------    ----------    ----------
     1      427 MB/s      605 MB/s
    16      710 MB/s     3631 MB/s
    64      704 MB/s     7615 MB/s
   127     3610 MB/s     9710 MB/s
   128     8759 MB/s    12702 MB/s
   200     7083 MB/s    15343 MB/s
   256    17284 MB/s    22904 MB/s
   511    10919 MB/s    27309 MB/s
   512    19849 MB/s    48900 MB/s
  1024    21216 MB/s    62630 MB/s
  3173    22150 MB/s    72437 MB/s
  4096    22496 MB/s    79593 MB/s
 16384    22018 MB/s    85106 MB/s

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20250210174540.161705-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>

+65 -244
+53
arch/x86/lib/crc-pclmul-consts.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * CRC constants generated by: 4 + * 5 + * ./scripts/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320 6 + * 7 + * Do not edit manually. 8 + */ 9 + 10 + /* 11 + * CRC folding constants generated for least-significant-bit-first CRC-32 using 12 + * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + 13 + * x^5 + x^4 + x^2 + x^1 + x^0 14 + */ 15 + static const struct { 16 + u64 fold_across_2048_bits_consts[2]; 17 + u64 fold_across_1024_bits_consts[2]; 18 + u64 fold_across_512_bits_consts[2]; 19 + u64 fold_across_256_bits_consts[2]; 20 + u64 fold_across_128_bits_consts[2]; 21 + u8 shuf_table[48]; 22 + u64 barrett_reduction_consts[2]; 23 + } crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { 24 + .fold_across_2048_bits_consts = { 25 + 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ 26 + 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ 27 + }, 28 + .fold_across_1024_bits_consts = { 29 + 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ 30 + 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ 31 + }, 32 + .fold_across_512_bits_consts = { 33 + 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ 34 + 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ 35 + }, 36 + .fold_across_256_bits_consts = { 37 + 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ 38 + 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ 39 + }, 40 + .fold_across_128_bits_consts = { 41 + 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ 42 + 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ 43 + }, 44 + .shuf_table = { 45 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 46 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 47 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 48 + }, 49 + .barrett_reduction_consts = { 50 + 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ 51 
+ 0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ 52 + }, 53 + };
+8 -29
arch/x86/lib/crc32-glue.c
··· 7 7 * Copyright 2024 Google LLC 8 8 */ 9 9 10 - #include <asm/cpufeatures.h> 11 - #include <asm/simd.h> 12 - #include <crypto/internal/simd.h> 13 10 #include <linux/crc32.h> 14 - #include <linux/linkage.h> 15 11 #include <linux/module.h> 16 - 17 - /* minimum size of buffer for crc32_pclmul_le_16 */ 18 - #define CRC32_PCLMUL_MIN_LEN 64 12 + #include "crc-pclmul-template.h" 19 13 20 14 static DEFINE_STATIC_KEY_FALSE(have_crc32); 21 15 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); 22 16 23 - u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); 17 + DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); 24 18 25 19 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) 26 20 { 27 - if (len >= CRC32_PCLMUL_MIN_LEN + 15 && 28 - static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { 29 - size_t n = -(uintptr_t)p & 15; 30 - 31 - /* align p to 16-byte boundary */ 32 - if (n) { 33 - crc = crc32_le_base(crc, p, n); 34 - p += n; 35 - len -= n; 36 - } 37 - n = round_down(len, 16); 38 - kernel_fpu_begin(); 39 - crc = crc32_pclmul_le_16(crc, p, n); 40 - kernel_fpu_end(); 41 - p += n; 42 - len -= n; 43 - } 44 - if (len) 45 - crc = crc32_le_base(crc, p, len); 46 - return crc; 21 + CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, 22 + have_pclmulqdq); 23 + return crc32_le_base(crc, p, len); 47 24 } 48 25 EXPORT_SYMBOL(crc32_le_arch); 49 26 ··· 74 97 { 75 98 if (boot_cpu_has(X86_FEATURE_XMM4_2)) 76 99 static_branch_enable(&have_crc32); 77 - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) 100 + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { 78 101 static_branch_enable(&have_pclmulqdq); 102 + INIT_CRC_PCLMUL(crc32_lsb); 103 + } 79 104 return 0; 80 105 } 81 106 arch_initcall(crc32_x86_init);
+4 -215
arch/x86/lib/crc32-pclmul.S
··· 1 - /* SPDX-License-Identifier: GPL-2.0-only */ 2 - /* 3 - * Copyright 2012 Xyratex Technology Limited 4 - * 5 - * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 6 - * calculation. 7 - * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) 8 - * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found 9 - * at: 10 - * http://www.intel.com/products/processor/manuals/ 11 - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual 12 - * Volume 2B: Instruction Set Reference, N-Z 13 - * 14 - * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> 15 - * Alexander Boyko <Alexander_Boyko@xyratex.com> 16 - */ 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + // Copyright 2025 Google LLC 17 3 18 - #include <linux/linkage.h> 4 + #include "crc-pclmul-template.S" 19 5 20 - 21 - .section .rodata 22 - .align 16 23 - /* 24 - * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 25 - * #define CONSTANT_R1 0x154442bd4LL 26 - * 27 - * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 28 - * #define CONSTANT_R2 0x1c6e41596LL 29 - */ 30 - .Lconstant_R2R1: 31 - .octa 0x00000001c6e415960000000154442bd4 32 - /* 33 - * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 34 - * #define CONSTANT_R3 0x1751997d0LL 35 - * 36 - * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e 37 - * #define CONSTANT_R4 0x0ccaa009eLL 38 - */ 39 - .Lconstant_R4R3: 40 - .octa 0x00000000ccaa009e00000001751997d0 41 - /* 42 - * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 43 - * #define CONSTANT_R5 0x163cd6124LL 44 - */ 45 - .Lconstant_R5: 46 - .octa 0x00000000000000000000000163cd6124 47 - .Lconstant_mask32: 48 - .octa 0x000000000000000000000000FFFFFFFF 49 - /* 50 - * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL 51 - * 52 - * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL 53 - * #define CONSTANT_RU 0x1F7011641LL 54 - */ 55 - .Lconstant_RUpoly: 56 - .octa 0x00000001F701164100000001DB710641 57 - 58 - #define CONSTANT %xmm0 59 - 60 - #ifdef __x86_64__ 
61 - #define CRC %edi 62 - #define BUF %rsi 63 - #define LEN %rdx 64 - #else 65 - #define CRC %eax 66 - #define BUF %edx 67 - #define LEN %ecx 68 - #endif 69 - 70 - 71 - 72 - .text 73 - /** 74 - * Calculate crc32 75 - * CRC - initial crc32 76 - * BUF - buffer (16 bytes aligned) 77 - * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63 78 - * return %eax crc32 79 - * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); 80 - */ 81 - 82 - SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ 83 - movdqa (BUF), %xmm1 84 - movdqa 0x10(BUF), %xmm2 85 - movdqa 0x20(BUF), %xmm3 86 - movdqa 0x30(BUF), %xmm4 87 - movd CRC, CONSTANT 88 - pxor CONSTANT, %xmm1 89 - sub $0x40, LEN 90 - add $0x40, BUF 91 - cmp $0x40, LEN 92 - jb .Lless_64 93 - 94 - #ifdef __x86_64__ 95 - movdqa .Lconstant_R2R1(%rip), CONSTANT 96 - #else 97 - movdqa .Lconstant_R2R1, CONSTANT 98 - #endif 99 - 100 - .Lloop_64:/* 64 bytes Full cache line folding */ 101 - prefetchnta 0x40(BUF) 102 - movdqa %xmm1, %xmm5 103 - movdqa %xmm2, %xmm6 104 - movdqa %xmm3, %xmm7 105 - #ifdef __x86_64__ 106 - movdqa %xmm4, %xmm8 107 - #endif 108 - pclmulqdq $0x00, CONSTANT, %xmm1 109 - pclmulqdq $0x00, CONSTANT, %xmm2 110 - pclmulqdq $0x00, CONSTANT, %xmm3 111 - #ifdef __x86_64__ 112 - pclmulqdq $0x00, CONSTANT, %xmm4 113 - #endif 114 - pclmulqdq $0x11, CONSTANT, %xmm5 115 - pclmulqdq $0x11, CONSTANT, %xmm6 116 - pclmulqdq $0x11, CONSTANT, %xmm7 117 - #ifdef __x86_64__ 118 - pclmulqdq $0x11, CONSTANT, %xmm8 119 - #endif 120 - pxor %xmm5, %xmm1 121 - pxor %xmm6, %xmm2 122 - pxor %xmm7, %xmm3 123 - #ifdef __x86_64__ 124 - pxor %xmm8, %xmm4 125 - #else 126 - /* xmm8 unsupported for x32 */ 127 - movdqa %xmm4, %xmm5 128 - pclmulqdq $0x00, CONSTANT, %xmm4 129 - pclmulqdq $0x11, CONSTANT, %xmm5 130 - pxor %xmm5, %xmm4 131 - #endif 132 - 133 - pxor (BUF), %xmm1 134 - pxor 0x10(BUF), %xmm2 135 - pxor 0x20(BUF), %xmm3 136 - pxor 0x30(BUF), %xmm4 137 - 138 - sub $0x40, LEN 139 - 
add $0x40, BUF 140 - cmp $0x40, LEN 141 - jge .Lloop_64 142 - .Lless_64:/* Folding cache line into 128bit */ 143 - #ifdef __x86_64__ 144 - movdqa .Lconstant_R4R3(%rip), CONSTANT 145 - #else 146 - movdqa .Lconstant_R4R3, CONSTANT 147 - #endif 148 - prefetchnta (BUF) 149 - 150 - movdqa %xmm1, %xmm5 151 - pclmulqdq $0x00, CONSTANT, %xmm1 152 - pclmulqdq $0x11, CONSTANT, %xmm5 153 - pxor %xmm5, %xmm1 154 - pxor %xmm2, %xmm1 155 - 156 - movdqa %xmm1, %xmm5 157 - pclmulqdq $0x00, CONSTANT, %xmm1 158 - pclmulqdq $0x11, CONSTANT, %xmm5 159 - pxor %xmm5, %xmm1 160 - pxor %xmm3, %xmm1 161 - 162 - movdqa %xmm1, %xmm5 163 - pclmulqdq $0x00, CONSTANT, %xmm1 164 - pclmulqdq $0x11, CONSTANT, %xmm5 165 - pxor %xmm5, %xmm1 166 - pxor %xmm4, %xmm1 167 - 168 - cmp $0x10, LEN 169 - jb .Lfold_64 170 - .Lloop_16:/* Folding rest buffer into 128bit */ 171 - movdqa %xmm1, %xmm5 172 - pclmulqdq $0x00, CONSTANT, %xmm1 173 - pclmulqdq $0x11, CONSTANT, %xmm5 174 - pxor %xmm5, %xmm1 175 - pxor (BUF), %xmm1 176 - sub $0x10, LEN 177 - add $0x10, BUF 178 - cmp $0x10, LEN 179 - jge .Lloop_16 180 - 181 - .Lfold_64: 182 - /* perform the last 64 bit fold, also adds 32 zeroes 183 - * to the input stream */ 184 - pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ 185 - psrldq $0x08, %xmm1 186 - pxor CONSTANT, %xmm1 187 - 188 - /* final 32-bit fold */ 189 - movdqa %xmm1, %xmm2 190 - #ifdef __x86_64__ 191 - movdqa .Lconstant_R5(%rip), CONSTANT 192 - movdqa .Lconstant_mask32(%rip), %xmm3 193 - #else 194 - movdqa .Lconstant_R5, CONSTANT 195 - movdqa .Lconstant_mask32, %xmm3 196 - #endif 197 - psrldq $0x04, %xmm2 198 - pand %xmm3, %xmm1 199 - pclmulqdq $0x00, CONSTANT, %xmm1 200 - pxor %xmm2, %xmm1 201 - 202 - /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ 203 - #ifdef __x86_64__ 204 - movdqa .Lconstant_RUpoly(%rip), CONSTANT 205 - #else 206 - movdqa .Lconstant_RUpoly, CONSTANT 207 - #endif 208 - movdqa %xmm1, %xmm2 209 - pand %xmm3, %xmm1 210 - pclmulqdq $0x10, CONSTANT, %xmm1 211 
- pand %xmm3, %xmm1 212 - pclmulqdq $0x00, CONSTANT, %xmm1 213 - pxor %xmm2, %xmm1 214 - pextrd $0x01, %xmm1, %eax 215 - 216 - RET 217 - SYM_FUNC_END(crc32_pclmul_le_16) 6 + DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)