x86/crc: add "template" for [V]PCLMULQDQ based CRC functions

+665

2 changed files

expand all

arch

x86

lib

crc-pclmul-template.S

crc-pclmul-template.h

+584

arch/x86/lib/crc-pclmul-template.S

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + // 3 + // Template to generate [V]PCLMULQDQ-based CRC functions for x86 4 + // 5 + // Copyright 2025 Google LLC 6 + // 7 + // Author: Eric Biggers <ebiggers@google.com> 8 + 9 + #include <linux/linkage.h> 10 + 11 + // Offsets within the generated constants table 12 + .set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only 13 + .set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next 14 + .set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next 15 + .set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next 16 + .set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next 17 + .set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 18 + .set OFFSETOF_SHUF_TABLE, 1*16 19 + .set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 20 + 21 + // Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the 22 + // corresponding non-VEX instruction plus any needed moves. The supported 23 + // instruction formats are: 24 + // 25 + // - Two-arg [src, dst], where the non-VEX format is the same. 26 + // - Three-arg [src1, src2, dst] where the non-VEX format is 27 + // [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. 28 + // 29 + // \insn gives the instruction without a "v" prefix and including any immediate 30 + // argument if needed to make the instruction follow one of the above formats. 31 + // If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to 32 + // it first; this is needed when \arg1 is an unaligned mem operand. 33 + .macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp 34 + .if AVX_LEVEL == 0 35 + // VEX not allowed. Emulate it. 36 + .ifnb \arg3 // Three-arg [src1, src2, dst] 37 + .ifc "\arg2", "\arg3" // src2 == dst? 
38 + .ifnb \unaligned_mem_tmp 39 + movdqu \arg1, \unaligned_mem_tmp 40 + \insn \unaligned_mem_tmp, \arg3 41 + .else 42 + \insn \arg1, \arg3 43 + .endif 44 + .else // src2 != dst 45 + .ifc "\arg1", "\arg3" 46 + .error "Can't have src1 == dst when src2 != dst" 47 + .endif 48 + .ifnb \unaligned_mem_tmp 49 + movdqu \arg1, \unaligned_mem_tmp 50 + movdqa \arg2, \arg3 51 + \insn \unaligned_mem_tmp, \arg3 52 + .else 53 + movdqa \arg2, \arg3 54 + \insn \arg1, \arg3 55 + .endif 56 + .endif 57 + .else // Two-arg [src, dst] 58 + .ifnb \unaligned_mem_tmp 59 + movdqu \arg1, \unaligned_mem_tmp 60 + \insn \unaligned_mem_tmp, \arg2 61 + .else 62 + \insn \arg1, \arg2 63 + .endif 64 + .endif 65 + .else 66 + // VEX is allowed. Emit the desired instruction directly. 67 + .ifnb \arg3 68 + v\insn \arg1, \arg2, \arg3 69 + .else 70 + v\insn \arg1, \arg2 71 + .endif 72 + .endif 73 + .endm 74 + 75 + // Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector 76 + // register of length VL. 77 + .macro _vbroadcast src, dst 78 + .if VL == 16 79 + _cond_vex movdqa, \src, \dst 80 + .elseif VL == 32 81 + vbroadcasti128 \src, \dst 82 + .else 83 + vbroadcasti32x4 \src, \dst 84 + .endif 85 + .endm 86 + 87 + // Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC 88 + // is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. 
89 + .macro _load_data vl, src, bswap_mask, dst 90 + .if \vl < 64 91 + _cond_vex movdqu, "\src", \dst 92 + .else 93 + vmovdqu8 \src, \dst 94 + .endif 95 + .if !LSB_CRC 96 + _cond_vex pshufb, \bswap_mask, \dst, \dst 97 + .endif 98 + .endm 99 + 100 + .macro _prepare_v0 vl, v0, v1, bswap_mask 101 + .if LSB_CRC 102 + .if \vl < 64 103 + _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 104 + .else 105 + vpxorq (BUF), \v0, \v0 106 + .endif 107 + .else 108 + _load_data \vl, (BUF), \bswap_mask, \v1 109 + .if \vl < 64 110 + _cond_vex pxor, \v1, \v0, \v0 111 + .else 112 + vpxorq \v1, \v0, \v0 113 + .endif 114 + .endif 115 + .endm 116 + 117 + // The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for 118 + // msb-first order or the physically high qword for lsb-first order 119 + #define LO64_TERMS 0 120 + 121 + // The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high 122 + // qword for msb-first order or the physically low qword for lsb-first order 123 + #define HI64_TERMS 1 124 + 125 + // Multiply the given \src1_terms of each 128-bit lane of \src1 by the given 126 + // \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. 127 + .macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst 128 + _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ 129 + \src1, \src2, \dst 130 + .endm 131 + 132 + // Fold \acc into \data and store the result back into \acc. \data can be an 133 + // unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no 134 + // byte-reflection is needed; otherwise it must be a vector register. \consts 135 + // is a vector register containing the needed fold constants, and \tmp is a 136 + // temporary vector register. All arguments must be the same length. 
137 + .macro _fold_vec acc, data, consts, tmp 138 + _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp 139 + _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc 140 + .if AVX_LEVEL < 10 141 + _cond_vex pxor, \data, \tmp, \tmp 142 + _cond_vex pxor, \tmp, \acc, \acc 143 + .else 144 + vpternlogq $0x96, \data, \tmp, \acc 145 + .endif 146 + .endm 147 + 148 + // Fold \acc into \data and store the result back into \acc. \data is an 149 + // unaligned mem operand, \consts is a vector register containing the needed 150 + // fold constants, \bswap_mask is a vector register containing the 151 + // byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are 152 + // temporary vector registers. All arguments must have length \vl. 153 + .macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 154 + .if AVX_LEVEL == 0 || !LSB_CRC 155 + _load_data \vl, \data, \bswap_mask, \tmp1 156 + _fold_vec \acc, \tmp1, \consts, \tmp2 157 + .else 158 + _fold_vec \acc, \data, \consts, \tmp1 159 + .endif 160 + .endm 161 + 162 + // Load the constants for folding across 2**i vectors of length VL at a time 163 + // into all 128-bit lanes of the vector register CONSTS. 164 + .macro _load_vec_folding_consts i 165 + _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ 166 + CONSTS 167 + .endm 168 + 169 + // Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store 170 + // the result back into \v0. If the \vl bit of LEN is set, i.e. at least \vl 171 + // more data bytes remain, also fold \vl data bytes from BUF. For both operations the fold distance is \vl. 172 + // \consts must be a register of length \vl containing the fold constants. 
173 + .macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 174 + _fold_vec \v0, \v1, \consts, \tmp1 175 + test $\vl, LEN8 176 + jz .Lfold_vec_final_done\@ 177 + _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 178 + add $\vl, BUF 179 + .Lfold_vec_final_done\@: 180 + .endm 181 + 182 + // This macro generates the body of a CRC function with the following prototype: 183 + // 184 + // crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); 185 + // 186 + // |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. 187 + // |buf| is the data to checksum. |len| is the data length in bytes, which must 188 + // be at least 16. |consts| is a pointer to the fold_across_128_bits_consts 189 + // field of the constants struct that was generated for the chosen CRC variant. 190 + // 191 + // Moving on to the macro parameters, \n is the number of bits in the CRC, e.g. 192 + // 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If 193 + // the file is compiled in i386 mode, then the maximum supported value is 32. 194 + // 195 + // \lsb_crc is 1 if the CRC processes the least significant bit of each byte 196 + // first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 197 + // if the CRC processes the most significant bit of each byte first, i.e. maps 198 + // bit0 to x^0, bit1 to x^1, ..., bit7 to x^7. 199 + // 200 + // \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. 201 + // 202 + // \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or 203 + // 10 for AVX10 or AVX512. 204 + // 205 + // If \vl == 16 && \avx_level == 0, the generated code requires: 206 + // PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) 207 + // 208 + // If \vl == 32 && \avx_level == 2, the generated code requires: 209 + // VPCLMULQDQ && AVX2. 
210 + // 211 + // If \vl == 32 && \avx_level == 10, the generated code requires: 212 + // VPCLMULQDQ && (AVX10/256 || (AVX512BW && AVX512VL)) 213 + // 214 + // If \vl == 64 && \avx_level == 10, the generated code requires: 215 + // VPCLMULQDQ && (AVX10/512 || (AVX512BW && AVX512VL)) 216 + // 217 + // Other \vl and \avx_level combinations are either not supported or not useful. 218 + .macro _crc_pclmul n, lsb_crc, vl, avx_level 219 + .set LSB_CRC, \lsb_crc 220 + .set VL, \vl 221 + .set AVX_LEVEL, \avx_level 222 + 223 + // Define aliases for the xmm, ymm, or zmm registers according to VL. 224 + .irp i, 0,1,2,3,4,5,6,7 225 + .if VL == 16 226 + .set V\i, %xmm\i 227 + .set LOG2_VL, 4 228 + .elseif VL == 32 229 + .set V\i, %ymm\i 230 + .set LOG2_VL, 5 231 + .elseif VL == 64 232 + .set V\i, %zmm\i 233 + .set LOG2_VL, 6 234 + .else 235 + .error "Unsupported vector length" 236 + .endif 237 + .endr 238 + // Define aliases for the function parameters. 239 + // Note: when crc_t is shorter than u32, zero-extension to 32 bits is 240 + // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed 241 + // when crc_t is shorter than u64. 242 + #ifdef __x86_64__ 243 + .if \n <= 32 244 + .set CRC, %edi 245 + .else 246 + .set CRC, %rdi 247 + .endif 248 + .set BUF, %rsi 249 + .set LEN, %rdx 250 + .set LEN32, %edx 251 + .set LEN8, %dl 252 + .set CONSTS_PTR, %rcx 253 + #else 254 + // 32-bit support, assuming -mregparm=3 and not including support for 255 + // CRC-64 (which would use both eax and edx to pass the crc parameter). 256 + .set CRC, %eax 257 + .set BUF, %edx 258 + .set LEN, %ecx 259 + .set LEN32, %ecx 260 + .set LEN8, %cl 261 + .set CONSTS_PTR, %ebx // Passed on stack 262 + #endif 263 + 264 + // Define aliases for some local variables. V0-V5 are used without 265 + // aliases (for accumulators, data, temporary values, etc). 
Staying 266 + // within the first 8 vector registers keeps the code 32-bit SSE 267 + // compatible and reduces the size of 64-bit SSE code slightly. 268 + .set BSWAP_MASK, V6 269 + .set BSWAP_MASK_YMM, %ymm6 270 + .set BSWAP_MASK_XMM, %xmm6 271 + .set CONSTS, V7 272 + .set CONSTS_YMM, %ymm7 273 + .set CONSTS_XMM, %xmm7 274 + 275 + #ifdef __i386__ 276 + push CONSTS_PTR 277 + mov 8(%esp), CONSTS_PTR 278 + #endif 279 + 280 + // Create a 128-bit vector that contains the initial CRC in the end 281 + // representing the high-order polynomial coefficients, and the rest 0. 282 + // If the CRC is msb-first, also load the byte-reflection table. 283 + .if \n <= 32 284 + _cond_vex movd, CRC, %xmm0 285 + .else 286 + _cond_vex movq, CRC, %xmm0 287 + .endif 288 + .if !LSB_CRC 289 + _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 290 + _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK 291 + .endif 292 + 293 + // Load the first vector of data and XOR the initial CRC into the 294 + // appropriate end of the first 128-bit lane of data. If LEN < VL, then 295 + // use a short vector and jump ahead to the final reduction. (LEN >= 16 296 + // is guaranteed here but not necessarily LEN >= VL.) 297 + .if VL >= 32 298 + cmp $VL, LEN 299 + jae .Lat_least_1vec\@ 300 + .if VL == 64 301 + cmp $32, LEN32 302 + jb .Lless_than_32bytes\@ 303 + _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM 304 + add $32, BUF 305 + jmp .Lreduce_256bits_to_128bits\@ 306 + .Lless_than_32bytes\@: 307 + .endif 308 + _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM 309 + add $16, BUF 310 + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM 311 + jmp .Lcheck_for_partial_block\@ 312 + .Lat_least_1vec\@: 313 + .endif 314 + _prepare_v0 VL, V0, V1, BSWAP_MASK 315 + 316 + // Handle VL <= LEN < 4*VL. 317 + cmp $4*VL-1, LEN 318 + ja .Lat_least_4vecs\@ 319 + add $VL, BUF 320 + // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. 
321 + // If VL==16 then load fold_across_128_bits_consts first, as the final 322 + // reduction depends on it and it won't be loaded anywhere else. 323 + cmp $2*VL-1, LEN32 324 + .if VL == 16 325 + _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM 326 + .endif 327 + jbe .Lreduce_1vec_to_128bits\@ 328 + // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to 329 + // the reduction from 2 vectors. 330 + _load_data VL, (BUF), BSWAP_MASK, V1 331 + add $VL, BUF 332 + jmp .Lreduce_2vecs_to_1\@ 333 + 334 + .Lat_least_4vecs\@: 335 + // Load 3 more vectors of data. 336 + _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 337 + _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 338 + _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 339 + sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 340 + add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 341 + 342 + // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next 343 + // 4 vectors of data and write the result back to V0-V3. 344 + cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 345 + jbe .Lreduce_4vecs_to_2\@ 346 + _load_vec_folding_consts 2 347 + .Lfold_4vecs_loop\@: 348 + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 349 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 350 + _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 351 + _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 352 + sub $-4*VL, BUF 353 + add $-4*VL, LEN 354 + cmp $4*VL-1, LEN 355 + ja .Lfold_4vecs_loop\@ 356 + 357 + // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold 358 + // two more vectors of data from BUF, if at least that much remains. 
359 + .Lreduce_4vecs_to_2\@: 360 + _load_vec_folding_consts 1 361 + _fold_vec V0, V2, CONSTS, V4 362 + _fold_vec V1, V3, CONSTS, V4 363 + test $2*VL, LEN8 364 + jz .Lreduce_2vecs_to_1\@ 365 + _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 366 + _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 367 + sub $-2*VL, BUF 368 + 369 + // Fold V0 into V1 and write the result back to V0. Then fold one more 370 + // vector of data from BUF, if at least that much remains. 371 + .Lreduce_2vecs_to_1\@: 372 + _load_vec_folding_consts 0 373 + _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 374 + 375 + .Lreduce_1vec_to_128bits\@: 376 + .if VL == 64 377 + // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of 378 + // data from BUF, if at least that much remains. 379 + vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM 380 + vextracti64x4 $1, %zmm0, %ymm1 381 + _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 382 + .Lreduce_256bits_to_128bits\@: 383 + .endif 384 + .if VL >= 32 385 + // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of 386 + // data from BUF, if at least that much remains. 387 + vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM 388 + vextracti128 $1, %ymm0, %xmm1 389 + _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 390 + .Lcheck_for_partial_block\@: 391 + .endif 392 + and $15, LEN32 393 + jz .Lreduce_128bits_to_crc\@ 394 + 395 + // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now 396 + // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 397 + // and B is the polynomial of the remaining LEN data bytes. To reduce 398 + // this to 128 bits without needing fold constants for each possible 399 + // LEN, rearrange this expression into C1*(x^128) + C2, where 400 + // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. 
401 + // Then fold C1 into C2, which is just another fold across 128 bits. 402 + 403 + .if !LSB_CRC || AVX_LEVEL == 0 404 + // Load the last 16 data bytes. Note that originally LEN was >= 16. 405 + _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 406 + .endif // Else will use vpblendvb mem operand later. 407 + .if !LSB_CRC 408 + neg LEN // Needed for indexing shuf_table 409 + .endif 410 + 411 + // tmp = A*x^(8*LEN) mod x^128 412 + // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] 413 + // i.e. right-shift by LEN bytes. 414 + // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] 415 + // i.e. left-shift by LEN bytes. 416 + _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 417 + _cond_vex pshufb, %xmm3, %xmm0, %xmm1 418 + 419 + // C1 = floor(A / x^(128 - 8*LEN)) 420 + // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] 421 + // i.e. left-shift by 16-LEN bytes. 422 + // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] 423 + // i.e. right-shift by 16-LEN bytes. 424 + _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ 425 + %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 426 + 427 + // C2 = tmp + B. This is just a blend of tmp with the last 16 data 428 + // bytes (reflected if msb-first). The blend mask is the shuffle table 429 + // that was used to create tmp. 0 selects tmp, and 1 last16databytes. 430 + .if AVX_LEVEL == 0 431 + movdqa %xmm0, %xmm4 432 + movdqa %xmm3, %xmm0 433 + pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand 434 + movdqa %xmm4, %xmm0 435 + .elseif LSB_CRC 436 + vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 437 + .else 438 + vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 439 + .endif 440 + 441 + // Fold C1 into C2 and store the 128-bit result in %xmm0. 442 + _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 443 + 444 + .Lreduce_128bits_to_crc\@: 445 + // Compute the CRC as %xmm0 * x^n mod G. 
Here %xmm0 means the 128-bit 446 + // polynomial stored in %xmm0 (using either lsb-first or msb-first bit 447 + // order according to LSB_CRC), and G is the CRC's generator polynomial. 448 + 449 + // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: 450 + // 451 + // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + 452 + // x^n * (%xmm0 mod x^64) 453 + // 454 + // Store t0 * x^(64-n) in %xmm0. I.e., actually do: 455 + // 456 + // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + 457 + // x^64 * (%xmm0 mod x^64) 458 + // 459 + // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned 460 + // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily 461 + // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the 462 + // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case 463 + // (considering the extra factor of x that gets implicitly introduced by 464 + // each pclmulqdq when using lsb-first order), is identical to the 465 + // constant that was used earlier for folding the LO64_TERMS across 128 466 + // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. 467 + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 468 + .if LSB_CRC 469 + _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) 470 + .else 471 + _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) 472 + .endif 473 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 474 + // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). 475 + // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). 476 + 477 + // First step of Barrett reduction: Compute floor(t0 / G). This is the 478 + // polynomial by which G needs to be multiplied to cancel out the x^n 479 + // and higher terms of t0, i.e. to reduce t0 mod G. First do: 480 + // 481 + // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) 482 + // 483 + // Then the desired value floor(t0 / G) is floor(t1 / x^64). 
The 63 in 484 + // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest 485 + // value that makes enough precision be carried through the calculation. 486 + // 487 + // The '* x' makes it so the result is floor(t1 / x^64) rather than 488 + // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it 489 + // can be extracted much more easily in the next step. In the lsb-first 490 + // case the '* x' happens implicitly. In the msb-first case it must be 491 + // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the 492 + // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and 493 + // the multiplication by the x^64 term is handled using a pxor. The 494 + // pxor causes the low 64 terms of t1 to be wrong, but they are unused. 495 + _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM 496 + _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 497 + .if !LSB_CRC 498 + _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) 499 + .endif 500 + // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). 501 + 502 + // Second step of Barrett reduction: Cancel out the x^n and higher terms 503 + // of t0 by subtracting the needed multiple of G. This gives the CRC: 504 + // 505 + // crc := t0 - (G * floor(t0 / G)) 506 + // 507 + // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: 508 + // 509 + // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) 510 + // 511 + // Furthermore, since the resulting CRC is n-bit, if mod x^n is 512 + // explicitly applied to it then the x^n term of G makes no difference 513 + // in the result and can be omitted. This helps keep the constant 514 + // multiplier in 64 bits in most cases. 
This gives the following: 515 + // 516 + // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) 517 + // crc := (%xmm0 / x^(64-n)) mod x^n 518 + // 519 + // In the lsb-first case, each pclmulqdq implicitly introduces 520 + // an extra factor of x, so in that case the constant that needs to be 521 + // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. 522 + // For lsb-first CRCs where n=64, the extra factor of x cannot be as 523 + // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to 524 + // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC 525 + // polynomials have nonzero x^n and x^0 terms.) It works out as: the 526 + // CRC has to be XORed with the physically low qword of %xmm1, representing 527 + // floor(t0 / G). The most efficient way to do that is to move it to 528 + // the physically high qword and use a ternlog to combine the two XORs. 529 + .if LSB_CRC && \n == 64 530 + _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 531 + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 532 + .if AVX_LEVEL < 10 533 + _cond_vex pxor, %xmm2, %xmm0, %xmm0 534 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 535 + .else 536 + vpternlogq $0x96, %xmm2, %xmm1, %xmm0 537 + .endif 538 + _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 539 + .else 540 + _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 541 + _cond_vex pxor, %xmm1, %xmm0, %xmm0 542 + .if \n == 8 543 + _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 544 + .elseif \n == 16 545 + _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 546 + .elseif \n == 32 547 + _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 548 + .else // \n == 64 && !LSB_CRC 549 + _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 550 + .endif 551 + .endif 552 + 553 + .if VL > 16 554 + vzeroupper // Needed when ymm or zmm registers may have been used. 
555 + .endif 556 + #ifdef __i386__ 557 + pop CONSTS_PTR 558 + #endif 559 + RET 560 + .endm 561 + 562 + #ifdef CONFIG_AS_VPCLMULQDQ 563 + #define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ 564 + SYM_FUNC_START(prefix##_pclmul_sse); \ 565 + _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ 566 + SYM_FUNC_END(prefix##_pclmul_sse); \ 567 + \ 568 + SYM_FUNC_START(prefix##_vpclmul_avx2); \ 569 + _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ 570 + SYM_FUNC_END(prefix##_vpclmul_avx2); \ 571 + \ 572 + SYM_FUNC_START(prefix##_vpclmul_avx10_256); \ 573 + _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=10; \ 574 + SYM_FUNC_END(prefix##_vpclmul_avx10_256); \ 575 + \ 576 + SYM_FUNC_START(prefix##_vpclmul_avx10_512); \ 577 + _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=10; \ 578 + SYM_FUNC_END(prefix##_vpclmul_avx10_512); 579 + #else 580 + #define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ 581 + SYM_FUNC_START(prefix##_pclmul_sse); \ 582 + _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ 583 + SYM_FUNC_END(prefix##_pclmul_sse); 584 + #endif // !CONFIG_AS_VPCLMULQDQ

+81

arch/x86/lib/crc-pclmul-template.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are 4 + * instantiated by crc-pclmul-template.S 5 + * 6 + * Copyright 2025 Google LLC 7 + * 8 + * Author: Eric Biggers <ebiggers@google.com> 9 + */ 10 + #ifndef _CRC_PCLMUL_TEMPLATE_H 11 + #define _CRC_PCLMUL_TEMPLATE_H 12 + 13 + #include <asm/cpufeatures.h> 14 + #include <asm/simd.h> 15 + #include <crypto/internal/simd.h> 16 + #include <linux/static_call.h> 17 + #include "crc-pclmul-consts.h" 18 + 19 + #define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ 20 + crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ 21 + const void *consts_ptr); \ 22 + crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ 23 + const void *consts_ptr); \ 24 + crc_t prefix##_vpclmul_avx10_256(crc_t crc, const u8 *p, size_t len, \ 25 + const void *consts_ptr); \ 26 + crc_t prefix##_vpclmul_avx10_512(crc_t crc, const u8 *p, size_t len, \ 27 + const void *consts_ptr); \ 28 + DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) 29 + 30 + #define INIT_CRC_PCLMUL(prefix) \ 31 + do { \ 32 + if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \ 33 + boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \ 34 + boot_cpu_has(X86_FEATURE_AVX2) && \ 35 + cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \ 36 + if (boot_cpu_has(X86_FEATURE_AVX512BW) && \ 37 + boot_cpu_has(X86_FEATURE_AVX512VL) && \ 38 + cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \ 39 + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) \ 40 + static_call_update(prefix##_pclmul, \ 41 + prefix##_vpclmul_avx10_256); \ 42 + else \ 43 + static_call_update(prefix##_pclmul, \ 44 + prefix##_vpclmul_avx10_512); \ 45 + } else { \ 46 + static_call_update(prefix##_pclmul, \ 47 + prefix##_vpclmul_avx2); \ 48 + } \ 49 + } \ 50 + } while (0) 51 + 52 + /* 53 + * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 54 + * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. 
55 + * 56 + * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. 57 + * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), 58 + * varying by CPU and factors such as which parts of the "FPU" state userspace 59 + * has touched, which could result in a larger cutoff being better. Indeed, a 60 + * larger cutoff is usually better for a *single* message. However, the 61 + * overhead of the FPU section gets amortized if multiple FPU sections get 62 + * executed before returning to userspace, since the XSAVE and XRSTOR occur only 63 + * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on 64 + * the dcache than the table-based code is, a 16-byte cutoff seems to work well. 65 + */ 66 + #define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ 67 + do { \ 68 + if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ 69 + crypto_simd_usable()) { \ 70 + const void *consts_ptr; \ 71 + \ 72 + consts_ptr = (consts).fold_across_128_bits_consts; \ 73 + kernel_fpu_begin(); \ 74 + crc = static_call(prefix##_pclmul)((crc), (p), (len), \ 75 + consts_ptr); \ 76 + kernel_fpu_end(); \ 77 + return crc; \ 78 + } \ 79 + } while (0) 80 + 81 + #endif /* _CRC_PCLMUL_TEMPLATE_H */

Configure Feed

Configure Feed