Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

x86/crc-t10dif: implement crc_t10dif using new template

Instantiate crc-pclmul-template.S for crc_t10dif and delete the original
PCLMULQDQ optimized implementation. This has the following advantages:

- Less CRC-variant-specific code.
- VPCLMULQDQ support, greatly improving performance on sufficiently long
messages on newer CPUs.
- A faster reduction from 128 bits to the final CRC.
- Support for i386.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

    Length     Before         After
    ------     ------         -----
         1     440 MB/s       386 MB/s
        16    1865 MB/s      2008 MB/s
        64    4343 MB/s      6917 MB/s
       127    5440 MB/s      8909 MB/s
       128    5533 MB/s     12150 MB/s
       200    5908 MB/s     14423 MB/s
       256   15870 MB/s     21288 MB/s
       511   14219 MB/s     25840 MB/s
       512   18361 MB/s     37797 MB/s
      1024   19941 MB/s     61374 MB/s
      3173   20461 MB/s     74909 MB/s
      4096   21310 MB/s     78919 MB/s
     16384   21663 MB/s     85012 MB/s

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20250210174540.161705-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>

+64 -349
+1 -1
arch/x86/Kconfig
··· 77 77 select ARCH_HAS_CPU_FINALIZE_INIT 78 78 select ARCH_HAS_CPU_PASID if IOMMU_SVA 79 79 select ARCH_HAS_CRC32 80 - select ARCH_HAS_CRC_T10DIF if X86_64 80 + select ARCH_HAS_CRC_T10DIF 81 81 select ARCH_HAS_CURRENT_STACK_POINTER 82 82 select ARCH_HAS_DEBUG_VIRTUAL 83 83 select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
+1 -1
arch/x86/lib/Makefile
··· 43 43 crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o 44 44 45 45 obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o 46 - crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o 46 + crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o 47 47 48 48 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o 49 49 obj-y += iomem.o
+47 -1
arch/x86/lib/crc-pclmul-consts.h
··· 2 2 /* 3 3 * CRC constants generated by: 4 4 * 5 - * ./scripts/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320 5 + * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320 6 6 * 7 7 * Do not edit manually. 8 8 */ 9 + 10 + /* 11 + * CRC folding constants generated for most-significant-bit-first CRC-16 using 12 + * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 13 + */ 14 + static const struct { 15 + u8 bswap_mask[16]; 16 + u64 fold_across_2048_bits_consts[2]; 17 + u64 fold_across_1024_bits_consts[2]; 18 + u64 fold_across_512_bits_consts[2]; 19 + u64 fold_across_256_bits_consts[2]; 20 + u64 fold_across_128_bits_consts[2]; 21 + u8 shuf_table[48]; 22 + u64 barrett_reduction_consts[2]; 23 + } crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { 24 + .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 25 + .fold_across_2048_bits_consts = { 26 + 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ 27 + 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ 28 + }, 29 + .fold_across_1024_bits_consts = { 30 + 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ 31 + 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ 32 + }, 33 + .fold_across_512_bits_consts = { 34 + 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ 35 + 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ 36 + }, 37 + .fold_across_256_bits_consts = { 38 + 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ 39 + 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ 40 + }, 41 + .fold_across_128_bits_consts = { 42 + 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ 43 + 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ 44 + }, 45 + .shuf_table = { 46 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 47 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 48 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 49 + }, 50 + 
.barrett_reduction_consts = { 51 + 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ 52 + 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ 53 + }, 54 + }; 9 55 10 56 /* 11 57 * CRC folding constants generated for least-significant-bit-first CRC-32 using
+9 -14
arch/x86/lib/crc-t10dif-glue.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 /* 3 - * CRC-T10DIF using PCLMULQDQ instructions 3 + * CRC-T10DIF using [V]PCLMULQDQ instructions 4 4 * 5 5 * Copyright 2024 Google LLC 6 6 */ 7 7 8 - #include <asm/cpufeatures.h> 9 - #include <asm/simd.h> 10 - #include <crypto/internal/simd.h> 11 8 #include <linux/crc-t10dif.h> 12 9 #include <linux/module.h> 10 + #include "crc-pclmul-template.h" 13 11 14 12 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); 15 13 16 - asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); 14 + DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); 17 15 18 16 u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) 19 17 { 20 - if (len >= 16 && 21 - static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) { 22 - kernel_fpu_begin(); 23 - crc = crc_t10dif_pcl(crc, p, len); 24 - kernel_fpu_end(); 25 - return crc; 26 - } 18 + CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, 19 + have_pclmulqdq); 27 20 return crc_t10dif_generic(crc, p, len); 28 21 } 29 22 EXPORT_SYMBOL(crc_t10dif_arch); 30 23 31 24 static int __init crc_t10dif_x86_init(void) 32 25 { 33 - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) 26 + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { 34 27 static_branch_enable(&have_pclmulqdq); 28 + INIT_CRC_PCLMUL(crc16_msb); 29 + } 35 30 return 0; 36 31 } 37 32 arch_initcall(crc_t10dif_x86_init); ··· 36 41 } 37 42 module_exit(crc_t10dif_x86_exit); 38 43 39 - MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions"); 44 + MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); 40 45 MODULE_LICENSE("GPL");
+6
arch/x86/lib/crc16-msb-pclmul.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + // Copyright 2025 Google LLC 3 + 4 + #include "crc-pclmul-template.S" 5 + 6 + DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)
-332
arch/x86/lib/crct10dif-pcl-asm_64.S
··· 1 - ######################################################################## 2 - # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions 3 - # 4 - # Copyright (c) 2013, Intel Corporation 5 - # 6 - # Authors: 7 - # Erdinc Ozturk <erdinc.ozturk@intel.com> 8 - # Vinodh Gopal <vinodh.gopal@intel.com> 9 - # James Guilford <james.guilford@intel.com> 10 - # Tim Chen <tim.c.chen@linux.intel.com> 11 - # 12 - # This software is available to you under a choice of one of two 13 - # licenses. You may choose to be licensed under the terms of the GNU 14 - # General Public License (GPL) Version 2, available from the file 15 - # COPYING in the main directory of this source tree, or the 16 - # OpenIB.org BSD license below: 17 - # 18 - # Redistribution and use in source and binary forms, with or without 19 - # modification, are permitted provided that the following conditions are 20 - # met: 21 - # 22 - # * Redistributions of source code must retain the above copyright 23 - # notice, this list of conditions and the following disclaimer. 24 - # 25 - # * Redistributions in binary form must reproduce the above copyright 26 - # notice, this list of conditions and the following disclaimer in the 27 - # documentation and/or other materials provided with the 28 - # distribution. 29 - # 30 - # * Neither the name of the Intel Corporation nor the names of its 31 - # contributors may be used to endorse or promote products derived from 32 - # this software without specific prior written permission. 33 - # 34 - # 35 - # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY 36 - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 38 - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR 39 - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 40 - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 41 - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 42 - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 43 - # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 44 - # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 45 - # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 46 - # 47 - # Reference paper titled "Fast CRC Computation for Generic 48 - # Polynomials Using PCLMULQDQ Instruction" 49 - # URL: http://www.intel.com/content/dam/www/public/us/en/documents 50 - # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf 51 - # 52 - 53 - #include <linux/linkage.h> 54 - 55 - .text 56 - 57 - #define init_crc %edi 58 - #define buf %rsi 59 - #define len %rdx 60 - 61 - #define FOLD_CONSTS %xmm10 62 - #define BSWAP_MASK %xmm11 63 - 64 - # Fold reg1, reg2 into the next 32 data bytes, storing the result back into 65 - # reg1, reg2. 66 - .macro fold_32_bytes offset, reg1, reg2 67 - movdqu \offset(buf), %xmm9 68 - movdqu \offset+16(buf), %xmm12 69 - pshufb BSWAP_MASK, %xmm9 70 - pshufb BSWAP_MASK, %xmm12 71 - movdqa \reg1, %xmm8 72 - movdqa \reg2, %xmm13 73 - pclmulqdq $0x00, FOLD_CONSTS, \reg1 74 - pclmulqdq $0x11, FOLD_CONSTS, %xmm8 75 - pclmulqdq $0x00, FOLD_CONSTS, \reg2 76 - pclmulqdq $0x11, FOLD_CONSTS, %xmm13 77 - pxor %xmm9 , \reg1 78 - xorps %xmm8 , \reg1 79 - pxor %xmm12, \reg2 80 - xorps %xmm13, \reg2 81 - .endm 82 - 83 - # Fold src_reg into dst_reg. 
84 - .macro fold_16_bytes src_reg, dst_reg 85 - movdqa \src_reg, %xmm8 86 - pclmulqdq $0x11, FOLD_CONSTS, \src_reg 87 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 88 - pxor %xmm8, \dst_reg 89 - xorps \src_reg, \dst_reg 90 - .endm 91 - 92 - # 93 - # u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); 94 - # 95 - # Assumes len >= 16. 96 - # 97 - SYM_FUNC_START(crc_t10dif_pcl) 98 - 99 - movdqa .Lbswap_mask(%rip), BSWAP_MASK 100 - 101 - # For sizes less than 256 bytes, we can't fold 128 bytes at a time. 102 - cmp $256, len 103 - jl .Lless_than_256_bytes 104 - 105 - # Load the first 128 data bytes. Byte swapping is necessary to make the 106 - # bit order match the polynomial coefficient order. 107 - movdqu 16*0(buf), %xmm0 108 - movdqu 16*1(buf), %xmm1 109 - movdqu 16*2(buf), %xmm2 110 - movdqu 16*3(buf), %xmm3 111 - movdqu 16*4(buf), %xmm4 112 - movdqu 16*5(buf), %xmm5 113 - movdqu 16*6(buf), %xmm6 114 - movdqu 16*7(buf), %xmm7 115 - add $128, buf 116 - pshufb BSWAP_MASK, %xmm0 117 - pshufb BSWAP_MASK, %xmm1 118 - pshufb BSWAP_MASK, %xmm2 119 - pshufb BSWAP_MASK, %xmm3 120 - pshufb BSWAP_MASK, %xmm4 121 - pshufb BSWAP_MASK, %xmm5 122 - pshufb BSWAP_MASK, %xmm6 123 - pshufb BSWAP_MASK, %xmm7 124 - 125 - # XOR the first 16 data *bits* with the initial CRC value. 126 - pxor %xmm8, %xmm8 127 - pinsrw $7, init_crc, %xmm8 128 - pxor %xmm8, %xmm0 129 - 130 - movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS 131 - 132 - # Subtract 128 for the 128 data bytes just consumed. Subtract another 133 - # 128 to simplify the termination condition of the following loop. 134 - sub $256, len 135 - 136 - # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 137 - # bytes xmm0-7 into them, storing the result back into xmm0-7. 
138 - .Lfold_128_bytes_loop: 139 - fold_32_bytes 0, %xmm0, %xmm1 140 - fold_32_bytes 32, %xmm2, %xmm3 141 - fold_32_bytes 64, %xmm4, %xmm5 142 - fold_32_bytes 96, %xmm6, %xmm7 143 - add $128, buf 144 - sub $128, len 145 - jge .Lfold_128_bytes_loop 146 - 147 - # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. 148 - 149 - # Fold across 64 bytes. 150 - movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS 151 - fold_16_bytes %xmm0, %xmm4 152 - fold_16_bytes %xmm1, %xmm5 153 - fold_16_bytes %xmm2, %xmm6 154 - fold_16_bytes %xmm3, %xmm7 155 - # Fold across 32 bytes. 156 - movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS 157 - fold_16_bytes %xmm4, %xmm6 158 - fold_16_bytes %xmm5, %xmm7 159 - # Fold across 16 bytes. 160 - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS 161 - fold_16_bytes %xmm6, %xmm7 162 - 163 - # Add 128 to get the correct number of data bytes remaining in 0...127 164 - # (not counting xmm7), following the previous extra subtraction by 128. 165 - # Then subtract 16 to simplify the termination condition of the 166 - # following loop. 167 - add $128-16, len 168 - 169 - # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes 170 - # xmm7 into them, storing the result back into xmm7. 171 - jl .Lfold_16_bytes_loop_done 172 - .Lfold_16_bytes_loop: 173 - movdqa %xmm7, %xmm8 174 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 175 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 176 - pxor %xmm8, %xmm7 177 - movdqu (buf), %xmm0 178 - pshufb BSWAP_MASK, %xmm0 179 - pxor %xmm0 , %xmm7 180 - add $16, buf 181 - sub $16, len 182 - jge .Lfold_16_bytes_loop 183 - 184 - .Lfold_16_bytes_loop_done: 185 - # Add 16 to get the correct number of data bytes remaining in 0...15 186 - # (not counting xmm7), following the previous extra subtraction by 16. 
187 - add $16, len 188 - je .Lreduce_final_16_bytes 189 - 190 - .Lhandle_partial_segment: 191 - # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 192 - # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do 193 - # this without needing a fold constant for each possible 'len', redivide 194 - # the bytes into a first chunk of 'len' bytes and a second chunk of 16 195 - # bytes, then fold the first chunk into the second. 196 - 197 - movdqa %xmm7, %xmm2 198 - 199 - # xmm1 = last 16 original data bytes 200 - movdqu -16(buf, len), %xmm1 201 - pshufb BSWAP_MASK, %xmm1 202 - 203 - # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. 204 - lea .Lbyteshift_table+16(%rip), %rax 205 - sub len, %rax 206 - movdqu (%rax), %xmm0 207 - pshufb %xmm0, %xmm2 208 - 209 - # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. 210 - pxor .Lmask1(%rip), %xmm0 211 - pshufb %xmm0, %xmm7 212 - 213 - # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), 214 - # then '16-len' bytes from xmm2 (high-order bytes). 215 - pblendvb %xmm2, %xmm1 #xmm0 is implicit 216 - 217 - # Fold the first chunk into the second chunk, storing the result in xmm7. 218 - movdqa %xmm7, %xmm8 219 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 220 - pclmulqdq $0x00, FOLD_CONSTS, %xmm8 221 - pxor %xmm8, %xmm7 222 - pxor %xmm1, %xmm7 223 - 224 - .Lreduce_final_16_bytes: 225 - # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC 226 - 227 - # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. 228 - movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS 229 - 230 - # Fold the high 64 bits into the low 64 bits, while also multiplying by 231 - # x^64. This produces a 128-bit value congruent to x^64 * M(x) and 232 - # whose low 48 bits are 0. 
233 - movdqa %xmm7, %xmm0 234 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) 235 - pslldq $8, %xmm0 236 - pxor %xmm0, %xmm7 # + low bits * x^64 237 - 238 - # Fold the high 32 bits into the low 96 bits. This produces a 96-bit 239 - # value congruent to x^64 * M(x) and whose low 48 bits are 0. 240 - movdqa %xmm7, %xmm0 241 - pand .Lmask2(%rip), %xmm0 # zero high 32 bits 242 - psrldq $12, %xmm7 # extract high 32 bits 243 - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) 244 - pxor %xmm0, %xmm7 # + low bits 245 - 246 - # Load G(x) and floor(x^48 / G(x)). 247 - movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS 248 - 249 - # Use Barrett reduction to compute the final CRC value. 250 - movdqa %xmm7, %xmm0 251 - pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) 252 - psrlq $32, %xmm7 # /= x^32 253 - pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) 254 - psrlq $48, %xmm0 255 - pxor %xmm7, %xmm0 # + low 16 nonzero bits 256 - # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. 257 - 258 - pextrw $0, %xmm0, %eax 259 - RET 260 - 261 - .align 16 262 - .Lless_than_256_bytes: 263 - # Checksumming a buffer of length 16...255 bytes 264 - 265 - # Load the first 16 data bytes. 266 - movdqu (buf), %xmm7 267 - pshufb BSWAP_MASK, %xmm7 268 - add $16, buf 269 - 270 - # XOR the first 16 data *bits* with the initial CRC value. 
271 - pxor %xmm0, %xmm0 272 - pinsrw $7, init_crc, %xmm0 273 - pxor %xmm0, %xmm7 274 - 275 - movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS 276 - cmp $16, len 277 - je .Lreduce_final_16_bytes # len == 16 278 - sub $32, len 279 - jge .Lfold_16_bytes_loop # 32 <= len <= 255 280 - add $16, len 281 - jmp .Lhandle_partial_segment # 17 <= len <= 31 282 - SYM_FUNC_END(crc_t10dif_pcl) 283 - 284 - .section .rodata, "a", @progbits 285 - .align 16 286 - 287 - # Fold constants precomputed from the polynomial 0x18bb7 288 - # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 289 - .Lfold_across_128_bytes_consts: 290 - .quad 0x0000000000006123 # x^(8*128) mod G(x) 291 - .quad 0x0000000000002295 # x^(8*128+64) mod G(x) 292 - .Lfold_across_64_bytes_consts: 293 - .quad 0x0000000000001069 # x^(4*128) mod G(x) 294 - .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) 295 - .Lfold_across_32_bytes_consts: 296 - .quad 0x000000000000857d # x^(2*128) mod G(x) 297 - .quad 0x0000000000007acc # x^(2*128+64) mod G(x) 298 - .Lfold_across_16_bytes_consts: 299 - .quad 0x000000000000a010 # x^(1*128) mod G(x) 300 - .quad 0x0000000000001faa # x^(1*128+64) mod G(x) 301 - .Lfinal_fold_consts: 302 - .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) 303 - .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) 304 - .Lbarrett_reduction_consts: 305 - .quad 0x0000000000018bb7 # G(x) 306 - .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) 307 - 308 - .section .rodata.cst16.mask1, "aM", @progbits, 16 309 - .align 16 310 - .Lmask1: 311 - .octa 0x80808080808080808080808080808080 312 - 313 - .section .rodata.cst16.mask2, "aM", @progbits, 16 314 - .align 16 315 - .Lmask2: 316 - .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF 317 - 318 - .section .rodata.cst16.bswap_mask, "aM", @progbits, 16 319 - .align 16 320 - .Lbswap_mask: 321 - .octa 0x000102030405060708090A0B0C0D0E0F 322 - 323 - .section .rodata.cst32.byteshift_table, "aM", @progbits, 32 324 - .align 16 325 - # For 1 <= len <= 15, the 16-byte 
vector beginning at &byteshift_table[16 - len] 326 - # is the index vector to shift left by 'len' bytes, and is also {0x80, ..., 327 - # 0x80} XOR the index vector to shift right by '16 - len' bytes. 328 - .Lbyteshift_table: 329 - .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 330 - .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f 331 - .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 332 - .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0