riscv/crc: add "template" for Zbc optimized CRC functions

+319 -1

2 changed files

expand all

arch

riscv

lib

crc-clmul-template.h

scripts

gen-crc-consts.py

+265

arch/riscv/lib/crc-clmul-template.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* Copyright 2025 Google LLC */ 3 + 4 + /* 5 + * This file is a "template" that generates a CRC function optimized using the 6 + * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this 7 + * file must define the following parameters to specify the type of CRC: 8 + * 9 + * crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC 10 + * LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural 11 + * mapping between bits and polynomial coefficients 12 + * 1 for a lsb (least-significant-bit) first CRC, i.e. reflected 13 + * mapping between bits and polynomial coefficients 14 + */ 15 + 16 + #include <asm/byteorder.h> 17 + #include <linux/minmax.h> 18 + 19 + #define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */ 20 + 21 + static inline unsigned long clmul(unsigned long a, unsigned long b) 22 + { 23 + unsigned long res; 24 + 25 + asm(".option push\n" 26 + ".option arch,+zbc\n" 27 + "clmul %0, %1, %2\n" 28 + ".option pop\n" 29 + : "=r" (res) : "r" (a), "r" (b)); 30 + return res; 31 + } 32 + 33 + static inline unsigned long clmulh(unsigned long a, unsigned long b) 34 + { 35 + unsigned long res; 36 + 37 + asm(".option push\n" 38 + ".option arch,+zbc\n" 39 + "clmulh %0, %1, %2\n" 40 + ".option pop\n" 41 + : "=r" (res) : "r" (a), "r" (b)); 42 + return res; 43 + } 44 + 45 + static inline unsigned long clmulr(unsigned long a, unsigned long b) 46 + { 47 + unsigned long res; 48 + 49 + asm(".option push\n" 50 + ".option arch,+zbc\n" 51 + "clmulr %0, %1, %2\n" 52 + ".option pop\n" 53 + : "=r" (res) : "r" (a), "r" (b)); 54 + return res; 55 + } 56 + 57 + /* 58 + * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a 59 + * polynomial whose bit order matches the CRC's bit order. 60 + */ 61 + #ifdef CONFIG_64BIT 62 + # if LSB_CRC 63 + # define crc_load_long(x) le64_to_cpup(x) 64 + # else 65 + # define crc_load_long(x) be64_to_cpup(x) 66 + # endif 67 + #else 68 + # if LSB_CRC 69 + # define crc_load_long(x) le32_to_cpup(x) 70 + # else 71 + # define crc_load_long(x) be32_to_cpup(x) 72 + # endif 73 + #endif 74 + 75 + /* XOR @crc into the end of @msgpoly that represents the high-order terms. */ 76 + static inline unsigned long 77 + crc_clmul_prep(crc_t crc, unsigned long msgpoly) 78 + { 79 + #if LSB_CRC 80 + return msgpoly ^ crc; 81 + #else 82 + return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS)); 83 + #endif 84 + } 85 + 86 + /* 87 + * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it 88 + * modulo the generator polynomial G. This gives the CRC of @msgpoly. 89 + */ 90 + static inline crc_t 91 + crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts) 92 + { 93 + unsigned long tmp; 94 + 95 + /* 96 + * First step of Barrett reduction with integrated multiplication by 97 + * x^n: calculate floor((msgpoly * x^n) / G). This is the value by 98 + * which G needs to be multiplied to cancel out the x^n and higher terms 99 + * of msgpoly * x^n. Do it using the following formula: 100 + * 101 + * msb-first: 102 + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1)) 103 + * lsb-first: 104 + * floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG) 105 + * 106 + * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G), 107 + * which fits a long exactly. Using any lower power of x there would 108 + * not carry enough precision through the calculation, while using any 109 + * higher power of x would require extra instructions to handle a wider 110 + * multiplication. In the msb-first case, using this power of x results 111 + * in needing a floored division by x^(BITS_PER_LONG-1), which matches 112 + * what clmulr produces. In the lsb-first case, a factor of x gets 113 + * implicitly introduced by each carryless multiplication (shown as 114 + * '* x' above), and the floored division instead needs to be by 115 + * x^BITS_PER_LONG which matches what clmul produces. 116 + */ 117 + #if LSB_CRC 118 + tmp = clmul(msgpoly, consts->barrett_reduction_const_1); 119 + #else 120 + tmp = clmulr(msgpoly, consts->barrett_reduction_const_1); 121 + #endif 122 + 123 + /* 124 + * Second step of Barrett reduction: 125 + * 126 + * crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G)) 127 + * 128 + * This reduces (msgpoly * x^n) modulo G by adding the appropriate 129 + * multiple of G to it. The result uses only the x^0..x^(n-1) terms. 130 + * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those 131 + * terms in the first place, it is more efficient to do the equivalent: 132 + * 133 + * crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n 134 + * 135 + * In the lsb-first case further modify it to the following which avoids 136 + * a shift, as the crc ends up in the physically low n bits from clmulr: 137 + * 138 + * product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x 139 + * crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n 140 + * 141 + * barrett_reduction_const_2 contains the constant multiplier (G - x^n) 142 + * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The 143 + * cast of the result to crc_t is essential, as it applies the mod x^n! 144 + */ 145 + #if LSB_CRC 146 + return clmulr(tmp, consts->barrett_reduction_const_2); 147 + #else 148 + return clmul(tmp, consts->barrett_reduction_const_2); 149 + #endif 150 + } 151 + 152 + /* Update @crc with the data from @msgpoly. */ 153 + static inline crc_t 154 + crc_clmul_update_long(crc_t crc, unsigned long msgpoly, 155 + const struct crc_clmul_consts *consts) 156 + { 157 + return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts); 158 + } 159 + 160 + /* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */ 161 + static inline crc_t 162 + crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len, 163 + const struct crc_clmul_consts *consts) 164 + { 165 + unsigned long msgpoly; 166 + size_t i; 167 + 168 + #if LSB_CRC 169 + msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8); 170 + for (i = 1; i < len; i++) 171 + msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8)); 172 + #else 173 + msgpoly = p[0]; 174 + for (i = 1; i < len; i++) 175 + msgpoly = (msgpoly << 8) ^ p[i]; 176 + #endif 177 + 178 + if (len >= sizeof(crc_t)) { 179 + #if LSB_CRC 180 + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); 181 + #else 182 + msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS); 183 + #endif 184 + return crc_clmul_long(msgpoly, consts); 185 + } 186 + #if LSB_CRC 187 + msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len); 188 + return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len)); 189 + #else 190 + msgpoly ^= crc >> (CRC_BITS - 8*len); 191 + return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len)); 192 + #endif 193 + } 194 + 195 + static inline crc_t 196 + crc_clmul(crc_t crc, const void *p, size_t len, 197 + const struct crc_clmul_consts *consts) 198 + { 199 + size_t align; 200 + 201 + /* This implementation assumes that the CRC fits in an unsigned long. */ 202 + BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long)); 203 + 204 + /* If the buffer is not long-aligned, align it. */ 205 + align = (unsigned long)p % sizeof(unsigned long); 206 + if (align && len) { 207 + align = min(sizeof(unsigned long) - align, len); 208 + crc = crc_clmul_update_partial(crc, p, align, consts); 209 + p += align; 210 + len -= align; 211 + } 212 + 213 + if (len >= 4 * sizeof(unsigned long)) { 214 + unsigned long m0, m1; 215 + 216 + m0 = crc_clmul_prep(crc, crc_load_long(p)); 217 + m1 = crc_load_long(p + sizeof(unsigned long)); 218 + p += 2 * sizeof(unsigned long); 219 + len -= 2 * sizeof(unsigned long); 220 + /* 221 + * Main loop. Each iteration starts with a message polynomial 222 + * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two 223 + * more longs of data to form x^(3*BITS_PER_LONG)*m0 + 224 + * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then 225 + * "folds" that back into a congruent (modulo G) value that uses 226 + * just m0 and m1 again. This is done by multiplying m0 by the 227 + * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by 228 + * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then 229 + * adding the results to m2 and m3 as appropriate. Each such 230 + * multiplication produces a result twice the length of a long, 231 + * which in RISC-V is two instructions clmul and clmulh. 232 + * 233 + * This could be changed to fold across more than 2 longs at a 234 + * time if there is a CPU that can take advantage of it. 235 + */ 236 + do { 237 + unsigned long p0, p1, p2, p3; 238 + 239 + p0 = clmulh(m0, consts->fold_across_2_longs_const_hi); 240 + p1 = clmul(m0, consts->fold_across_2_longs_const_hi); 241 + p2 = clmulh(m1, consts->fold_across_2_longs_const_lo); 242 + p3 = clmul(m1, consts->fold_across_2_longs_const_lo); 243 + m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p); 244 + m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^ 245 + crc_load_long(p + sizeof(unsigned long)); 246 + 247 + p += 2 * sizeof(unsigned long); 248 + len -= 2 * sizeof(unsigned long); 249 + } while (len >= 2 * sizeof(unsigned long)); 250 + 251 + crc = crc_clmul_long(m0, consts); 252 + crc = crc_clmul_update_long(crc, m1, consts); 253 + } 254 + 255 + while (len >= sizeof(unsigned long)) { 256 + crc = crc_clmul_update_long(crc, crc_load_long(p), consts); 257 + p += sizeof(unsigned long); 258 + len -= sizeof(unsigned long); 259 + } 260 + 261 + if (len) 262 + crc = crc_clmul_update_partial(crc, p, len, consts); 263 + 264 + return crc; 265 + }

+54 -1

scripts/gen-crc-consts.py

··· 105 105 print(f'\t{s}') 106 106 print('};') 107 107 108 + def print_riscv_const(v, bits_per_long, name, val, desc): 109 + print(f'\t.{name} = {fmt_poly(v, val, bits_per_long)}, /* {desc} */') 110 + 111 + def do_gen_riscv_clmul_consts(v, bits_per_long): 112 + (G, n, lsb) = (v.G, v.bits, v.lsb) 113 + 114 + pow_of_x = 3 * bits_per_long - (1 if lsb else 0) 115 + print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_hi', 116 + reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G') 117 + pow_of_x = 2 * bits_per_long - (1 if lsb else 0) 118 + print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_lo', 119 + reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G') 120 + 121 + pow_of_x = bits_per_long - 1 + n 122 + print_riscv_const(v, bits_per_long, 'barrett_reduction_const_1', 123 + div(1 << pow_of_x, G), f'floor(x^{pow_of_x} / G)') 124 + 125 + val = G - (1 << n) 126 + desc = f'G - x^{n}' 127 + if lsb: 128 + val <<= bits_per_long - n 129 + desc = f'({desc}) * x^{bits_per_long - n}' 130 + print_riscv_const(v, bits_per_long, 'barrett_reduction_const_2', val, desc) 131 + 132 + def gen_riscv_clmul_consts(variants): 133 + print('') 134 + print('struct crc_clmul_consts {'); 135 + print('\tunsigned long fold_across_2_longs_const_hi;'); 136 + print('\tunsigned long fold_across_2_longs_const_lo;'); 137 + print('\tunsigned long barrett_reduction_const_1;'); 138 + print('\tunsigned long barrett_reduction_const_2;'); 139 + print('};'); 140 + for v in variants: 141 + print(''); 142 + if v.bits > 32: 143 + print_header(v, 'Constants') 144 + print('#ifdef CONFIG_64BIT') 145 + print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{') 146 + do_gen_riscv_clmul_consts(v, 64) 147 + print('};') 148 + print('#endif') 149 + else: 150 + print_header(v, 'Constants') 151 + print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{') 152 + print('#ifdef CONFIG_64BIT') 153 + do_gen_riscv_clmul_consts(v, 64) 154 + print('#else') 155 + do_gen_riscv_clmul_consts(v, 32) 156 + print('#endif') 157 + print('};') 158 + 108 159 # Generate constants for carryless multiplication based CRC computation. 109 160 def gen_x86_pclmul_consts(variants): 110 161 # These are the distances, in bits, to generate folding constants for. ··· 264 213 265 214 if len(sys.argv) != 3: 266 215 sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n') 267 - sys.stderr.write(' CONSTS_TYPE can be sliceby[1-8] or x86_pclmul\n') 216 + sys.stderr.write(' CONSTS_TYPE can be sliceby[1-8], riscv_clmul, or x86_pclmul\n') 268 217 sys.stderr.write(' CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n') 269 218 sys.stderr.write(' E.g. crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n') 270 219 sys.stderr.write(' Polynomial must use the given bit_order and exclude x^{num_bits}\n') ··· 283 232 for consts_type in consts_types: 284 233 if consts_type.startswith('sliceby'): 285 234 gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby'))) 235 + elif consts_type == 'riscv_clmul': 236 + gen_riscv_clmul_consts(variants) 286 237 elif consts_type == 'x86_pclmul': 287 238 gen_x86_pclmul_consts(variants) 288 239 else:

Configure Feed

Configure Feed