Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

lib/crypto: x86/ghash: Migrate optimized code into library

Remove the "ghash-pclmulqdqni" crypto_shash algorithm. Move the
corresponding assembly code into lib/crypto/, and wire it up to the
GHASH library.

This optimizes the GHASH library with x86's carryless multiplication
instructions. It also greatly reduces the amount of x86-specific glue
code that is needed, and it fixes the issue where this GHASH optimization
was disabled by default.

Rename and adjust the prototypes of the assembly functions to make them
fit better with the library. Remove the byte-swaps (pshufb
instructions) that are no longer necessary because the library keeps the
accumulator in POLYVAL format rather than GHASH format.

Rename clmul_ghash_mul() to polyval_mul_pclmul() to reflect that it
really does a POLYVAL-style multiplication. Wire it up to both
ghash_mul_arch() and polyval_mul_arch().

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260319061723.1140720-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>

+104 -238
-10
arch/x86/crypto/Kconfig
··· 344 344 345 345 If unsure, say N. 346 346 347 - config CRYPTO_GHASH_CLMUL_NI_INTEL 348 - tristate "Hash functions: GHASH (CLMUL-NI)" 349 - depends on 64BIT 350 - select CRYPTO_CRYPTD 351 - help 352 - GCM GHASH hash function (NIST SP800-38D) 353 - 354 - Architecture: x86_64 using: 355 - - CLMUL-NI (carry-less multiplication new instructions) 356 - 357 347 endmenu
-3
arch/x86/crypto/Makefile
··· 50 50 aes-gcm-vaes-avx512.o \ 51 51 aes-xts-avx-x86_64.o 52 52 53 - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 54 - ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 55 - 56 53 obj-$(CONFIG_CRYPTO_SM3_AVX_X86_64) += sm3-avx-x86_64.o 57 54 sm3-avx-x86_64-y := sm3-avx-asm_64.o sm3_avx_glue.o 58 55
+46 -52
arch/x86/crypto/ghash-clmulni-intel_asm.S lib/crypto/x86/ghash-pclmul.S
··· 21 21 .Lbswap_mask: 22 22 .octa 0x000102030405060708090a0b0c0d0e0f 23 23 24 - #define DATA %xmm0 25 - #define SHASH %xmm1 24 + #define ACC %xmm0 25 + #define KEY %xmm1 26 26 #define T1 %xmm2 27 27 #define T2 %xmm3 28 28 #define T3 %xmm4 ··· 34 34 /* 35 35 * __clmul_gf128mul_ble: internal ABI 36 36 * input: 37 - * DATA: operand1 38 - * SHASH: operand2, hash_key << 1 mod poly 37 + * ACC: operand1 38 + * KEY: operand2, hash_key << 1 mod poly 39 39 * output: 40 - * DATA: operand1 * operand2 mod poly 40 + * ACC: operand1 * operand2 mod poly 41 41 * changed: 42 42 * T1 43 43 * T2 44 44 * T3 45 45 */ 46 46 SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) 47 - movaps DATA, T1 48 - pshufd $0b01001110, DATA, T2 49 - pshufd $0b01001110, SHASH, T3 50 - pxor DATA, T2 51 - pxor SHASH, T3 47 + movaps ACC, T1 48 + pshufd $0b01001110, ACC, T2 49 + pshufd $0b01001110, KEY, T3 50 + pxor ACC, T2 51 + pxor KEY, T3 52 52 53 - pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 54 - pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 53 + pclmulqdq $0x00, KEY, ACC # ACC = a0 * b0 54 + pclmulqdq $0x11, KEY, T1 # T1 = a1 * b1 55 55 pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) 56 - pxor DATA, T2 56 + pxor ACC, T2 57 57 pxor T1, T2 # T2 = a0 * b1 + a1 * b0 58 58 59 59 movaps T2, T3 60 60 pslldq $8, T3 61 61 psrldq $8, T2 62 - pxor T3, DATA 63 - pxor T2, T1 # <T1:DATA> is result of 62 + pxor T3, ACC 63 + pxor T2, T1 # <T1:ACC> is result of 64 64 # carry-less multiplication 65 65 66 66 # first phase of the reduction 67 - movaps DATA, T3 67 + movaps ACC, T3 68 68 psllq $1, T3 69 - pxor DATA, T3 69 + pxor ACC, T3 70 70 psllq $5, T3 71 - pxor DATA, T3 71 + pxor ACC, T3 72 72 psllq $57, T3 73 73 movaps T3, T2 74 74 pslldq $8, T2 75 75 psrldq $8, T3 76 - pxor T2, DATA 76 + pxor T2, ACC 77 77 pxor T3, T1 78 78 79 79 # second phase of the reduction 80 - movaps DATA, T2 80 + movaps ACC, T2 81 81 psrlq $5, T2 82 - pxor DATA, T2 82 + pxor ACC, T2 83 83 psrlq $1, T2 84 - pxor DATA, T2 84 + pxor ACC, T2 85 85 psrlq 
$1, T2 86 86 pxor T2, T1 87 - pxor T1, DATA 87 + pxor T1, ACC 88 88 RET 89 89 SYM_FUNC_END(__clmul_gf128mul_ble) 90 90 91 - /* void clmul_ghash_mul(char *dst, const le128 *shash) */ 92 - SYM_FUNC_START(clmul_ghash_mul) 91 + /* 92 + * void polyval_mul_pclmul(struct polyval_elem *a, 93 + * const struct polyval_elem *b) 94 + */ 95 + SYM_FUNC_START(polyval_mul_pclmul) 93 96 FRAME_BEGIN 94 - movups (%rdi), DATA 95 - movups (%rsi), SHASH 96 - movaps .Lbswap_mask(%rip), BSWAP 97 - pshufb BSWAP, DATA 97 + movups (%rdi), ACC 98 + movups (%rsi), KEY 98 99 call __clmul_gf128mul_ble 99 - pshufb BSWAP, DATA 100 - movups DATA, (%rdi) 100 + movups ACC, (%rdi) 101 101 FRAME_END 102 102 RET 103 - SYM_FUNC_END(clmul_ghash_mul) 103 + SYM_FUNC_END(polyval_mul_pclmul) 104 104 105 105 /* 106 - * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen, 107 - * const le128 *shash); 106 + * void ghash_blocks_pclmul(struct polyval_elem *acc, 107 + * const struct polyval_elem *key, 108 + * const u8 *data, size_t nblocks) 108 109 */ 109 - SYM_FUNC_START(clmul_ghash_update) 110 + SYM_FUNC_START(ghash_blocks_pclmul) 110 111 FRAME_BEGIN 111 - cmp $16, %rdx 112 - jb .Lupdate_just_ret # check length 113 112 movaps .Lbswap_mask(%rip), BSWAP 114 - movups (%rdi), DATA 115 - movups (%rcx), SHASH 116 - pshufb BSWAP, DATA 113 + movups (%rdi), ACC 114 + movups (%rsi), KEY 117 115 .align 4 118 - .Lupdate_loop: 119 - movups (%rsi), IN1 116 + .Lnext_block: 117 + movups (%rdx), IN1 120 118 pshufb BSWAP, IN1 121 - pxor IN1, DATA 119 + pxor IN1, ACC 122 120 call __clmul_gf128mul_ble 123 - sub $16, %rdx 124 - add $16, %rsi 125 - cmp $16, %rdx 126 - jge .Lupdate_loop 127 - pshufb BSWAP, DATA 128 - movups DATA, (%rdi) 129 - .Lupdate_just_ret: 130 - mov %rdx, %rax 121 + add $16, %rdx 122 + dec %rcx 123 + jnz .Lnext_block 124 + movups ACC, (%rdi) 131 125 FRAME_END 132 126 RET 133 - SYM_FUNC_END(clmul_ghash_update) 127 + SYM_FUNC_END(ghash_blocks_pclmul)
-163
arch/x86/crypto/ghash-clmulni-intel_glue.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - /* 3 - * Accelerated GHASH implementation with Intel PCLMULQDQ-NI 4 - * instructions. This file contains glue code. 5 - * 6 - * Copyright (c) 2009 Intel Corp. 7 - * Author: Huang Ying <ying.huang@intel.com> 8 - */ 9 - 10 - #include <asm/cpu_device_id.h> 11 - #include <asm/simd.h> 12 - #include <crypto/b128ops.h> 13 - #include <crypto/ghash.h> 14 - #include <crypto/internal/hash.h> 15 - #include <crypto/utils.h> 16 - #include <linux/errno.h> 17 - #include <linux/kernel.h> 18 - #include <linux/module.h> 19 - #include <linux/string.h> 20 - #include <linux/unaligned.h> 21 - 22 - asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash); 23 - 24 - asmlinkage int clmul_ghash_update(char *dst, const char *src, 25 - unsigned int srclen, const le128 *shash); 26 - 27 - struct x86_ghash_ctx { 28 - le128 shash; 29 - }; 30 - 31 - static int ghash_init(struct shash_desc *desc) 32 - { 33 - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); 34 - 35 - memset(dctx, 0, sizeof(*dctx)); 36 - 37 - return 0; 38 - } 39 - 40 - static int ghash_setkey(struct crypto_shash *tfm, 41 - const u8 *key, unsigned int keylen) 42 - { 43 - struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm); 44 - u64 a, b; 45 - 46 - if (keylen != GHASH_BLOCK_SIZE) 47 - return -EINVAL; 48 - 49 - /* 50 - * GHASH maps bits to polynomial coefficients backwards, which makes it 51 - * hard to implement. But it can be shown that the GHASH multiplication 52 - * 53 - * D * K (mod x^128 + x^7 + x^2 + x + 1) 54 - * 55 - * (where D is a data block and K is the key) is equivalent to: 56 - * 57 - * bitreflect(D) * bitreflect(K) * x^(-127) 58 - * (mod x^128 + x^127 + x^126 + x^121 + 1) 59 - * 60 - * So, the code below precomputes: 61 - * 62 - * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1) 63 - * 64 - * ... but in Montgomery form (so that Montgomery multiplication can be 65 - * used), i.e. 
with an extra x^128 factor, which means actually: 66 - * 67 - * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1) 68 - * 69 - * The within-a-byte part of bitreflect() cancels out GHASH's built-in 70 - * reflection, and thus bitreflect() is actually a byteswap. 71 - */ 72 - a = get_unaligned_be64(key); 73 - b = get_unaligned_be64(key + 8); 74 - ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63)); 75 - ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63)); 76 - if (a >> 63) 77 - ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56); 78 - return 0; 79 - } 80 - 81 - static int ghash_update(struct shash_desc *desc, 82 - const u8 *src, unsigned int srclen) 83 - { 84 - struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); 85 - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); 86 - u8 *dst = dctx->buffer; 87 - int remain; 88 - 89 - kernel_fpu_begin(); 90 - remain = clmul_ghash_update(dst, src, srclen, &ctx->shash); 91 - kernel_fpu_end(); 92 - return remain; 93 - } 94 - 95 - static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx, 96 - const u8 *src, unsigned int len) 97 - { 98 - u8 *dst = dctx->buffer; 99 - 100 - kernel_fpu_begin(); 101 - if (len) { 102 - crypto_xor(dst, src, len); 103 - clmul_ghash_mul(dst, &ctx->shash); 104 - } 105 - kernel_fpu_end(); 106 - } 107 - 108 - static int ghash_finup(struct shash_desc *desc, const u8 *src, 109 - unsigned int len, u8 *dst) 110 - { 111 - struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); 112 - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); 113 - u8 *buf = dctx->buffer; 114 - 115 - ghash_flush(ctx, dctx, src, len); 116 - memcpy(dst, buf, GHASH_BLOCK_SIZE); 117 - 118 - return 0; 119 - } 120 - 121 - static struct shash_alg ghash_alg = { 122 - .digestsize = GHASH_DIGEST_SIZE, 123 - .init = ghash_init, 124 - .update = ghash_update, 125 - .finup = ghash_finup, 126 - .setkey = ghash_setkey, 127 - .descsize = sizeof(struct ghash_desc_ctx), 128 - .base = { 129 - .cra_name = "ghash", 130 - .cra_driver_name = 
"ghash-pclmulqdqni", 131 - .cra_priority = 400, 132 - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, 133 - .cra_blocksize = GHASH_BLOCK_SIZE, 134 - .cra_ctxsize = sizeof(struct x86_ghash_ctx), 135 - .cra_module = THIS_MODULE, 136 - }, 137 - }; 138 - 139 - static const struct x86_cpu_id pcmul_cpu_id[] = { 140 - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ 141 - {} 142 - }; 143 - MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); 144 - 145 - static int __init ghash_pclmulqdqni_mod_init(void) 146 - { 147 - if (!x86_match_cpu(pcmul_cpu_id)) 148 - return -ENODEV; 149 - 150 - return crypto_register_shash(&ghash_alg); 151 - } 152 - 153 - static void __exit ghash_pclmulqdqni_mod_exit(void) 154 - { 155 - crypto_unregister_shash(&ghash_alg); 156 - } 157 - 158 - module_init(ghash_pclmulqdqni_mod_init); 159 - module_exit(ghash_pclmulqdqni_mod_exit); 160 - 161 - MODULE_LICENSE("GPL"); 162 - MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI"); 163 - MODULE_ALIAS_CRYPTO("ghash");
+2 -1
lib/crypto/Makefile
··· 174 174 endif 175 175 176 176 libgf128hash-$(CONFIG_RISCV) += riscv/ghash-riscv64-zvkg.o 177 - libgf128hash-$(CONFIG_X86) += x86/polyval-pclmul-avx.o 177 + libgf128hash-$(CONFIG_X86) += x86/ghash-pclmul.o \ 178 + x86/polyval-pclmul-avx.o 178 179 endif # CONFIG_CRYPTO_LIB_GF128HASH_ARCH 179 180 180 181 # clean-files must be defined unconditionally
+56 -9
lib/crypto/x86/gf128hash.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 2 /* 3 - * POLYVAL library functions, x86_64 optimized 3 + * GHASH and POLYVAL, x86_64 optimized 4 4 * 5 5 * Copyright 2025 Google LLC 6 6 */ ··· 9 9 10 10 #define NUM_H_POWERS 8 11 11 12 + static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul); 12 13 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx); 13 14 15 + asmlinkage void polyval_mul_pclmul(struct polyval_elem *a, 16 + const struct polyval_elem *b); 14 17 asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a, 15 18 const struct polyval_elem *b); 19 + 20 + asmlinkage void ghash_blocks_pclmul(struct polyval_elem *acc, 21 + const struct polyval_elem *key, 22 + const u8 *data, size_t nblocks); 16 23 asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc, 17 24 const struct polyval_key *key, 18 25 const u8 *data, size_t nblocks); ··· 48 41 } 49 42 } 50 43 44 + static void polyval_mul_x86(struct polyval_elem *a, 45 + const struct polyval_elem *b) 46 + { 47 + if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) { 48 + kernel_fpu_begin(); 49 + if (static_branch_likely(&have_pclmul_avx)) 50 + polyval_mul_pclmul_avx(a, b); 51 + else 52 + polyval_mul_pclmul(a, b); 53 + kernel_fpu_end(); 54 + } else { 55 + polyval_mul_generic(a, b); 56 + } 57 + } 58 + 59 + #define ghash_mul_arch ghash_mul_arch 60 + static void ghash_mul_arch(struct polyval_elem *acc, 61 + const struct ghash_key *key) 62 + { 63 + polyval_mul_x86(acc, &key->h); 64 + } 65 + 51 66 #define polyval_mul_arch polyval_mul_arch 52 67 static void polyval_mul_arch(struct polyval_elem *acc, 53 68 const struct polyval_key *key) 54 69 { 55 - if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { 56 - kernel_fpu_begin(); 57 - polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]); 58 - kernel_fpu_end(); 70 + polyval_mul_x86(acc, &key->h_powers[NUM_H_POWERS - 1]); 71 + } 72 + 73 + #define ghash_blocks_arch ghash_blocks_arch 74 + static void 
ghash_blocks_arch(struct polyval_elem *acc, 75 + const struct ghash_key *key, 76 + const u8 *data, size_t nblocks) 77 + { 78 + if (static_branch_likely(&have_pclmul) && irq_fpu_usable()) { 79 + do { 80 + /* Allow rescheduling every 4 KiB. */ 81 + size_t n = min_t(size_t, nblocks, 82 + 4096 / GHASH_BLOCK_SIZE); 83 + 84 + kernel_fpu_begin(); 85 + ghash_blocks_pclmul(acc, &key->h, data, n); 86 + kernel_fpu_end(); 87 + data += n * GHASH_BLOCK_SIZE; 88 + nblocks -= n; 89 + } while (nblocks); 59 90 } else { 60 - polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]); 91 + ghash_blocks_generic(acc, &key->h, data, nblocks); 61 92 } 62 93 } 63 94 ··· 125 80 #define gf128hash_mod_init_arch gf128hash_mod_init_arch 126 81 static void gf128hash_mod_init_arch(void) 127 82 { 128 - if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) && 129 - boot_cpu_has(X86_FEATURE_AVX)) 130 - static_branch_enable(&have_pclmul_avx); 83 + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { 84 + static_branch_enable(&have_pclmul); 85 + if (boot_cpu_has(X86_FEATURE_AVX)) 86 + static_branch_enable(&have_pclmul_avx); 87 + } 131 88 }