lib/crypto: x86/aes: Add AES-NI optimization

Optimize the AES library with x86 AES-NI instructions.

The relevant existing assembly functions, aesni_set_key(), aesni_enc(),
and aesni_dec(), are a bit difficult to extract into the library:

- They're coupled to the code for the AES modes.
- They operate on struct crypto_aes_ctx. The AES library now uses
different structs.
- They assume the key is 16-byte aligned. The AES library only
*prefers* 16-byte alignment; it doesn't require it.

Moreover, they're not all that great in the first place:

- They use unrolled loops, which isn't a great choice on x86.
- They use the 'aeskeygenassist' instruction, which is unnecessary, is
slow on Intel CPUs, and forces the loop to be unrolled.
- They have special code for AES-192 key expansion, even though AES-192
is rarely used; AES-128 and AES-256 are the key sizes used in practice.

These are small functions anyway.

Therefore, I opted to just write replacements for these functions in the
library. The replacements address all of the issues above.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20260112192035.10427-18-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
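
For reference, the key expansion step that these replacements perform, and that
'aeskeygenassist' would otherwise be used for, can be sketched in C with
SSE/AES-NI intrinsics roughly as follows. This is a minimal userspace
illustration (build with -mssse3 -maes), not code from this patch; the function
names here are made up for the example.

#include <immintrin.h>
#include <stdint.h>

/* [a0, a1, a2, a3] -> [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3] (dword prefix sum) */
static __m128i xor_prefix_sum(__m128i a)
{
        a = _mm_xor_si128(a, _mm_slli_si128(a, 4));
        return _mm_xor_si128(a, _mm_slli_si128(a, 8));
}

/* One AES-128 key expansion step, using aesenclast instead of aeskeygenassist */
static __m128i next_aes128_rndkey(__m128i prev, uint32_t rcon)
{
        /* Broadcast RotWord() of the last dword of the previous round key */
        const __m128i rotmask = _mm_setr_epi8(13, 14, 15, 12, 13, 14, 15, 12,
                                              13, 14, 15, 12, 13, 14, 15, 12);
        __m128i t = _mm_shuffle_epi8(prev, rotmask);

        /*
         * aesenclast does ShiftRows, SubBytes, then XOR with its second
         * operand.  ShiftRows is a no-op here because all four columns of t
         * are identical, so this is just SubWord() plus the rcon XOR.
         */
        t = _mm_aesenclast_si128(t, _mm_set1_epi32(rcon));

        /* w[i] = w[i-4] ^ t, w[i+1] = w[i-3] ^ w[i], and so on */
        return _mm_xor_si128(xor_prefix_sum(prev), t);
}

Applying this step ten times with rcon = 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
0x40, 0x80, 0x1b, 0x36 (starting from the raw 16-byte key as round key 0)
yields the full AES-128 key schedule.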

4 files changed, 348 insertions(+)

lib/crypto/Kconfig (+1)
                 RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
         default y if S390
         default y if SPARC64
+        default y if X86
 
 config CRYPTO_LIB_AESCFB
         tristate

lib/crypto/Makefile (+1)
 
 libaes-$(CONFIG_RISCV) += riscv/aes-riscv64-zvkned.o
 libaes-$(CONFIG_SPARC) += sparc/aes_asm.o
+libaes-$(CONFIG_X86) += x86/aes-aesni.o
 endif # CONFIG_CRYPTO_LIB_AES_ARCH
 
 ################################################################################

lib/crypto/x86/aes-aesni.S (new file, +261)
/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// AES block cipher using AES-NI instructions
//
// Copyright 2026 Google LLC
//
// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
// AVX. It does use up to SSE4.1, which all CPUs with AES-NI have.
#include <linux/linkage.h>

.section .rodata
#ifdef __x86_64__
#define RODATA(label) label(%rip)
#else
#define RODATA(label) label
#endif

// A mask for pshufb that extracts the last dword, rotates it right by 8
// bits, and copies the result to all four dwords.
.p2align 4
.Lmask:
        .byte 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12

// The AES round constants, used during key expansion
.Lrcon:
        .long 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36

.text

// Transform four dwords [a0, a1, a2, a3] in \a into
// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]. \tmp is a temporary xmm register.
//
// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
// if the temporary register were zero-initialized ahead of time. We instead do
// it in an easier-to-understand way that doesn't require zero-initialization
// and avoids the unusual shufps instruction. movdqa is usually "free" anyway.
.macro _prefix_sum a, tmp
        movdqa \a, \tmp                 // [a0, a1, a2, a3]
        pslldq $4, \a                   // [0, a0, a1, a2]
        pxor \tmp, \a                   // [a0, a0^a1, a1^a2, a2^a3]
        movdqa \a, \tmp
        pslldq $8, \a                   // [0, 0, a0, a0^a1]
        pxor \tmp, \a                   // [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
.endm

.macro _gen_round_key a, b
        // Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
        // the last dword of the previous round key (given in \b).
        //
        // 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
        // It is used here solely for the SubBytes and the XOR. The ShiftRows
        // is a no-op because all four columns are the same here.
        //
        // Don't use the 'aeskeygenassist' instruction, since:
        // - On most Intel CPUs it is microcoded, making it have a much higher
        //   latency and use more execution ports than 'aesenclast'.
        // - It cannot be used in a loop, since it requires an immediate.
        // - It doesn't do much more than 'aesenclast' in the first place.
        movdqa \b, %xmm2
        pshufb MASK, %xmm2
        aesenclast RCON, %xmm2

        // XOR in the prefix sum of the four dwords of \a, which is the
        // previous round key (AES-128) or the first round key in the previous
        // pair of round keys (AES-256). The result is the next round key.
        _prefix_sum \a, tmp=%xmm3
        pxor %xmm2, \a

        // Store the next round key to memory. Also leave it in \a.
        movdqu \a, (RNDKEYS)
.endm

.macro _aes_expandkey_aesni is_aes128
#ifdef __x86_64__
        // Arguments
        .set RNDKEYS, %rdi
        .set INV_RNDKEYS, %rsi
        .set IN_KEY, %rdx

        // Other local variables
        .set RCON_PTR, %rcx
        .set COUNTER, %eax
#else
        // Arguments, assuming -mregparm=3
        .set RNDKEYS, %eax
        .set INV_RNDKEYS, %edx
        .set IN_KEY, %ecx

        // Other local variables
        .set RCON_PTR, %ebx
        .set COUNTER, %esi
#endif
        .set RCON, %xmm6
        .set MASK, %xmm7

#ifdef __i386__
        push %ebx
        push %esi
#endif

        .if \is_aes128
        // AES-128: the first round key is simply a copy of the raw key.
        movdqu (IN_KEY), %xmm0
        movdqu %xmm0, (RNDKEYS)
        .else
        // AES-256: the first two round keys are simply a copy of the raw key.
        movdqu (IN_KEY), %xmm0
        movdqu %xmm0, (RNDKEYS)
        movdqu 16(IN_KEY), %xmm1
        movdqu %xmm1, 16(RNDKEYS)
        add $32, RNDKEYS
        .endif

        // Generate the remaining round keys.
        movdqa RODATA(.Lmask), MASK
        .if \is_aes128
        lea RODATA(.Lrcon), RCON_PTR
        mov $10, COUNTER
.Lgen_next_aes128_round_key:
        add $16, RNDKEYS
        movd (RCON_PTR), RCON
        pshufd $0x00, RCON, RCON
        add $4, RCON_PTR
        _gen_round_key %xmm0, %xmm0
        dec COUNTER
        jnz .Lgen_next_aes128_round_key
        .else
        // AES-256: only the first 7 round constants are needed, so instead of
        // loading each one from memory, just start by loading [1, 1, 1, 1] and
        // then generate the rest by doubling.
        pshufd $0x00, RODATA(.Lrcon), RCON
        pxor %xmm5, %xmm5               // All-zeroes
        mov $7, COUNTER
.Lgen_next_aes256_round_key_pair:
        // Generate the next AES-256 round key: either the first of a pair of
        // two, or the last one.
        _gen_round_key %xmm0, %xmm1

        dec COUNTER
        jz .Lgen_aes256_round_keys_done

        // Generate the second AES-256 round key of the pair. Compared to the
        // first, there's no rotation and no XOR of a round constant.
        pshufd $0xff, %xmm0, %xmm2      // Get four copies of last dword
        aesenclast %xmm5, %xmm2         // Just does SubBytes
        _prefix_sum %xmm1, tmp=%xmm3
        pxor %xmm2, %xmm1
        movdqu %xmm1, 16(RNDKEYS)
        add $32, RNDKEYS
        paddd RCON, RCON                // RCON <<= 1
        jmp .Lgen_next_aes256_round_key_pair
.Lgen_aes256_round_keys_done:
        .endif

        // If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
        // Inverse Cipher to it. To do that, reverse the standard round keys,
        // and apply aesimc (InvMixColumn) to each except the first and last.
        test INV_RNDKEYS, INV_RNDKEYS
        jz .Ldone\@
        movdqu (RNDKEYS), %xmm0         // Last standard round key
        movdqu %xmm0, (INV_RNDKEYS)     // => First inverse round key
        .if \is_aes128
        mov $9, COUNTER
        .else
        mov $13, COUNTER
        .endif
.Lgen_next_inv_round_key\@:
        sub $16, RNDKEYS
        add $16, INV_RNDKEYS
        movdqu (RNDKEYS), %xmm0
        aesimc %xmm0, %xmm0
        movdqu %xmm0, (INV_RNDKEYS)
        dec COUNTER
        jnz .Lgen_next_inv_round_key\@
        movdqu -16(RNDKEYS), %xmm0      // First standard round key
        movdqu %xmm0, 16(INV_RNDKEYS)   // => Last inverse round key

.Ldone\@:
#ifdef __i386__
        pop %esi
        pop %ebx
#endif
        RET
.endm

// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//                             const u8 in_key[AES_KEYSIZE_128]);
SYM_FUNC_START(aes128_expandkey_aesni)
        _aes_expandkey_aesni 1
SYM_FUNC_END(aes128_expandkey_aesni)

// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
//                             const u8 in_key[AES_KEYSIZE_256]);
SYM_FUNC_START(aes256_expandkey_aesni)
        _aes_expandkey_aesni 0
SYM_FUNC_END(aes256_expandkey_aesni)

.macro _aes_crypt_aesni enc
#ifdef __x86_64__
        .set RNDKEYS, %rdi
        .set NROUNDS, %esi
        .set OUT, %rdx
        .set IN, %rcx
#else
        // Assuming -mregparm=3
        .set RNDKEYS, %eax
        .set NROUNDS, %edx
        .set OUT, %ecx
        .set IN, %ebx                   // Passed on stack
#endif

#ifdef __i386__
        push %ebx
        mov 8(%esp), %ebx
#endif

        // Zero-th round
        movdqu (IN), %xmm0
        movdqu (RNDKEYS), %xmm1
        pxor %xmm1, %xmm0

        // Normal rounds
        add $16, RNDKEYS
        dec NROUNDS
.Lnext_round\@:
        movdqu (RNDKEYS), %xmm1
        .if \enc
        aesenc %xmm1, %xmm0
        .else
        aesdec %xmm1, %xmm0
        .endif
        add $16, RNDKEYS
        dec NROUNDS
        jne .Lnext_round\@

        // Last round
        movdqu (RNDKEYS), %xmm1
        .if \enc
        aesenclast %xmm1, %xmm0
        .else
        aesdeclast %xmm1, %xmm0
        .endif
        movdqu %xmm0, (OUT)

#ifdef __i386__
        pop %ebx
#endif
        RET
.endm

// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
//                        u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_encrypt_aesni)
        _aes_crypt_aesni 1
SYM_FUNC_END(aes_encrypt_aesni)

// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
//                        u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
SYM_FUNC_START(aes_decrypt_aesni)
        _aes_crypt_aesni 0
SYM_FUNC_END(aes_decrypt_aesni)
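
The en/decryption helpers are plain round loops over the expanded keys. A
minimal intrinsics sketch of what aes_encrypt_aesni() computes, assuming the
round keys are stored contiguously as 16-byte blocks, is shown below; it is an
illustration only, not the patch's code.

#include <immintrin.h>
#include <stdint.h>

/* nrounds is 10, 12, or 14; rndkeys holds nrounds + 1 round keys of 16 bytes */
static void aes_encrypt_sketch(const uint8_t *rndkeys, int nrounds,
                               uint8_t out[16], const uint8_t in[16])
{
        __m128i b = _mm_loadu_si128((const __m128i *)in);
        int i;

        /* Round 0: just XOR with the first round key */
        b = _mm_xor_si128(b, _mm_loadu_si128((const __m128i *)rndkeys));

        /* Rounds 1 .. nrounds - 1: full rounds */
        for (i = 1; i < nrounds; i++)
                b = _mm_aesenc_si128(b,
                        _mm_loadu_si128((const __m128i *)(rndkeys + 16 * i)));

        /* Final round: like a full round but without MixColumns */
        b = _mm_aesenclast_si128(b,
                _mm_loadu_si128((const __m128i *)(rndkeys + 16 * nrounds)));
        _mm_storeu_si128((__m128i *)out, b);
}

Decryption has the same shape with aesdec/aesdeclast, but it walks the
Equivalent Inverse Cipher round keys (inv_rndkeys) instead of the standard
ones.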

lib/crypto/x86/aes.h (new file, +85)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES block cipher using AES-NI instructions
 *
 * Copyright 2026 Google LLC
 */

#include <asm/fpu/api.h>

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);

void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
                            const u8 in_key[AES_KEYSIZE_128]);
void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
                            const u8 in_key[AES_KEYSIZE_256]);
void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
                       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
                       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);

/*
 * Expand an AES key using AES-NI if supported and usable or generic code
 * otherwise. The expanded key format is compatible between the two cases. The
 * outputs are @k->rndkeys (required) and @inv_k->inv_rndkeys (optional).
 *
 * We could just always use the generic key expansion code. AES key expansion
 * is usually less performance-critical than AES en/decryption. However,
 * there's still *some* value in speed here, as well as in non-key-dependent
 * execution time which AES-NI provides. So, do use AES-NI to expand AES-128
 * and AES-256 keys. (Don't bother with AES-192, as it's almost never used.)
 */
static void aes_preparekey_arch(union aes_enckey_arch *k,
                                union aes_invkey_arch *inv_k,
                                const u8 *in_key, int key_len, int nrounds)
{
        u32 *rndkeys = k->rndkeys;
        u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL;

        if (static_branch_likely(&have_aes) && key_len != AES_KEYSIZE_192 &&
            irq_fpu_usable()) {
                kernel_fpu_begin();
                if (key_len == AES_KEYSIZE_128)
                        aes128_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
                else
                        aes256_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
                kernel_fpu_end();
        } else {
                aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
        }
}

static void aes_encrypt_arch(const struct aes_enckey *key,
                             u8 out[AES_BLOCK_SIZE],
                             const u8 in[AES_BLOCK_SIZE])
{
        if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
                kernel_fpu_begin();
                aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in);
                kernel_fpu_end();
        } else {
                aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
        }
}

static void aes_decrypt_arch(const struct aes_key *key,
                             u8 out[AES_BLOCK_SIZE],
                             const u8 in[AES_BLOCK_SIZE])
{
        if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
                kernel_fpu_begin();
                aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds,
                                  out, in);
                kernel_fpu_end();
        } else {
                aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
                                    out, in);
        }
}

#define aes_mod_init_arch aes_mod_init_arch
static void aes_mod_init_arch(void)
{
        if (boot_cpu_has(X86_FEATURE_AES))
                static_branch_enable(&have_aes);
}
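
The inv_rndkeys consumed by aes_decrypt_aesni() are the Equivalent Inverse
Cipher round keys that the key expansion code optionally emits: the standard
round keys in reverse order, with InvMixColumns (aesimc) applied to all but the
first and last. A minimal intrinsics sketch of that derivation, illustrative
only and not from the patch:

#include <immintrin.h>

/* inv and std each hold nrounds + 1 round keys of 16 bytes */
static void derive_inv_rndkeys_sketch(__m128i inv[], const __m128i std[],
                                      int nrounds)
{
        int i;

        inv[0] = std[nrounds];            /* last standard -> first inverse */
        for (i = 1; i < nrounds; i++)     /* middle keys get InvMixColumns */
                inv[i] = _mm_aesimc_si128(std[nrounds - i]);
        inv[nrounds] = std[0];            /* first standard -> last inverse */
}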