Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'aes-gcm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull AES-GCM optimizations from Eric Biggers:
"More optimizations and cleanups for the x86_64 AES-GCM code:

- Add a VAES+AVX2 optimized implementation of AES-GCM. This is very
helpful on CPUs that have VAES but not AVX512, such as AMD Zen 3.

- Make the VAES+AVX512 optimized implementation of AES-GCM handle
large amounts of associated data efficiently.

- Remove the "avx10_256" implementation of AES-GCM. It's superseded
by the VAES+AVX2 optimized implementation.

- Rename the "avx10_512" implementation to "avx512".

Overall, this fills in a gap where AES-GCM wasn't fully optimized on
some recent CPUs. It also drops code that won't be as useful as
initially expected, because AVX10/256 was dropped from the AVX10 spec."

* tag 'aes-gcm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
crypto: x86/aes-gcm-vaes-avx2 - initialize full %rax return register
crypto: x86/aes-gcm - optimize long AAD processing with AVX512
crypto: x86/aes-gcm - optimize AVX512 precomputation of H^2 from H^1
crypto: x86/aes-gcm - revise some comments in AVX512 code
crypto: x86/aes-gcm - reorder AVX512 precompute and aad_update functions
crypto: x86/aes-gcm - clean up AVX512 code to assume 512-bit vectors
crypto: x86/aes-gcm - rename avx10 and avx10_512 to avx512
crypto: x86/aes-gcm - remove VAES+AVX10/256 optimized code
crypto: x86/aes-gcm - add VAES+AVX2 optimized code

+1663 -486
+3 -2
arch/x86/crypto/Makefile
··· 46 46 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 47 47 aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ 48 48 aes-gcm-aesni-x86_64.o \ 49 - aes-xts-avx-x86_64.o \ 50 - aes-gcm-avx10-x86_64.o 49 + aes-gcm-vaes-avx2.o \ 50 + aes-gcm-vaes-avx512.o \ 51 + aes-xts-avx-x86_64.o 51 52 52 53 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o 53 54 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+6 -6
arch/x86/crypto/aes-gcm-aesni-x86_64.S
··· 61 61 // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems 62 62 // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) 63 63 // 64 - // The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is 64 + // The design generally follows that of aes-gcm-vaes-avx512.S, and that file is 65 65 // more thoroughly commented. This file has the following notable changes: 66 66 // 67 67 // - The vector length is fixed at 128-bit, i.e. xmm registers. This means 68 68 // there is only one AES block (and GHASH block) per register. 69 69 // 70 - // - Without AVX512 / AVX10, only 16 SIMD registers are available instead of 71 - // 32. We work around this by being much more careful about using 72 - // registers, relying heavily on loads to load values as they are needed. 70 + // - Without AVX512, only 16 SIMD registers are available instead of 32. We 71 + // work around this by being much more careful about using registers, 72 + // relying heavily on loads to load values as they are needed. 73 73 // 74 74 // - Masking is not available either. We work around this by implementing 75 75 // partial block loads and stores using overlapping scalar loads and stores ··· 90 90 // multiplication instead of schoolbook multiplication. This saves one 91 91 // pclmulqdq instruction per block, at the cost of one 64-bit load, one 92 92 // pshufd, and 0.25 pxors per block. (This is without the three-argument 93 - // XOR support that would be provided by AVX512 / AVX10, which would be 94 - // more beneficial to schoolbook than Karatsuba.) 93 + // XOR support that would be provided by AVX512, which would be more 94 + // beneficial to schoolbook than Karatsuba.) 95 95 // 96 96 // As a rough approximation, we can assume that Karatsuba multiplication is 97 97 // faster than schoolbook multiplication in this context if one pshufd and
+343 -379
arch/x86/crypto/aes-gcm-avx10-x86_64.S arch/x86/crypto/aes-gcm-vaes-avx512.S
··· 1 1 /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ 2 2 // 3 - // VAES and VPCLMULQDQ optimized AES-GCM for x86_64 3 + // AES-GCM implementation for x86_64 CPUs that support the following CPU 4 + // features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 4 5 // 5 6 // Copyright 2024 Google LLC 6 7 // ··· 46 45 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 47 46 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 48 47 // POSSIBILITY OF SUCH DAMAGE. 49 - // 50 - //------------------------------------------------------------------------------ 51 - // 52 - // This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that 53 - // support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and 54 - // either AVX512 or AVX10. Some of the functions, notably the encryption and 55 - // decryption update functions which are the most performance-critical, are 56 - // provided in two variants generated from a macro: one using 256-bit vectors 57 - // (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The 58 - // other, "shared" functions (vaes_avx10) use at most 256-bit vectors. 59 - // 60 - // The functions that use 512-bit vectors are intended for CPUs that support 61 - // 512-bit vectors *and* where using them doesn't cause significant 62 - // downclocking. They require the following CPU features: 63 - // 64 - // VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) 65 - // 66 - // The other functions require the following CPU features: 67 - // 68 - // VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) 69 - // 70 - // All functions use the "System V" ABI. The Windows ABI is not supported. 71 - // 72 - // Note that we use "avx10" in the names of the functions as a shorthand to 73 - // really mean "AVX10 or a certain set of AVX512 features". 
Due to Intel's 74 - // introduction of AVX512 and then its replacement by AVX10, there doesn't seem 75 - // to be a simple way to name things that makes sense on all CPUs. 76 - // 77 - // Note that the macros that support both 256-bit and 512-bit vectors could 78 - // fairly easily be changed to support 128-bit too. However, this would *not* 79 - // be sufficient to allow the code to run on CPUs without AVX512 or AVX10, 80 - // because the code heavily uses several features of these extensions other than 81 - // the vector length: the increase in the number of SIMD registers from 16 to 82 - // 32, masking support, and new instructions such as vpternlogd (which can do a 83 - // three-argument XOR). These features are very useful for AES-GCM. 84 48 85 49 #include <linux/linkage.h> 86 50 ··· 70 104 .Lgfpoly_and_internal_carrybit: 71 105 .octa 0xc2000000000000010000000000000001 72 106 73 - // The below constants are used for incrementing the counter blocks. 74 - // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. 75 - // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and 76 - // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. 107 + // Values needed to prepare the initial vector of counter blocks. 77 108 .Lctr_pattern: 78 109 .octa 0 79 110 .octa 1 80 - .Linc_2blocks: 81 111 .octa 2 82 112 .octa 3 113 + 114 + // The number of AES blocks per vector, as a 128-bit value. 83 115 .Linc_4blocks: 84 116 .octa 4 85 117 ··· 94 130 // Offset to end of hash key powers array in the key struct. 95 131 // 96 132 // This is immediately followed by three zeroized padding blocks, which are 97 - // included so that partial vectors can be handled more easily. E.g. if VL=64 98 - // and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most 99 - // padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. 133 + // included so that partial vectors can be handled more easily. E.g. 
if two 134 + // blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding 135 + // blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. 100 136 #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) 101 137 102 138 .text 103 - 104 - // Set the vector length in bytes. This sets the VL variable and defines 105 - // register aliases V0-V31 that map to the ymm or zmm registers. 106 - .macro _set_veclen vl 107 - .set VL, \vl 108 - .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 109 - 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 110 - .if VL == 32 111 - .set V\i, %ymm\i 112 - .elseif VL == 64 113 - .set V\i, %zmm\i 114 - .else 115 - .error "Unsupported vector length" 116 - .endif 117 - .endr 118 - .endm 119 139 120 140 // The _ghash_mul_step macro does one step of GHASH multiplication of the 121 141 // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the ··· 260 312 vpternlogd $0x96, \t0, \mi, \hi 261 313 .endm 262 314 263 - // void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); 315 + // This is a specialized version of _ghash_mul that computes \a * \a, i.e. it 316 + // squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
317 + .macro _ghash_square a, dst, gfpoly, t0, t1 318 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L 319 + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H 320 + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) 321 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO 322 + vpxord \t0, \t1, \t1 // Fold LO into MI 323 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) 324 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI 325 + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI 326 + .endm 327 + 328 + // void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); 264 329 // 265 - // Given the expanded AES key |key->aes_key|, this function derives the GHASH 266 - // subkey and initializes |key->ghash_key_powers| with powers of it. 267 - // 268 - // The number of key powers initialized is NUM_H_POWERS, and they are stored in 269 - // the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key 270 - // powers themselves are also initialized. 271 - // 272 - // This macro supports both VL=32 and VL=64. _set_veclen must have been invoked 273 - // with the desired length. In the VL=32 case, the function computes twice as 274 - // many key powers than are actually used by the VL=32 GCM update functions. 275 - // This is done to keep the key format the same regardless of vector length. 276 - .macro _aes_gcm_precompute 330 + // Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and 331 + // initialize |key->h_powers| and |key->padding|. 332 + SYM_FUNC_START(aes_gcm_precompute_vaes_avx512) 277 333 278 334 // Function arguments 279 335 .set KEY, %rdi 280 336 281 - // Additional local variables. V0-V2 and %rax are used as temporaries. 337 + // Additional local variables. 338 + // %zmm[0-2] and %rax are used as temporaries. 
282 339 .set POWERS_PTR, %rsi 283 340 .set RNDKEYLAST_PTR, %rdx 284 - .set H_CUR, V3 341 + .set H_CUR, %zmm3 285 342 .set H_CUR_YMM, %ymm3 286 343 .set H_CUR_XMM, %xmm3 287 - .set H_INC, V4 344 + .set H_INC, %zmm4 288 345 .set H_INC_YMM, %ymm4 289 346 .set H_INC_XMM, %xmm4 290 - .set GFPOLY, V5 347 + .set GFPOLY, %zmm5 291 348 .set GFPOLY_YMM, %ymm5 292 349 .set GFPOLY_XMM, %xmm5 293 350 294 351 // Get pointer to lowest set of key powers (located at end of array). 295 - lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR 352 + lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR 296 353 297 354 // Encrypt an all-zeroes block to get the raw hash subkey. 298 355 movl OFFSETOF_AESKEYLEN(KEY), %eax ··· 316 363 317 364 // Zeroize the padding blocks. 318 365 vpxor %xmm0, %xmm0, %xmm0 319 - vmovdqu %ymm0, VL(POWERS_PTR) 320 - vmovdqu %xmm0, VL+2*16(POWERS_PTR) 366 + vmovdqu %ymm0, 64(POWERS_PTR) 367 + vmovdqu %xmm0, 64+2*16(POWERS_PTR) 321 368 322 369 // Finish preprocessing the first key power, H^1. Since this GHASH 323 370 // implementation operates directly on values with the backwards bit ··· 350 397 // special needs to be done to make this happen, though: H^1 * H^1 would 351 398 // end up with two factors of x^-1, but the multiplication consumes one. 352 399 // So the product H^2 ends up with the desired one factor of x^-1. 353 - _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ 354 - %xmm0, %xmm1, %xmm2 400 + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1 355 401 356 402 // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. 357 403 vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM 358 404 vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM 359 405 360 - .if VL == 64 361 406 // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. 
362 407 _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ 363 408 %ymm0, %ymm1, %ymm2 364 409 vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR 365 410 vshufi64x2 $0, H_INC, H_INC, H_INC 366 - .endif 367 411 368 412 // Store the lowest set of key powers. 369 413 vmovdqu8 H_CUR, (POWERS_PTR) 370 414 371 - // Compute and store the remaining key powers. With VL=32, repeatedly 372 - // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. 373 - // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by 415 + // Compute and store the remaining key powers. 416 + // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by 374 417 // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. 375 - mov $(NUM_H_POWERS*16/VL) - 1, %eax 376 - .Lprecompute_next\@: 377 - sub $VL, POWERS_PTR 378 - _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 418 + mov $3, %eax 419 + .Lprecompute_next: 420 + sub $64, POWERS_PTR 421 + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2 379 422 vmovdqu8 H_CUR, (POWERS_PTR) 380 423 dec %eax 381 - jnz .Lprecompute_next\@ 424 + jnz .Lprecompute_next 382 425 383 426 vzeroupper // This is needed after using ymm or zmm registers. 384 427 RET 385 - .endm 428 + SYM_FUNC_END(aes_gcm_precompute_vaes_avx512) 386 429 387 430 // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store 388 431 // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. 
389 432 .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm 390 433 vextracti32x4 $1, \src, \t0_xmm 391 - .if VL == 32 392 - vpxord \t0_xmm, \src_xmm, \dst_xmm 393 - .elseif VL == 64 394 434 vextracti32x4 $2, \src, \t1_xmm 395 435 vextracti32x4 $3, \src, \t2_xmm 396 436 vpxord \t0_xmm, \src_xmm, \dst_xmm 397 437 vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm 398 - .else 399 - .error "Unsupported vector length" 400 - .endif 401 438 .endm 402 439 403 440 // Do one step of the GHASH update of the data blocks given in the vector ··· 401 458 // 402 459 // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + 403 460 // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the 404 - // operations are vectorized operations on vectors of 16-byte blocks. E.g., 405 - // with VL=32 there are 2 blocks per vector and the vectorized terms correspond 406 - // to the following non-vectorized terms: 461 + // operations are vectorized operations on 512-bit vectors of 128-bit blocks. 462 + // The vectorized terms correspond to the following non-vectorized terms: 407 463 // 408 - // H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) 409 - // H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 410 - // H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 411 - // H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 412 - // 413 - // With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. 
464 + // H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), 465 + // H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) 466 + // H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 467 + // H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 468 + // H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 414 469 // 415 470 // More concretely, this code does: 416 471 // - Do vectorized "schoolbook" multiplications to compute the intermediate 417 472 // 256-bit product of each block and its corresponding hash key power. 418 - // There are 4*VL/16 of these intermediate products. 419 - // - Sum (XOR) the intermediate 256-bit products across vectors. This leaves 420 - // VL/16 256-bit intermediate values. 473 + // - Sum (XOR) the intermediate 256-bit products across vectors. 421 474 // - Do a vectorized reduction of these 256-bit intermediate values to 422 - // 128-bits each. This leaves VL/16 128-bit intermediate values. 475 + // 128-bits each. 423 476 // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. 424 477 // 425 478 // See _ghash_mul_step for the full explanation of the operations performed for ··· 471 532 .endif 472 533 .endm 473 534 474 - // Do one non-last round of AES encryption on the counter blocks in V0-V3 using 475 - // the round key that has been broadcast to all 128-bit lanes of \round_key. 535 + // Update GHASH with four vectors of data blocks. See _ghash_step_4x for full 536 + // explanation. 537 + .macro _ghash_4x 538 + .irp i, 0,1,2,3,4,5,6,7,8,9 539 + _ghash_step_4x \i 540 + .endr 541 + .endm 542 + 543 + // void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 544 + // u8 ghash_acc[16], 545 + // const u8 *aad, int aadlen); 546 + // 547 + // This function processes the AAD (Additional Authenticated Data) in GCM. 548 + // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the 549 + // data given by |aad| and |aadlen|. 
On the first call, |ghash_acc| must be all 550 + // zeroes. |aadlen| must be a multiple of 16, except on the last call where it 551 + // can be any length. The caller must do any buffering needed to ensure this. 552 + // 553 + // This handles large amounts of AAD efficiently, while also keeping overhead 554 + // low for small amounts which is the common case. TLS and IPsec use less than 555 + // one block of AAD, but (uncommonly) other use cases may use much more. 556 + SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512) 557 + 558 + // Function arguments 559 + .set KEY, %rdi 560 + .set GHASH_ACC_PTR, %rsi 561 + .set AAD, %rdx 562 + .set AADLEN, %ecx 563 + .set AADLEN64, %rcx // Zero-extend AADLEN before using! 564 + 565 + // Additional local variables. 566 + // %rax and %k1 are used as temporary registers. 567 + .set GHASHDATA0, %zmm0 568 + .set GHASHDATA0_XMM, %xmm0 569 + .set GHASHDATA1, %zmm1 570 + .set GHASHDATA1_XMM, %xmm1 571 + .set GHASHDATA2, %zmm2 572 + .set GHASHDATA2_XMM, %xmm2 573 + .set GHASHDATA3, %zmm3 574 + .set BSWAP_MASK, %zmm4 575 + .set BSWAP_MASK_XMM, %xmm4 576 + .set GHASH_ACC, %zmm5 577 + .set GHASH_ACC_XMM, %xmm5 578 + .set H_POW4, %zmm6 579 + .set H_POW3, %zmm7 580 + .set H_POW2, %zmm8 581 + .set H_POW1, %zmm9 582 + .set H_POW1_XMM, %xmm9 583 + .set GFPOLY, %zmm10 584 + .set GFPOLY_XMM, %xmm10 585 + .set GHASHTMP0, %zmm11 586 + .set GHASHTMP1, %zmm12 587 + .set GHASHTMP2, %zmm13 588 + 589 + // Load the GHASH accumulator. 590 + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 591 + 592 + // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. 593 + cmp $16, AADLEN 594 + jg .Laad_more_than_16bytes 595 + test AADLEN, AADLEN 596 + jz .Laad_done 597 + 598 + // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD. 
599 + vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM 600 + vmovdqu .Lgfpoly(%rip), GFPOLY_XMM 601 + mov $-1, %eax 602 + bzhi AADLEN, %eax, %eax 603 + kmovd %eax, %k1 604 + vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z} 605 + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM 606 + vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM 607 + vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 608 + _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ 609 + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM 610 + jmp .Laad_done 611 + 612 + .Laad_more_than_16bytes: 613 + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK 614 + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY 615 + 616 + // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time. 617 + sub $256, AADLEN 618 + jl .Laad_loop_4x_done 619 + vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 620 + vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 621 + vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 622 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 623 + .Laad_loop_4x: 624 + vmovdqu8 0*64(AAD), GHASHDATA0 625 + vmovdqu8 1*64(AAD), GHASHDATA1 626 + vmovdqu8 2*64(AAD), GHASHDATA2 627 + vmovdqu8 3*64(AAD), GHASHDATA3 628 + _ghash_4x 629 + add $256, AAD 630 + sub $256, AADLEN 631 + jge .Laad_loop_4x 632 + .Laad_loop_4x_done: 633 + 634 + // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time. 
635 + add $192, AADLEN 636 + jl .Laad_loop_1x_done 637 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 638 + .Laad_loop_1x: 639 + vmovdqu8 (AAD), GHASHDATA0 640 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 641 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC 642 + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 643 + GHASHDATA0, GHASHDATA1, GHASHDATA2 644 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ 645 + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM 646 + add $64, AAD 647 + sub $64, AADLEN 648 + jge .Laad_loop_1x 649 + .Laad_loop_1x_done: 650 + 651 + // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD. 652 + add $64, AADLEN 653 + jz .Laad_done 654 + mov $-1, %rax 655 + bzhi AADLEN64, %rax, %rax 656 + kmovq %rax, %k1 657 + vmovdqu8 (AAD), GHASHDATA0{%k1}{z} 658 + neg AADLEN64 659 + and $~15, AADLEN64 // -round_up(AADLEN, 16) 660 + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 661 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 662 + vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC 663 + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 664 + GHASHDATA0, GHASHDATA1, GHASHDATA2 665 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ 666 + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM 667 + 668 + .Laad_done: 669 + // Store the updated GHASH accumulator back to memory. 670 + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 671 + 672 + vzeroupper // This is needed after using ymm or zmm registers. 673 + RET 674 + SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512) 675 + 676 + // Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the 677 + // round key that has been broadcast to all 128-bit lanes of \round_key. 
476 678 .macro _vaesenc_4x round_key 477 - vaesenc \round_key, V0, V0 478 - vaesenc \round_key, V1, V1 479 - vaesenc \round_key, V2, V2 480 - vaesenc \round_key, V3, V3 679 + vaesenc \round_key, %zmm0, %zmm0 680 + vaesenc \round_key, %zmm1, %zmm1 681 + vaesenc \round_key, %zmm2, %zmm2 682 + vaesenc \round_key, %zmm3, %zmm3 481 683 .endm 482 684 483 685 // Start the AES encryption of four vectors of counter blocks. 484 686 .macro _ctr_begin_4x 485 687 486 688 // Increment LE_CTR four times to generate four vectors of little-endian 487 - // counter blocks, swap each to big-endian, and store them in V0-V3. 488 - vpshufb BSWAP_MASK, LE_CTR, V0 689 + // counter blocks, swap each to big-endian, and store them in %zmm[0-3]. 690 + vpshufb BSWAP_MASK, LE_CTR, %zmm0 489 691 vpaddd LE_CTR_INC, LE_CTR, LE_CTR 490 - vpshufb BSWAP_MASK, LE_CTR, V1 692 + vpshufb BSWAP_MASK, LE_CTR, %zmm1 491 693 vpaddd LE_CTR_INC, LE_CTR, LE_CTR 492 - vpshufb BSWAP_MASK, LE_CTR, V2 694 + vpshufb BSWAP_MASK, LE_CTR, %zmm2 493 695 vpaddd LE_CTR_INC, LE_CTR, LE_CTR 494 - vpshufb BSWAP_MASK, LE_CTR, V3 696 + vpshufb BSWAP_MASK, LE_CTR, %zmm3 495 697 vpaddd LE_CTR_INC, LE_CTR, LE_CTR 496 698 497 699 // AES "round zero": XOR in the zero-th round key. 498 - vpxord RNDKEY0, V0, V0 499 - vpxord RNDKEY0, V1, V1 500 - vpxord RNDKEY0, V2, V2 501 - vpxord RNDKEY0, V3, V3 700 + vpxord RNDKEY0, %zmm0, %zmm0 701 + vpxord RNDKEY0, %zmm1, %zmm1 702 + vpxord RNDKEY0, %zmm2, %zmm2 703 + vpxord RNDKEY0, %zmm3, %zmm3 502 704 .endm 503 705 504 - // Do the last AES round for four vectors of counter blocks V0-V3, XOR source 505 - // data with the resulting keystream, and write the result to DST and 706 + // Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR 707 + // source data with the resulting keystream, and write the result to DST and 506 708 // GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) 
507 709 .macro _aesenclast_and_xor_4x 508 710 // XOR the source data with the last round key, saving the result in 509 711 // GHASHDATA[0-3]. This reduces latency by taking advantage of the 510 712 // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). 511 - vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 512 - vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 513 - vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 514 - vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 713 + vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0 714 + vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1 715 + vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2 716 + vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3 515 717 516 718 // Do the last AES round. This handles the XOR with the source data 517 719 // too, as per the optimization described above. 518 - vaesenclast GHASHDATA0, V0, GHASHDATA0 519 - vaesenclast GHASHDATA1, V1, GHASHDATA1 520 - vaesenclast GHASHDATA2, V2, GHASHDATA2 521 - vaesenclast GHASHDATA3, V3, GHASHDATA3 720 + vaesenclast GHASHDATA0, %zmm0, GHASHDATA0 721 + vaesenclast GHASHDATA1, %zmm1, GHASHDATA1 722 + vaesenclast GHASHDATA2, %zmm2, GHASHDATA2 723 + vaesenclast GHASHDATA3, %zmm3, GHASHDATA3 522 724 523 725 // Store the en/decrypted data to DST. 
524 - vmovdqu8 GHASHDATA0, 0*VL(DST) 525 - vmovdqu8 GHASHDATA1, 1*VL(DST) 526 - vmovdqu8 GHASHDATA2, 2*VL(DST) 527 - vmovdqu8 GHASHDATA3, 3*VL(DST) 726 + vmovdqu8 GHASHDATA0, 0*64(DST) 727 + vmovdqu8 GHASHDATA1, 1*64(DST) 728 + vmovdqu8 GHASHDATA2, 2*64(DST) 729 + vmovdqu8 GHASHDATA3, 3*64(DST) 528 730 .endm 529 731 530 - // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, 531 - // const u32 le_ctr[4], u8 ghash_acc[16], 532 - // const u8 *src, u8 *dst, int datalen); 732 + // void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 733 + // const u32 le_ctr[4], u8 ghash_acc[16], 734 + // const u8 *src, u8 *dst, int datalen); 533 735 // 534 736 // This macro generates a GCM encryption or decryption update function with the 535 - // above prototype (with \enc selecting which one). This macro supports both 536 - // VL=32 and VL=64. _set_veclen must have been invoked with the desired length. 537 - // 538 - // This function computes the next portion of the CTR keystream, XOR's it with 539 - // |datalen| bytes from |src|, and writes the resulting encrypted or decrypted 540 - // data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the 541 - // next |datalen| ciphertext bytes. 737 + // above prototype (with \enc selecting which one). The function computes the 738 + // next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, 739 + // and writes the resulting encrypted or decrypted data to |dst|. It also 740 + // updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext 741 + // bytes. 542 742 // 543 743 // |datalen| must be a multiple of 16, except on the last call where it can be 544 744 // any length. The caller must do any buffering needed to ensure this. Both 545 745 // in-place and out-of-place en/decryption are supported. 546 746 // 547 - // |le_ctr| must give the current counter in little-endian format. 
For a new 548 - // message, the low word of the counter must be 2. This function loads the 549 - // counter from |le_ctr| and increments the loaded counter as needed, but it 550 - // does *not* store the updated counter back to |le_ctr|. The caller must 551 - // update |le_ctr| if any more data segments follow. Internally, only the low 552 - // 32-bit word of the counter is incremented, following the GCM standard. 747 + // |le_ctr| must give the current counter in little-endian format. This 748 + // function loads the counter from |le_ctr| and increments the loaded counter as 749 + // needed, but it does *not* store the updated counter back to |le_ctr|. The 750 + // caller must update |le_ctr| if any more data segments follow. Internally, 751 + // only the low 32-bit word of the counter is incremented, following the GCM 752 + // standard. 553 753 .macro _aes_gcm_update enc 554 754 555 755 // Function arguments ··· 712 634 // Pointer to the last AES round key for the chosen AES variant 713 635 .set RNDKEYLAST_PTR, %r11 714 636 715 - // In the main loop, V0-V3 are used as AES input and output. Elsewhere 716 - // they are used as temporary registers. 637 + // In the main loop, %zmm[0-3] are used as AES input and output. 638 + // Elsewhere they are used as temporary registers. 717 639 718 640 // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. 719 - .set GHASHDATA0, V4 641 + .set GHASHDATA0, %zmm4 720 642 .set GHASHDATA0_XMM, %xmm4 721 - .set GHASHDATA1, V5 643 + .set GHASHDATA1, %zmm5 722 644 .set GHASHDATA1_XMM, %xmm5 723 - .set GHASHDATA2, V6 645 + .set GHASHDATA2, %zmm6 724 646 .set GHASHDATA2_XMM, %xmm6 725 - .set GHASHDATA3, V7 647 + .set GHASHDATA3, %zmm7 726 648 727 649 // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values 728 650 // using vpshufb, copied to all 128-bit lanes. 729 - .set BSWAP_MASK, V8 651 + .set BSWAP_MASK, %zmm8 730 652 731 653 // RNDKEY temporarily holds the next AES round key. 
732 - .set RNDKEY, V9 654 + .set RNDKEY, %zmm9 733 655 734 656 // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, 735 657 // only the lowest 128-bit lane can be nonzero. When not fully reduced, 736 658 // more than one lane may be used, and they need to be XOR'd together. 737 - .set GHASH_ACC, V10 659 + .set GHASH_ACC, %zmm10 738 660 .set GHASH_ACC_XMM, %xmm10 739 661 740 662 // LE_CTR_INC is the vector of 32-bit words that need to be added to a 741 663 // vector of little-endian counter blocks to advance it forwards. 742 - .set LE_CTR_INC, V11 664 + .set LE_CTR_INC, %zmm11 743 665 744 666 // LE_CTR contains the next set of little-endian counter blocks. 745 - .set LE_CTR, V12 667 + .set LE_CTR, %zmm12 746 668 747 669 // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, 748 670 // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, 749 671 // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. 750 - .set RNDKEY0, V13 751 - .set RNDKEYLAST, V14 752 - .set RNDKEY_M9, V15 753 - .set RNDKEY_M8, V16 754 - .set RNDKEY_M7, V17 755 - .set RNDKEY_M6, V18 756 - .set RNDKEY_M5, V19 757 - .set RNDKEY_M4, V20 758 - .set RNDKEY_M3, V21 759 - .set RNDKEY_M2, V22 760 - .set RNDKEY_M1, V23 672 + .set RNDKEY0, %zmm13 673 + .set RNDKEYLAST, %zmm14 674 + .set RNDKEY_M9, %zmm15 675 + .set RNDKEY_M8, %zmm16 676 + .set RNDKEY_M7, %zmm17 677 + .set RNDKEY_M6, %zmm18 678 + .set RNDKEY_M5, %zmm19 679 + .set RNDKEY_M4, %zmm20 680 + .set RNDKEY_M3, %zmm21 681 + .set RNDKEY_M2, %zmm22 682 + .set RNDKEY_M1, %zmm23 761 683 762 684 // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These 763 685 // cannot coincide with anything used for AES encryption, since for 764 686 // performance reasons GHASH and AES encryption are interleaved. 
765 - .set GHASHTMP0, V24 766 - .set GHASHTMP1, V25 767 - .set GHASHTMP2, V26 687 + .set GHASHTMP0, %zmm24 688 + .set GHASHTMP1, %zmm25 689 + .set GHASHTMP2, %zmm26 768 690 769 - // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The 691 + // H_POW[4-1] contain the powers of the hash key H^16...H^1. The 770 692 // descending numbering reflects the order of the key powers. 771 - .set H_POW4, V27 772 - .set H_POW3, V28 773 - .set H_POW2, V29 774 - .set H_POW1, V30 693 + .set H_POW4, %zmm27 694 + .set H_POW3, %zmm28 695 + .set H_POW2, %zmm29 696 + .set H_POW1, %zmm30 775 697 776 698 // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. 777 - .set GFPOLY, V31 699 + .set GFPOLY, %zmm31 778 700 779 701 // Load some constants. 780 702 vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK ··· 797 719 // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. 798 720 vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR 799 721 800 - // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. 801 - .if VL == 32 802 - vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC 803 - .elseif VL == 64 722 + // Load 4 into all 128-bit lanes of LE_CTR_INC. 804 723 vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC 805 - .else 806 - .error "Unsupported vector length" 807 - .endif 808 724 809 - // If there are at least 4*VL bytes of data, then continue into the loop 810 - // that processes 4*VL bytes of data at a time. Otherwise skip it. 725 + // If there are at least 256 bytes of data, then continue into the loop 726 + // that processes 256 bytes of data at a time. Otherwise skip it. 811 727 // 812 - // Pre-subtracting 4*VL from DATALEN saves an instruction from the main 728 + // Pre-subtracting 256 from DATALEN saves an instruction from the main 813 729 // loop and also ensures that at least one write always occurs to 814 730 // DATALEN, zero-extending it and allowing DATALEN64 to be used later. 
815 - add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 731 + sub $256, DATALEN 816 732 jl .Lcrypt_loop_4x_done\@ 817 733 818 734 // Load powers of the hash key. 819 - vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 820 - vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 821 - vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 822 - vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 735 + vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 736 + vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 737 + vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 738 + vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 823 739 824 740 // Main loop: en/decrypt and hash 4 vectors at a time. 825 741 // ··· 842 770 cmp %rax, RNDKEYLAST_PTR 843 771 jne 1b 844 772 _aesenclast_and_xor_4x 845 - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 846 - sub $-4*VL, DST 847 - add $-4*VL, DATALEN 773 + add $256, SRC 774 + add $256, DST 775 + sub $256, DATALEN 848 776 jl .Lghash_last_ciphertext_4x\@ 849 777 .endif 850 778 ··· 858 786 // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If 859 787 // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. 860 788 .if !\enc 861 - vmovdqu8 0*VL(SRC), GHASHDATA0 862 - vmovdqu8 1*VL(SRC), GHASHDATA1 863 - vmovdqu8 2*VL(SRC), GHASHDATA2 864 - vmovdqu8 3*VL(SRC), GHASHDATA3 789 + vmovdqu8 0*64(SRC), GHASHDATA0 790 + vmovdqu8 1*64(SRC), GHASHDATA1 791 + vmovdqu8 2*64(SRC), GHASHDATA2 792 + vmovdqu8 3*64(SRC), GHASHDATA3 865 793 .endif 866 794 867 795 // Start the AES encryption of the counter blocks. ··· 881 809 _vaesenc_4x RNDKEY 882 810 128: 883 811 884 - // Finish the AES encryption of the counter blocks in V0-V3, interleaved 885 - // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. 812 + // Finish the AES encryption of the counter blocks in %zmm[0-3], 813 + // interleaved with the GHASH update of the ciphertext blocks in 814 + // GHASHDATA[0-3]. 
886 815 .irp i, 9,8,7,6,5,4,3,2,1 887 816 _ghash_step_4x (9 - \i) 888 817 _vaesenc_4x RNDKEY_M\i 889 818 .endr 890 819 _ghash_step_4x 9 891 820 _aesenclast_and_xor_4x 892 - sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 893 - sub $-4*VL, DST 894 - add $-4*VL, DATALEN 821 + add $256, SRC 822 + add $256, DST 823 + sub $256, DATALEN 895 824 jge .Lcrypt_loop_4x\@ 896 825 897 826 .if \enc 898 827 .Lghash_last_ciphertext_4x\@: 899 828 // Update GHASH with the last set of ciphertext blocks. 900 - .irp i, 0,1,2,3,4,5,6,7,8,9 901 - _ghash_step_4x \i 902 - .endr 829 + _ghash_4x 903 830 .endif 904 831 905 832 .Lcrypt_loop_4x_done\@: 906 833 907 - // Undo the extra subtraction by 4*VL and check whether data remains. 908 - sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 834 + // Undo the extra subtraction by 256 and check whether data remains. 835 + add $256, DATALEN 909 836 jz .Ldone\@ 910 837 911 - // The data length isn't a multiple of 4*VL. Process the remaining data 912 - // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. 913 - // Going one vector at a time may seem inefficient compared to having 914 - // separate code paths for each possible number of vectors remaining. 915 - // However, using a loop keeps the code size down, and it performs 916 - // surprising well; modern CPUs will start executing the next iteration 917 - // before the previous one finishes and also predict the number of loop 918 - // iterations. For a similar reason, we roll up the AES rounds. 838 + // The data length isn't a multiple of 256 bytes. Process the remaining 839 + // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a 840 + // time. Going one vector at a time may seem inefficient compared to 841 + // having separate code paths for each possible number of vectors 842 + // remaining. 
However, using a loop keeps the code size down, and it 843 + // performs surprising well; modern CPUs will start executing the next 844 + // iteration before the previous one finishes and also predict the 845 + // number of loop iterations. For a similar reason, we roll up the AES 846 + // rounds. 919 847 // 920 - // On the last iteration, the remaining length may be less than VL. 921 - // Handle this using masking. 848 + // On the last iteration, the remaining length may be less than 64 849 + // bytes. Handle this using masking. 922 850 // 923 851 // Since there are enough key powers available for all remaining data, 924 852 // there is no need to do a GHASH reduction after each iteration. ··· 947 875 .Lcrypt_loop_1x\@: 948 876 949 877 // Select the appropriate mask for this iteration: all 1's if 950 - // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the 878 + // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the 951 879 // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) 952 - .if VL < 64 953 - mov $-1, %eax 954 - bzhi DATALEN, %eax, %eax 955 - kmovd %eax, %k1 956 - .else 957 880 mov $-1, %rax 958 881 bzhi DATALEN64, %rax, %rax 959 882 kmovq %rax, %k1 960 - .endif 961 883 962 884 // Encrypt a vector of counter blocks. This does not need to be masked. 963 - vpshufb BSWAP_MASK, LE_CTR, V0 885 + vpshufb BSWAP_MASK, LE_CTR, %zmm0 964 886 vpaddd LE_CTR_INC, LE_CTR, LE_CTR 965 - vpxord RNDKEY0, V0, V0 887 + vpxord RNDKEY0, %zmm0, %zmm0 966 888 lea 16(KEY), %rax 967 889 1: 968 890 vbroadcasti32x4 (%rax), RNDKEY 969 - vaesenc RNDKEY, V0, V0 891 + vaesenc RNDKEY, %zmm0, %zmm0 970 892 add $16, %rax 971 893 cmp %rax, RNDKEYLAST_PTR 972 894 jne 1b 973 - vaesenclast RNDKEYLAST, V0, V0 895 + vaesenclast RNDKEYLAST, %zmm0, %zmm0 974 896 975 897 // XOR the data with the appropriate number of keystream bytes. 
976 - vmovdqu8 (SRC), V1{%k1}{z} 977 - vpxord V1, V0, V0 978 - vmovdqu8 V0, (DST){%k1} 898 + vmovdqu8 (SRC), %zmm1{%k1}{z} 899 + vpxord %zmm1, %zmm0, %zmm0 900 + vmovdqu8 %zmm0, (DST){%k1} 979 901 980 902 // Update GHASH with the ciphertext block(s), without reducing. 981 903 // 982 - // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. 983 - // (If decrypting, it's done by the above masked load. If encrypting, 984 - // it's done by the below masked register-to-register move.) Note that 985 - // if DATALEN <= VL - 16, there will be additional padding beyond the 986 - // padding of the last block specified by GHASH itself; i.e., there may 987 - // be whole block(s) that get processed by the GHASH multiplication and 988 - // reduction instructions but should not actually be included in the 904 + // In the case of DATALEN < 64, the ciphertext is zero-padded to 64 905 + // bytes. (If decrypting, it's done by the above masked load. If 906 + // encrypting, it's done by the below masked register-to-register move.) 907 + // Note that if DATALEN <= 48, there will be additional padding beyond 908 + // the padding of the last block specified by GHASH itself; i.e., there 909 + // may be whole block(s) that get processed by the GHASH multiplication 910 + // and reduction instructions but should not actually be included in the 989 911 // GHASH. However, any such blocks are all-zeroes, and the values that 990 912 // they're multiplied with are also all-zeroes. Therefore they just add 991 913 // 0 * 0 = 0 to the final GHASH result, which makes no difference. 
992 914 vmovdqu8 (POWERS_PTR), H_POW1 993 915 .if \enc 994 - vmovdqu8 V0, V1{%k1}{z} 916 + vmovdqu8 %zmm0, %zmm1{%k1}{z} 995 917 .endif 996 - vpshufb BSWAP_MASK, V1, V0 997 - vpxord GHASH_ACC, V0, V0 998 - _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 918 + vpshufb BSWAP_MASK, %zmm1, %zmm0 919 + vpxord GHASH_ACC, %zmm0, %zmm0 920 + _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \ 921 + GHASHDATA3, %zmm1, %zmm2, %zmm3 999 922 vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 1000 923 1001 - add $VL, POWERS_PTR 1002 - add $VL, SRC 1003 - add $VL, DST 1004 - sub $VL, DATALEN 924 + add $64, POWERS_PTR 925 + add $64, SRC 926 + add $64, DST 927 + sub $64, DATALEN 1005 928 jg .Lcrypt_loop_1x\@ 1006 929 1007 930 // Finally, do the GHASH reduction. 1008 - _ghash_reduce LO, MI, HI, GFPOLY, V0 931 + _ghash_reduce LO, MI, HI, GFPOLY, %zmm0 1009 932 _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 1010 933 1011 934 .Ldone\@: ··· 1011 944 RET 1012 945 .endm 1013 946 1014 - // void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1015 - // const u32 le_ctr[4], u8 ghash_acc[16], 1016 - // u64 total_aadlen, u64 total_datalen); 1017 - // bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1018 - // const u32 le_ctr[4], 1019 - // const u8 ghash_acc[16], 1020 - // u64 total_aadlen, u64 total_datalen, 1021 - // const u8 tag[16], int taglen); 947 + // void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 948 + // const u32 le_ctr[4], u8 ghash_acc[16], 949 + // u64 total_aadlen, u64 total_datalen); 950 + // bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 951 + // const u32 le_ctr[4], 952 + // const u8 ghash_acc[16], 953 + // u64 total_aadlen, u64 total_datalen, 954 + // const u8 tag[16], int taglen); 1022 955 // 1023 956 // This macro generates one of the above two functions (with \enc selecting 1024 957 // which one). 
Both functions finish computing the GCM authentication tag by ··· 1148 1081 RET 1149 1082 .endm 1150 1083 1151 - _set_veclen 32 1152 - SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) 1153 - _aes_gcm_precompute 1154 - SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) 1155 - SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) 1084 + SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512) 1156 1085 _aes_gcm_update 1 1157 - SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) 1158 - SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) 1086 + SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512) 1087 + SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512) 1159 1088 _aes_gcm_update 0 1160 - SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) 1089 + SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512) 1161 1090 1162 - _set_veclen 64 1163 - SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) 1164 - _aes_gcm_precompute 1165 - SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) 1166 - SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) 1167 - _aes_gcm_update 1 1168 - SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) 1169 - SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) 1170 - _aes_gcm_update 0 1171 - SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) 1172 - 1173 - // void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1174 - // u8 ghash_acc[16], 1175 - // const u8 *aad, int aadlen); 1176 - // 1177 - // This function processes the AAD (Additional Authenticated Data) in GCM. 1178 - // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the 1179 - // data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been 1180 - // initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| 1181 - // must be a multiple of 16, except on the last call where it can be any length. 1182 - // The caller must do any buffering needed to ensure this. 1183 - // 1184 - // AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. 
1185 - // Therefore, for AAD processing we currently only provide this implementation 1186 - // which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This 1187 - // keeps the code size down, and it enables some micro-optimizations, e.g. using 1188 - // VEX-coded instructions instead of EVEX-coded to save some instruction bytes. 1189 - // To optimize for large amounts of AAD, we could implement a 4x-wide loop and 1190 - // provide a version using 512-bit vectors, but that doesn't seem to be useful. 1191 - SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) 1192 - 1193 - // Function arguments 1194 - .set KEY, %rdi 1195 - .set GHASH_ACC_PTR, %rsi 1196 - .set AAD, %rdx 1197 - .set AADLEN, %ecx 1198 - .set AADLEN64, %rcx // Zero-extend AADLEN before using! 1199 - 1200 - // Additional local variables. 1201 - // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. 1202 - .set BSWAP_MASK, %ymm4 1203 - .set GFPOLY, %ymm5 1204 - .set GHASH_ACC, %ymm6 1205 - .set GHASH_ACC_XMM, %xmm6 1206 - .set H_POW1, %ymm7 1207 - 1208 - // Load some constants. 1209 - vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK 1210 - vbroadcasti128 .Lgfpoly(%rip), GFPOLY 1211 - 1212 - // Load the GHASH accumulator. 1213 - vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 1214 - 1215 - // Update GHASH with 32 bytes of AAD at a time. 1216 - // 1217 - // Pre-subtracting 32 from AADLEN saves an instruction from the loop and 1218 - // also ensures that at least one write always occurs to AADLEN, 1219 - // zero-extending it and allowing AADLEN64 to be used later. 
1220 - sub $32, AADLEN 1221 - jl .Laad_loop_1x_done 1222 - vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] 1223 - .Laad_loop_1x: 1224 - vmovdqu (AAD), %ymm0 1225 - vpshufb BSWAP_MASK, %ymm0, %ymm0 1226 - vpxor %ymm0, GHASH_ACC, GHASH_ACC 1227 - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1228 - %ymm0, %ymm1, %ymm2 1229 - vextracti128 $1, GHASH_ACC, %xmm0 1230 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM 1231 - add $32, AAD 1232 - sub $32, AADLEN 1233 - jge .Laad_loop_1x 1234 - .Laad_loop_1x_done: 1235 - add $32, AADLEN 1236 - jz .Laad_done 1237 - 1238 - // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. 1239 - mov $-1, %eax 1240 - bzhi AADLEN, %eax, %eax 1241 - kmovd %eax, %k1 1242 - vmovdqu8 (AAD), %ymm0{%k1}{z} 1243 - neg AADLEN64 1244 - and $~15, AADLEN64 // -round_up(AADLEN, 16) 1245 - vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 1246 - vpshufb BSWAP_MASK, %ymm0, %ymm0 1247 - vpxor %ymm0, GHASH_ACC, GHASH_ACC 1248 - _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1249 - %ymm0, %ymm1, %ymm2 1250 - vextracti128 $1, GHASH_ACC, %xmm0 1251 - vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM 1252 - 1253 - .Laad_done: 1254 - // Store the updated GHASH accumulator back to memory. 1255 - vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 1256 - 1257 - vzeroupper // This is needed after using ymm or zmm registers. 1258 - RET 1259 - SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) 1260 - 1261 - SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) 1091 + SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512) 1262 1092 _aes_gcm_final 1 1263 - SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) 1264 - SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) 1093 + SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512) 1094 + SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512) 1265 1095 _aes_gcm_final 0 1266 - SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) 1096 + SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512)
+1146
arch/x86/crypto/aes-gcm-vaes-avx2.S
··· 1 + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ 2 + // 3 + // AES-GCM implementation for x86_64 CPUs that support the following CPU 4 + // features: VAES && VPCLMULQDQ && AVX2 5 + // 6 + // Copyright 2025 Google LLC 7 + // 8 + // Author: Eric Biggers <ebiggers@google.com> 9 + // 10 + //------------------------------------------------------------------------------ 11 + // 12 + // This file is dual-licensed, meaning that you can use it under your choice of 13 + // either of the following two licenses: 14 + // 15 + // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy 16 + // of the License at 17 + // 18 + // http://www.apache.org/licenses/LICENSE-2.0 19 + // 20 + // Unless required by applicable law or agreed to in writing, software 21 + // distributed under the License is distributed on an "AS IS" BASIS, 22 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 + // See the License for the specific language governing permissions and 24 + // limitations under the License. 25 + // 26 + // or 27 + // 28 + // Redistribution and use in source and binary forms, with or without 29 + // modification, are permitted provided that the following conditions are met: 30 + // 31 + // 1. Redistributions of source code must retain the above copyright notice, 32 + // this list of conditions and the following disclaimer. 33 + // 34 + // 2. Redistributions in binary form must reproduce the above copyright 35 + // notice, this list of conditions and the following disclaimer in the 36 + // documentation and/or other materials provided with the distribution. 37 + // 38 + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 39 + // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 40 + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 41 + // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 42 + // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 43 + // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 44 + // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 45 + // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 46 + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 47 + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 48 + // POSSIBILITY OF SUCH DAMAGE. 49 + // 50 + // ----------------------------------------------------------------------------- 51 + // 52 + // This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512. 53 + // This means it can only use 16 vector registers instead of 32, the maximum 54 + // vector length is 32 bytes, and some instructions such as vpternlogd and 55 + // masked loads/stores are unavailable. However, it is able to run on CPUs that 56 + // have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs), 57 + // various Intel client CPUs such as Alder Lake, and Intel Sierra Forest. 58 + // 59 + // This implementation also uses Karatsuba multiplication instead of schoolbook 60 + // multiplication for GHASH in its main loop. This does not help much on Intel, 61 + // but it improves performance by ~5% on AMD Zen 3. Other factors weighing 62 + // slightly in favor of Karatsuba multiplication in this implementation are the 63 + // lower maximum vector length (which means there are fewer key powers, so we 64 + // can cache the halves of each key power XOR'd together and still use less 65 + // memory than the AVX512 implementation), and the unavailability of the 66 + // vpternlogd instruction (which helped schoolbook a bit more than Karatsuba). 
67 + 68 + #include <linux/linkage.h> 69 + 70 + .section .rodata 71 + .p2align 4 72 + 73 + // The below three 16-byte values must be in the order that they are, as 74 + // they are really two 32-byte tables and a 16-byte value that overlap: 75 + // 76 + // - The first 32-byte table begins at .Lselect_high_bytes_table. 77 + // For 0 <= len <= 16, the 16-byte value at 78 + // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of 79 + // another 16-byte value when AND'ed with it. 80 + // 81 + // - The second 32-byte table begins at .Lrshift_and_bswap_table. 82 + // For 0 <= len <= 16, the 16-byte value at 83 + // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the 84 + // following operation: right-shift by '16 - len' bytes (shifting in 85 + // zeroes), then reflect all 16 bytes. 86 + // 87 + // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects 88 + // all 16 bytes. 89 + .Lselect_high_bytes_table: 90 + .octa 0 91 + .Lrshift_and_bswap_table: 92 + .octa 0xffffffffffffffffffffffffffffffff 93 + .Lbswap_mask: 94 + .octa 0x000102030405060708090a0b0c0d0e0f 95 + 96 + // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table 97 + // with this, we get a mask that left-shifts by '16 - len' bytes. 98 + .Lfifteens: 99 + .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 100 + 101 + // This is the GHASH reducing polynomial without its constant term, i.e. 102 + // x^128 + x^7 + x^2 + x, represented using the backwards mapping 103 + // between bits and polynomial coefficients. 104 + // 105 + // Alternatively, it can be interpreted as the naturally-ordered 106 + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the 107 + // "reversed" GHASH reducing polynomial without its x^128 term. 108 + .Lgfpoly: 109 + .octa 0xc2000000000000000000000000000001 110 + 111 + // Same as above, but with the (1 << 64) bit set. 
112 + .Lgfpoly_and_internal_carrybit: 113 + .octa 0xc2000000000000010000000000000001 114 + 115 + // Values needed to prepare the initial vector of counter blocks. 116 + .Lctr_pattern: 117 + .octa 0 118 + .octa 1 119 + 120 + // The number of AES blocks per vector, as a 128-bit value. 121 + .Linc_2blocks: 122 + .octa 2 123 + 124 + // Offsets in struct aes_gcm_key_vaes_avx2 125 + #define OFFSETOF_AESKEYLEN 480 126 + #define OFFSETOF_H_POWERS 512 127 + #define NUM_H_POWERS 8 128 + #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) 129 + #define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS 130 + 131 + .text 132 + 133 + // Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes 134 + // of \b and storing the reduced products in \dst. Uses schoolbook 135 + // multiplication. 136 + .macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 137 + .if \i == 0 138 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L 139 + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H 140 + .elseif \i == 1 141 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L 142 + .elseif \i == 2 143 + vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1 144 + .elseif \i == 3 145 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) 146 + .elseif \i == 4 147 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO 148 + .elseif \i == 5 149 + vpxor \t0, \t1, \t1 // Fold LO into MI (part 1) 150 + vpxor \t2, \t1, \t1 // Fold LO into MI (part 2) 151 + .elseif \i == 6 152 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H 153 + .elseif \i == 7 154 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) 155 + .elseif \i == 8 156 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI 157 + .elseif \i == 9 158 + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) 159 + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) 160 + .endif 161 + .endm 162 + 163 + // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store 164 + // the reduced products in \dst. 
See _ghash_mul_step for full explanation. 165 + .macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 166 + .irp i, 0,1,2,3,4,5,6,7,8,9 167 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 168 + .endr 169 + .endm 170 + 171 + // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the 172 + // *unreduced* products to \lo, \mi, and \hi. 173 + .macro _ghash_mul_noreduce a, b, lo, mi, hi, t0 174 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L 175 + vpxor \t0, \lo, \lo 176 + vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H 177 + vpxor \t0, \mi, \mi 178 + vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L 179 + vpxor \t0, \mi, \mi 180 + vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H 181 + vpxor \t0, \hi, \hi 182 + .endm 183 + 184 + // Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit 185 + // reduced products in \hi. See _ghash_mul_step for explanation of reduction. 186 + .macro _ghash_reduce lo, mi, hi, gfpoly, t0 187 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 188 + vpshufd $0x4e, \lo, \lo 189 + vpxor \lo, \mi, \mi 190 + vpxor \t0, \mi, \mi 191 + vpclmulqdq $0x01, \mi, \gfpoly, \t0 192 + vpshufd $0x4e, \mi, \mi 193 + vpxor \mi, \hi, \hi 194 + vpxor \t0, \hi, \hi 195 + .endm 196 + 197 + // This is a specialized version of _ghash_mul that computes \a * \a, i.e. it 198 + // squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
199 + .macro _ghash_square a, dst, gfpoly, t0, t1 200 + vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L 201 + vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H 202 + vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) 203 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO 204 + vpxor \t0, \t1, \t1 // Fold LO into MI 205 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) 206 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI 207 + vpxor \t1, \dst, \dst // Fold MI into HI (part 1) 208 + vpxor \t0, \dst, \dst // Fold MI into HI (part 2) 209 + .endm 210 + 211 + // void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); 212 + // 213 + // Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and 214 + // initialize |key->h_powers| and |key->h_powers_xored|. 215 + // 216 + // We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to 217 + // store the 64-bit halves of the key powers XOR'd together (for Karatsuba 218 + // multiplication) in the order 8,6,7,5,4,2,3,1. 219 + SYM_FUNC_START(aes_gcm_precompute_vaes_avx2) 220 + 221 + // Function arguments 222 + .set KEY, %rdi 223 + 224 + // Additional local variables 225 + .set POWERS_PTR, %rsi 226 + .set RNDKEYLAST_PTR, %rdx 227 + .set TMP0, %ymm0 228 + .set TMP0_XMM, %xmm0 229 + .set TMP1, %ymm1 230 + .set TMP1_XMM, %xmm1 231 + .set TMP2, %ymm2 232 + .set TMP2_XMM, %xmm2 233 + .set H_CUR, %ymm3 234 + .set H_CUR_XMM, %xmm3 235 + .set H_CUR2, %ymm4 236 + .set H_INC, %ymm5 237 + .set H_INC_XMM, %xmm5 238 + .set GFPOLY, %ymm6 239 + .set GFPOLY_XMM, %xmm6 240 + 241 + // Encrypt an all-zeroes block to get the raw hash subkey. 
242 + movl OFFSETOF_AESKEYLEN(KEY), %eax 243 + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR 244 + vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block 245 + lea 16(KEY), %rax 246 + 1: 247 + vaesenc (%rax), H_CUR_XMM, H_CUR_XMM 248 + add $16, %rax 249 + cmp %rax, RNDKEYLAST_PTR 250 + jne 1b 251 + vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM 252 + 253 + // Reflect the bytes of the raw hash subkey. 254 + vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM 255 + 256 + // Finish preprocessing the byte-reflected hash subkey by multiplying it 257 + // by x^-1 ("standard" interpretation of polynomial coefficients) or 258 + // equivalently x^1 (natural interpretation). This gets the key into a 259 + // format that avoids having to bit-reflect the data blocks later. 260 + vpshufd $0xd3, H_CUR_XMM, TMP0_XMM 261 + vpsrad $31, TMP0_XMM, TMP0_XMM 262 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM 263 + vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM 264 + vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM 265 + 266 + // Load the gfpoly constant. 267 + vbroadcasti128 .Lgfpoly(%rip), GFPOLY 268 + 269 + // Square H^1 to get H^2. 270 + _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM 271 + 272 + // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. 273 + vinserti128 $1, H_CUR_XMM, H_INC, H_CUR 274 + vinserti128 $1, H_INC_XMM, H_INC, H_INC 275 + 276 + // Compute H_CUR2 = [H^4, H^3]. 277 + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 278 + 279 + // Store [H^2, H^1] and [H^4, H^3]. 280 + vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY) 281 + vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY) 282 + 283 + // For Karatsuba multiplication: compute and store the two 64-bit halves 284 + // of each key power XOR'd together. Order is 4,2,3,1. 285 + vpunpcklqdq H_CUR, H_CUR2, TMP0 286 + vpunpckhqdq H_CUR, H_CUR2, TMP1 287 + vpxor TMP1, TMP0, TMP0 288 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY) 289 + 290 + // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. 
291 + _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2 292 + _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 293 + vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY) 294 + vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY) 295 + 296 + // Again, compute and store the two 64-bit halves of each key power 297 + // XOR'd together. Order is 8,6,7,5. 298 + vpunpcklqdq H_CUR, H_CUR2, TMP0 299 + vpunpckhqdq H_CUR, H_CUR2, TMP1 300 + vpxor TMP1, TMP0, TMP0 301 + vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY) 302 + 303 + vzeroupper 304 + RET 305 + SYM_FUNC_END(aes_gcm_precompute_vaes_avx2) 306 + 307 + // Do one step of the GHASH update of four vectors of data blocks. 308 + // \i: the step to do, 0 through 9 309 + // \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) 310 + // KEY: pointer to struct aes_gcm_key_vaes_avx2 311 + // BSWAP_MASK: mask for reflecting the bytes of blocks 312 + // H_POW[2-1]_XORED: cached values from KEY->h_powers_xored 313 + // TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps. 
314 + // LO, MI: working state for this macro that must be preserved across steps 315 + // GHASH_ACC: the GHASH accumulator (input/output) 316 + .macro _ghash_step_4x i, ghashdata_ptr 317 + .set HI, GHASH_ACC # alias 318 + .set HI_XMM, GHASH_ACC_XMM 319 + .if \i == 0 320 + // First vector 321 + vmovdqu 0*32(\ghashdata_ptr), TMP1 322 + vpshufb BSWAP_MASK, TMP1, TMP1 323 + vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 324 + vpxor GHASH_ACC, TMP1, TMP1 325 + vpclmulqdq $0x00, TMP2, TMP1, LO 326 + vpclmulqdq $0x11, TMP2, TMP1, HI 327 + vpunpckhqdq TMP1, TMP1, TMP0 328 + vpxor TMP1, TMP0, TMP0 329 + vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI 330 + .elseif \i == 1 331 + .elseif \i == 2 332 + // Second vector 333 + vmovdqu 1*32(\ghashdata_ptr), TMP1 334 + vpshufb BSWAP_MASK, TMP1, TMP1 335 + vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 336 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 337 + vpxor TMP0, LO, LO 338 + vpclmulqdq $0x11, TMP2, TMP1, TMP0 339 + vpxor TMP0, HI, HI 340 + vpunpckhqdq TMP1, TMP1, TMP0 341 + vpxor TMP1, TMP0, TMP0 342 + vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 343 + vpxor TMP0, MI, MI 344 + .elseif \i == 3 345 + // Third vector 346 + vmovdqu 2*32(\ghashdata_ptr), TMP1 347 + vpshufb BSWAP_MASK, TMP1, TMP1 348 + vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 349 + .elseif \i == 4 350 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 351 + vpxor TMP0, LO, LO 352 + vpclmulqdq $0x11, TMP2, TMP1, TMP0 353 + vpxor TMP0, HI, HI 354 + .elseif \i == 5 355 + vpunpckhqdq TMP1, TMP1, TMP0 356 + vpxor TMP1, TMP0, TMP0 357 + vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 358 + vpxor TMP0, MI, MI 359 + 360 + // Fourth vector 361 + vmovdqu 3*32(\ghashdata_ptr), TMP1 362 + vpshufb BSWAP_MASK, TMP1, TMP1 363 + .elseif \i == 6 364 + vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 365 + vpclmulqdq $0x00, TMP2, TMP1, TMP0 366 + vpxor TMP0, LO, LO 367 + vpclmulqdq $0x11, TMP2, TMP1, TMP0 368 + vpxor TMP0, HI, HI 369 + vpunpckhqdq TMP1, TMP1, TMP0 370 + vpxor TMP1, TMP0, TMP0 371 + vpclmulqdq $0x10, H_POW1_XORED, 
TMP0, TMP0 372 + vpxor TMP0, MI, MI 373 + .elseif \i == 7 374 + // Finalize 'mi' following Karatsuba multiplication. 375 + vpxor LO, MI, MI 376 + vpxor HI, MI, MI 377 + 378 + // Fold lo into mi. 379 + vbroadcasti128 .Lgfpoly(%rip), TMP2 380 + vpclmulqdq $0x01, LO, TMP2, TMP0 381 + vpshufd $0x4e, LO, LO 382 + vpxor LO, MI, MI 383 + vpxor TMP0, MI, MI 384 + .elseif \i == 8 385 + // Fold mi into hi. 386 + vpclmulqdq $0x01, MI, TMP2, TMP0 387 + vpshufd $0x4e, MI, MI 388 + vpxor MI, HI, HI 389 + vpxor TMP0, HI, HI 390 + .elseif \i == 9 391 + vextracti128 $1, HI, TMP0_XMM 392 + vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM 393 + .endif 394 + .endm 395 + 396 + // Update GHASH with four vectors of data blocks. See _ghash_step_4x for full 397 + // explanation. 398 + .macro _ghash_4x ghashdata_ptr 399 + .irp i, 0,1,2,3,4,5,6,7,8,9 400 + _ghash_step_4x \i, \ghashdata_ptr 401 + .endr 402 + .endm 403 + 404 + // Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst 405 + // and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. 406 + .macro _load_partial_block src, dst, tmp64, tmp32 407 + sub $8, %ecx // LEN - 8 408 + jle .Lle8\@ 409 + 410 + // Load 9 <= LEN <= 16 bytes. 411 + vmovq (\src), \dst // Load first 8 bytes 412 + mov (\src, %rcx), %rax // Load last 8 bytes 413 + neg %ecx 414 + shl $3, %ecx 415 + shr %cl, %rax // Discard overlapping bytes 416 + vpinsrq $1, %rax, \dst, \dst 417 + jmp .Ldone\@ 418 + 419 + .Lle8\@: 420 + add $4, %ecx // LEN - 4 421 + jl .Llt4\@ 422 + 423 + // Load 4 <= LEN <= 8 bytes. 424 + mov (\src), %eax // Load first 4 bytes 425 + mov (\src, %rcx), \tmp32 // Load last 4 bytes 426 + jmp .Lcombine\@ 427 + 428 + .Llt4\@: 429 + // Load 1 <= LEN <= 3 bytes. 
430 + add $2, %ecx // LEN - 2 431 + movzbl (\src), %eax // Load first byte 432 + jl .Lmovq\@ 433 + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes 434 + .Lcombine\@: 435 + shl $3, %ecx 436 + shl %cl, \tmp64 437 + or \tmp64, %rax // Combine the two parts 438 + .Lmovq\@: 439 + vmovq %rax, \dst 440 + .Ldone\@: 441 + .endm 442 + 443 + // Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst. 444 + // Clobbers %rax, %rcx, and \tmp{64,32}. 445 + .macro _store_partial_block src, dst, tmp64, tmp32 446 + sub $8, %ecx // LEN - 8 447 + jl .Llt8\@ 448 + 449 + // Store 8 <= LEN <= 16 bytes. 450 + vpextrq $1, \src, %rax 451 + mov %ecx, \tmp32 452 + shl $3, %ecx 453 + ror %cl, %rax 454 + mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes 455 + vmovq \src, (\dst) // Store first 8 bytes 456 + jmp .Ldone\@ 457 + 458 + .Llt8\@: 459 + add $4, %ecx // LEN - 4 460 + jl .Llt4\@ 461 + 462 + // Store 4 <= LEN <= 7 bytes. 463 + vpextrd $1, \src, %eax 464 + mov %ecx, \tmp32 465 + shl $3, %ecx 466 + ror %cl, %eax 467 + mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes 468 + vmovd \src, (\dst) // Store first 4 bytes 469 + jmp .Ldone\@ 470 + 471 + .Llt4\@: 472 + // Store 1 <= LEN <= 3 bytes. 473 + vpextrb $0, \src, 0(\dst) 474 + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? 475 + jl .Ldone\@ 476 + vpextrb $1, \src, 1(\dst) 477 + je .Ldone\@ 478 + vpextrb $2, \src, 2(\dst) 479 + .Ldone\@: 480 + .endm 481 + 482 + // void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 483 + // u8 ghash_acc[16], 484 + // const u8 *aad, int aadlen); 485 + // 486 + // This function processes the AAD (Additional Authenticated Data) in GCM. 487 + // Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the 488 + // data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all 489 + // zeroes. |aadlen| must be a multiple of 16, except on the last call where it 490 + // can be any length. 
The caller must do any buffering needed to ensure this. 491 + // 492 + // This handles large amounts of AAD efficiently, while also keeping overhead 493 + // low for small amounts which is the common case. TLS and IPsec use less than 494 + // one block of AAD, but (uncommonly) other use cases may use much more. 495 + SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2) 496 + 497 + // Function arguments 498 + .set KEY, %rdi 499 + .set GHASH_ACC_PTR, %rsi 500 + .set AAD, %rdx 501 + .set AADLEN, %ecx // Must be %ecx for _load_partial_block 502 + .set AADLEN64, %rcx // Zero-extend AADLEN before using! 503 + 504 + // Additional local variables. 505 + // %rax and %r8 are used as temporary registers. 506 + .set TMP0, %ymm0 507 + .set TMP0_XMM, %xmm0 508 + .set TMP1, %ymm1 509 + .set TMP1_XMM, %xmm1 510 + .set TMP2, %ymm2 511 + .set TMP2_XMM, %xmm2 512 + .set LO, %ymm3 513 + .set LO_XMM, %xmm3 514 + .set MI, %ymm4 515 + .set MI_XMM, %xmm4 516 + .set GHASH_ACC, %ymm5 517 + .set GHASH_ACC_XMM, %xmm5 518 + .set BSWAP_MASK, %ymm6 519 + .set BSWAP_MASK_XMM, %xmm6 520 + .set GFPOLY, %ymm7 521 + .set GFPOLY_XMM, %xmm7 522 + .set H_POW2_XORED, %ymm8 523 + .set H_POW1_XORED, %ymm9 524 + 525 + // Load the bswap_mask and gfpoly constants. Since AADLEN is usually 526 + // small, usually only 128-bit vectors will be used. So as an 527 + // optimization, don't broadcast these constants to both 128-bit lanes 528 + // quite yet. 529 + vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM 530 + vmovdqu .Lgfpoly(%rip), GFPOLY_XMM 531 + 532 + // Load the GHASH accumulator. 533 + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 534 + 535 + // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. 536 + test AADLEN, AADLEN 537 + jz .Laad_done 538 + cmp $16, AADLEN 539 + jle .Laad_lastblock 540 + 541 + // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask 542 + // and gfpoly to both 128-bit lanes. 
543 + vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK 544 + vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY 545 + 546 + // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time. 547 + add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte 548 + jl .Laad_loop_4x_done 549 + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED 550 + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED 551 + .Laad_loop_4x: 552 + _ghash_4x AAD 553 + sub $-128, AAD 554 + add $-128, AADLEN 555 + jge .Laad_loop_4x 556 + .Laad_loop_4x_done: 557 + 558 + // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time. 559 + add $96, AADLEN 560 + jl .Laad_loop_1x_done 561 + .Laad_loop_1x: 562 + vmovdqu (AAD), TMP0 563 + vpshufb BSWAP_MASK, TMP0, TMP0 564 + vpxor TMP0, GHASH_ACC, GHASH_ACC 565 + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 566 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO 567 + vextracti128 $1, GHASH_ACC, TMP0_XMM 568 + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 569 + add $32, AAD 570 + sub $32, AADLEN 571 + jge .Laad_loop_1x 572 + .Laad_loop_1x_done: 573 + add $32, AADLEN 574 + // Now 0 <= AADLEN < 32. 575 + 576 + jz .Laad_done 577 + cmp $16, AADLEN 578 + jle .Laad_lastblock 579 + 580 + // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD. 581 + mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64. 
582 + vmovdqu (AAD), TMP0_XMM 583 + vmovdqu -16(AAD, AADLEN64), TMP1_XMM 584 + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM 585 + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 586 + lea .Lrshift_and_bswap_table(%rip), %rax 587 + vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM 588 + vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC 589 + vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 590 + _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO 591 + vextracti128 $1, GHASH_ACC, TMP0_XMM 592 + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 593 + jmp .Laad_done 594 + 595 + .Laad_lastblock: 596 + // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD. 597 + _load_partial_block AAD, TMP0_XMM, %r8, %r8d 598 + vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM 599 + vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 600 + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM 601 + _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ 602 + TMP1_XMM, TMP2_XMM, LO_XMM 603 + 604 + .Laad_done: 605 + // Store the updated GHASH accumulator back to memory. 606 + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 607 + 608 + vzeroupper 609 + RET 610 + SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2) 611 + 612 + // Do one non-last round of AES encryption on the blocks in the given AESDATA 613 + // vectors using the round key that has been broadcast to all 128-bit lanes of 614 + // \round_key. 615 + .macro _vaesenc round_key, vecs:vararg 616 + .irp i, \vecs 617 + vaesenc \round_key, AESDATA\i, AESDATA\i 618 + .endr 619 + .endm 620 + 621 + // Generate counter blocks in the given AESDATA vectors, then do the zero-th AES 622 + // round on them. Clobbers TMP0. 
623 + .macro _ctr_begin vecs:vararg 624 + vbroadcasti128 .Linc_2blocks(%rip), TMP0 625 + .irp i, \vecs 626 + vpshufb BSWAP_MASK, LE_CTR, AESDATA\i 627 + vpaddd TMP0, LE_CTR, LE_CTR 628 + .endr 629 + .irp i, \vecs 630 + vpxor RNDKEY0, AESDATA\i, AESDATA\i 631 + .endr 632 + .endm 633 + 634 + // Generate and encrypt counter blocks in the given AESDATA vectors, excluding 635 + // the last AES round. Clobbers %rax and TMP0. 636 + .macro _aesenc_loop vecs:vararg 637 + _ctr_begin \vecs 638 + lea 16(KEY), %rax 639 + .Laesenc_loop\@: 640 + vbroadcasti128 (%rax), TMP0 641 + _vaesenc TMP0, \vecs 642 + add $16, %rax 643 + cmp %rax, RNDKEYLAST_PTR 644 + jne .Laesenc_loop\@ 645 + .endm 646 + 647 + // Finalize the keystream blocks in the given AESDATA vectors by doing the last 648 + // AES round, then XOR those keystream blocks with the corresponding data. 649 + // Reduce latency by doing the XOR before the vaesenclast, utilizing the 650 + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0. 651 + .macro _aesenclast_and_xor vecs:vararg 652 + .irp i, \vecs 653 + vpxor \i*32(SRC), RNDKEYLAST, TMP0 654 + vaesenclast TMP0, AESDATA\i, AESDATA\i 655 + .endr 656 + .irp i, \vecs 657 + vmovdqu AESDATA\i, \i*32(DST) 658 + .endr 659 + .endm 660 + 661 + // void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 662 + // const u32 le_ctr[4], u8 ghash_acc[16], 663 + // const u8 *src, u8 *dst, int datalen); 664 + // 665 + // This macro generates a GCM encryption or decryption update function with the 666 + // above prototype (with \enc selecting which one). The function computes the 667 + // next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, 668 + // and writes the resulting encrypted or decrypted data to |dst|. It also 669 + // updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext 670 + // bytes. 
671 + // 672 + // |datalen| must be a multiple of 16, except on the last call where it can be 673 + // any length. The caller must do any buffering needed to ensure this. Both 674 + // in-place and out-of-place en/decryption are supported. 675 + // 676 + // |le_ctr| must give the current counter in little-endian format. This 677 + // function loads the counter from |le_ctr| and increments the loaded counter as 678 + // needed, but it does *not* store the updated counter back to |le_ctr|. The 679 + // caller must update |le_ctr| if any more data segments follow. Internally, 680 + // only the low 32-bit word of the counter is incremented, following the GCM 681 + // standard. 682 + .macro _aes_gcm_update enc 683 + 684 + // Function arguments 685 + .set KEY, %rdi 686 + .set LE_CTR_PTR, %rsi 687 + .set LE_CTR_PTR32, %esi 688 + .set GHASH_ACC_PTR, %rdx 689 + .set SRC, %rcx // Assumed to be %rcx. 690 + // See .Ltail_xor_and_ghash_1to16bytes 691 + .set DST, %r8 692 + .set DATALEN, %r9d 693 + .set DATALEN64, %r9 // Zero-extend DATALEN before using! 694 + 695 + // Additional local variables 696 + 697 + // %rax is used as a temporary register. LE_CTR_PTR is also available 698 + // as a temporary register after the counter is loaded. 699 + 700 + // AES key length in bytes 701 + .set AESKEYLEN, %r10d 702 + .set AESKEYLEN64, %r10 703 + 704 + // Pointer to the last AES round key for the chosen AES variant 705 + .set RNDKEYLAST_PTR, %r11 706 + 707 + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values 708 + // using vpshufb, copied to all 128-bit lanes. 709 + .set BSWAP_MASK, %ymm0 710 + .set BSWAP_MASK_XMM, %xmm0 711 + 712 + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, 713 + // only the lowest 128-bit lane can be nonzero. When not fully reduced, 714 + // more than one lane may be used, and they need to be XOR'd together. 715 + .set GHASH_ACC, %ymm1 716 + .set GHASH_ACC_XMM, %xmm1 717 + 718 + // TMP[0-2] are temporary registers. 
719 + .set TMP0, %ymm2 720 + .set TMP0_XMM, %xmm2 721 + .set TMP1, %ymm3 722 + .set TMP1_XMM, %xmm3 723 + .set TMP2, %ymm4 724 + .set TMP2_XMM, %xmm4 725 + 726 + // LO and MI are used to accumulate unreduced GHASH products. 727 + .set LO, %ymm5 728 + .set LO_XMM, %xmm5 729 + .set MI, %ymm6 730 + .set MI_XMM, %xmm6 731 + 732 + // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The 733 + // descending numbering reflects the order of the key powers. 734 + .set H_POW2_XORED, %ymm7 735 + .set H_POW2_XORED_XMM, %xmm7 736 + .set H_POW1_XORED, %ymm8 737 + 738 + // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. 739 + .set RNDKEY0, %ymm9 740 + .set RNDKEYLAST, %ymm10 741 + 742 + // LE_CTR contains the next set of little-endian counter blocks. 743 + .set LE_CTR, %ymm11 744 + 745 + // AESDATA[0-3] hold the counter blocks that are being encrypted by AES. 746 + .set AESDATA0, %ymm12 747 + .set AESDATA0_XMM, %xmm12 748 + .set AESDATA1, %ymm13 749 + .set AESDATA1_XMM, %xmm13 750 + .set AESDATA2, %ymm14 751 + .set AESDATA3, %ymm15 752 + 753 + .if \enc 754 + .set GHASHDATA_PTR, DST 755 + .else 756 + .set GHASHDATA_PTR, SRC 757 + .endif 758 + 759 + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK 760 + 761 + // Load the GHASH accumulator and the starting counter. 762 + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM 763 + vbroadcasti128 (LE_CTR_PTR), LE_CTR 764 + 765 + // Load the AES key length in bytes. 766 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 767 + 768 + // Make RNDKEYLAST_PTR point to the last AES round key. This is the 769 + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 770 + // respectively. Then load the zero-th and last round keys. 771 + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR 772 + vbroadcasti128 (KEY), RNDKEY0 773 + vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST 774 + 775 + // Finish initializing LE_CTR by adding 1 to the second block. 
776 + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR 777 + 778 + // If there are at least 128 bytes of data, then continue into the loop 779 + // that processes 128 bytes of data at a time. Otherwise skip it. 780 + add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte 781 + jl .Lcrypt_loop_4x_done\@ 782 + 783 + vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED 784 + vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED 785 + 786 + // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. 787 + 788 + .if \enc 789 + // Encrypt the first 4 vectors of plaintext blocks. 790 + _aesenc_loop 0,1,2,3 791 + _aesenclast_and_xor 0,1,2,3 792 + sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte 793 + add $-128, DATALEN 794 + jl .Lghash_last_ciphertext_4x\@ 795 + .endif 796 + 797 + .align 16 798 + .Lcrypt_loop_4x\@: 799 + 800 + // Start the AES encryption of the counter blocks. 801 + _ctr_begin 0,1,2,3 802 + cmp $24, AESKEYLEN 803 + jl 128f // AES-128? 804 + je 192f // AES-192? 805 + // AES-256 806 + vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0 807 + _vaesenc TMP0, 0,1,2,3 808 + vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0 809 + _vaesenc TMP0, 0,1,2,3 810 + 192: 811 + vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0 812 + _vaesenc TMP0, 0,1,2,3 813 + vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0 814 + _vaesenc TMP0, 0,1,2,3 815 + 128: 816 + 817 + // Finish the AES encryption of the counter blocks in AESDATA[0-3], 818 + // interleaved with the GHASH update of the ciphertext blocks. 
819 + .irp i, 9,8,7,6,5,4,3,2,1 820 + _ghash_step_4x (9 - \i), GHASHDATA_PTR 821 + vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0 822 + _vaesenc TMP0, 0,1,2,3 823 + .endr 824 + _ghash_step_4x 9, GHASHDATA_PTR 825 + .if \enc 826 + sub $-128, DST // 128 is 4 bytes, -128 is 1 byte 827 + .endif 828 + _aesenclast_and_xor 0,1,2,3 829 + sub $-128, SRC 830 + .if !\enc 831 + sub $-128, DST 832 + .endif 833 + add $-128, DATALEN 834 + jge .Lcrypt_loop_4x\@ 835 + 836 + .if \enc 837 + .Lghash_last_ciphertext_4x\@: 838 + // Update GHASH with the last set of ciphertext blocks. 839 + _ghash_4x DST 840 + sub $-128, DST 841 + .endif 842 + 843 + .Lcrypt_loop_4x_done\@: 844 + 845 + // Undo the extra subtraction by 128 and check whether data remains. 846 + sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte 847 + jz .Ldone\@ 848 + 849 + // The data length isn't a multiple of 128 bytes. Process the remaining 850 + // data of length 1 <= DATALEN < 128. 851 + // 852 + // Since there are enough key powers available for all remaining data, 853 + // there is no need to do a GHASH reduction after each iteration. 854 + // Instead, multiply each remaining block by its own key power, and only 855 + // do a GHASH reduction at the very end. 856 + 857 + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N 858 + // is the number of blocks that remain. 859 + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. 860 + .set POWERS_PTR32, LE_CTR_PTR32 861 + mov DATALEN, %eax 862 + neg %rax 863 + and $~15, %rax // -round_up(DATALEN, 16) 864 + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR 865 + 866 + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. 867 + .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused. 868 + .set HI_XMM, H_POW2_XORED_XMM 869 + vpxor LO_XMM, LO_XMM, LO_XMM 870 + vpxor MI_XMM, MI_XMM, MI_XMM 871 + vpxor HI_XMM, HI_XMM, HI_XMM 872 + 873 + // 1 <= DATALEN < 128. 
Generate 2 or 4 more vectors of keystream blocks 874 + // excluding the last AES round, depending on the remaining DATALEN. 875 + cmp $64, DATALEN 876 + jg .Ltail_gen_4_keystream_vecs\@ 877 + _aesenc_loop 0,1 878 + cmp $32, DATALEN 879 + jge .Ltail_xor_and_ghash_full_vec_loop\@ 880 + jmp .Ltail_xor_and_ghash_partial_vec\@ 881 + .Ltail_gen_4_keystream_vecs\@: 882 + _aesenc_loop 0,1,2,3 883 + 884 + // XOR the remaining data and accumulate the unreduced GHASH products 885 + // for DATALEN >= 32, starting with one full 32-byte vector at a time. 886 + .Ltail_xor_and_ghash_full_vec_loop\@: 887 + .if \enc 888 + _aesenclast_and_xor 0 889 + vpshufb BSWAP_MASK, AESDATA0, AESDATA0 890 + .else 891 + vmovdqu (SRC), TMP1 892 + vpxor TMP1, RNDKEYLAST, TMP0 893 + vaesenclast TMP0, AESDATA0, AESDATA0 894 + vmovdqu AESDATA0, (DST) 895 + vpshufb BSWAP_MASK, TMP1, AESDATA0 896 + .endif 897 + // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0. 898 + vpxor GHASH_ACC, AESDATA0, AESDATA0 899 + vmovdqu (POWERS_PTR), TMP2 900 + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 901 + vmovdqa AESDATA1, AESDATA0 902 + vmovdqa AESDATA2, AESDATA1 903 + vmovdqa AESDATA3, AESDATA2 904 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 905 + add $32, SRC 906 + add $32, DST 907 + add $32, POWERS_PTR 908 + sub $32, DATALEN 909 + cmp $32, DATALEN 910 + jge .Ltail_xor_and_ghash_full_vec_loop\@ 911 + test DATALEN, DATALEN 912 + jz .Ltail_ghash_reduce\@ 913 + 914 + .Ltail_xor_and_ghash_partial_vec\@: 915 + // XOR the remaining data and accumulate the unreduced GHASH products, 916 + // for 1 <= DATALEN < 32. 917 + vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 918 + cmp $16, DATALEN 919 + jle .Ltail_xor_and_ghash_1to16bytes\@ 920 + 921 + // Handle 17 <= DATALEN < 32. 922 + 923 + // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes 924 + // (shifting in zeroes), then reflect all 16 bytes. 
925 + lea .Lrshift_and_bswap_table(%rip), %rax 926 + vmovdqu -16(%rax, DATALEN64), TMP2_XMM 927 + 928 + // Move the second keystream block to its own register and left-align it 929 + vextracti128 $1, AESDATA0, AESDATA1_XMM 930 + vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM 931 + vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM 932 + 933 + // Using overlapping loads and stores, XOR the source data with the 934 + // keystream and write the destination data. Then prepare the GHASH 935 + // input data: the full ciphertext block and the zero-padded partial 936 + // ciphertext block, both byte-reflected, in AESDATA0. 937 + .if \enc 938 + vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM 939 + vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM 940 + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) 941 + vmovdqu AESDATA0_XMM, (DST) 942 + vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM 943 + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM 944 + .else 945 + vmovdqu -16(SRC, DATALEN64), TMP1_XMM 946 + vmovdqu (SRC), TMP0_XMM 947 + vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM 948 + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM 949 + vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) 950 + vmovdqu AESDATA0_XMM, (DST) 951 + vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM 952 + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM 953 + .endif 954 + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM 955 + vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0 956 + vmovdqu (POWERS_PTR), TMP2 957 + jmp .Ltail_ghash_last_vec\@ 958 + 959 + .Ltail_xor_and_ghash_1to16bytes\@: 960 + // Handle 1 <= DATALEN <= 16. Carefully load and store the 961 + // possibly-partial block, which we mustn't access out of bounds. 
962 + vmovdqu (POWERS_PTR), TMP2_XMM 963 + mov SRC, KEY // Free up %rcx, assuming SRC == %rcx 964 + mov DATALEN, %ecx 965 + _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32 966 + vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM 967 + mov DATALEN, %ecx 968 + _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32 969 + .if \enc 970 + lea .Lselect_high_bytes_table(%rip), %rax 971 + vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM 972 + vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM 973 + .else 974 + vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM 975 + .endif 976 + vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM 977 + 978 + .Ltail_ghash_last_vec\@: 979 + // Accumulate the unreduced GHASH products for the last 1-2 blocks. The 980 + // GHASH input data is in AESDATA0. If only one block remains, then the 981 + // second block in AESDATA0 is zero and does not affect the result. 982 + _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 983 + 984 + .Ltail_ghash_reduce\@: 985 + // Finally, do the GHASH reduction. 986 + vbroadcasti128 .Lgfpoly(%rip), TMP0 987 + _ghash_reduce LO, MI, HI, TMP0, TMP1 988 + vextracti128 $1, HI, GHASH_ACC_XMM 989 + vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM 990 + 991 + .Ldone\@: 992 + // Store the updated GHASH accumulator back to memory. 993 + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) 994 + 995 + vzeroupper 996 + RET 997 + .endm 998 + 999 + // void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 1000 + // const u32 le_ctr[4], u8 ghash_acc[16], 1001 + // u64 total_aadlen, u64 total_datalen); 1002 + // bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 1003 + // const u32 le_ctr[4], const u8 ghash_acc[16], 1004 + // u64 total_aadlen, u64 total_datalen, 1005 + // const u8 tag[16], int taglen); 1006 + // 1007 + // This macro generates one of the above two functions (with \enc selecting 1008 + // which one). 
Both functions finish computing the GCM authentication tag by 1009 + // updating GHASH with the lengths block and encrypting the GHASH accumulator. 1010 + // |total_aadlen| and |total_datalen| must be the total length of the additional 1011 + // authenticated data and the en/decrypted data in bytes, respectively. 1012 + // 1013 + // The encryption function then stores the full-length (16-byte) computed 1014 + // authentication tag to |ghash_acc|. The decryption function instead loads the 1015 + // expected authentication tag (the one that was transmitted) from the 16-byte 1016 + // buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the 1017 + // computed tag in constant time, and returns true if and only if they match. 1018 + .macro _aes_gcm_final enc 1019 + 1020 + // Function arguments 1021 + .set KEY, %rdi 1022 + .set LE_CTR_PTR, %rsi 1023 + .set GHASH_ACC_PTR, %rdx 1024 + .set TOTAL_AADLEN, %rcx 1025 + .set TOTAL_DATALEN, %r8 1026 + .set TAG, %r9 1027 + .set TAGLEN, %r10d // Originally at 8(%rsp) 1028 + .set TAGLEN64, %r10 1029 + 1030 + // Additional local variables. 1031 + // %rax and %xmm0-%xmm3 are used as temporary registers. 1032 + .set AESKEYLEN, %r11d 1033 + .set AESKEYLEN64, %r11 1034 + .set GFPOLY, %xmm4 1035 + .set BSWAP_MASK, %xmm5 1036 + .set LE_CTR, %xmm6 1037 + .set GHASH_ACC, %xmm7 1038 + .set H_POW1, %xmm8 1039 + 1040 + // Load some constants. 1041 + vmovdqa .Lgfpoly(%rip), GFPOLY 1042 + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK 1043 + 1044 + // Load the AES key length in bytes. 1045 + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN 1046 + 1047 + // Set up a counter block with 1 in the low 32-bit word. This is the 1048 + // counter that produces the ciphertext needed to encrypt the auth tag. 1049 + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. 1050 + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR 1051 + 1052 + // Build the lengths block and XOR it with the GHASH accumulator. 
1053 + // Although the lengths block is defined as the AAD length followed by 1054 + // the en/decrypted data length, both in big-endian byte order, a byte 1055 + // reflection of the full block is needed because of the way we compute 1056 + // GHASH (see _ghash_mul_step). By using little-endian values in the 1057 + // opposite order, we avoid having to reflect any bytes here. 1058 + vmovq TOTAL_DATALEN, %xmm0 1059 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 1060 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits 1061 + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC 1062 + 1063 + // Load the first hash key power (H^1), which is stored last. 1064 + vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1 1065 + 1066 + // Load TAGLEN if decrypting. 1067 + .if !\enc 1068 + movl 8(%rsp), TAGLEN 1069 + .endif 1070 + 1071 + // Make %rax point to the last AES round key for the chosen AES variant. 1072 + lea 6*16(KEY,AESKEYLEN64,4), %rax 1073 + 1074 + // Start the AES encryption of the counter block by swapping the counter 1075 + // block to big-endian and XOR-ing it with the zero-th AES round key. 1076 + vpshufb BSWAP_MASK, LE_CTR, %xmm0 1077 + vpxor (KEY), %xmm0, %xmm0 1078 + 1079 + // Complete the AES encryption and multiply GHASH_ACC by H^1. 1080 + // Interleave the AES and GHASH instructions to improve performance. 1081 + cmp $24, AESKEYLEN 1082 + jl 128f // AES-128? 1083 + je 192f // AES-192? 1084 + // AES-256 1085 + vaesenc -13*16(%rax), %xmm0, %xmm0 1086 + vaesenc -12*16(%rax), %xmm0, %xmm0 1087 + 192: 1088 + vaesenc -11*16(%rax), %xmm0, %xmm0 1089 + vaesenc -10*16(%rax), %xmm0, %xmm0 1090 + 128: 1091 + .irp i, 0,1,2,3,4,5,6,7,8 1092 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1093 + %xmm1, %xmm2, %xmm3 1094 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 1095 + .endr 1096 + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ 1097 + %xmm1, %xmm2, %xmm3 1098 + 1099 + // Undo the byte reflection of the GHASH accumulator. 
1100 + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC 1101 + 1102 + // Do the last AES round and XOR the resulting keystream block with the 1103 + // GHASH accumulator to produce the full computed authentication tag. 1104 + // 1105 + // Reduce latency by taking advantage of the property vaesenclast(key, 1106 + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last 1107 + // round key, instead of XOR'ing the final AES output with GHASH_ACC. 1108 + // 1109 + // enc_final then returns the computed auth tag, while dec_final 1110 + // compares it with the transmitted one and returns a bool. To compare 1111 + // the tags, dec_final XORs them together and uses vptest to check 1112 + // whether the result is all-zeroes. This should be constant-time. 1113 + // dec_final applies the vaesenclast optimization to this additional 1114 + // value XOR'd too. 1115 + .if \enc 1116 + vpxor (%rax), GHASH_ACC, %xmm1 1117 + vaesenclast %xmm1, %xmm0, GHASH_ACC 1118 + vmovdqu GHASH_ACC, (GHASH_ACC_PTR) 1119 + .else 1120 + vpxor (TAG), GHASH_ACC, GHASH_ACC 1121 + vpxor (%rax), GHASH_ACC, GHASH_ACC 1122 + vaesenclast GHASH_ACC, %xmm0, %xmm0 1123 + lea .Lselect_high_bytes_table(%rip), %rax 1124 + vmovdqu (%rax, TAGLEN64), %xmm1 1125 + vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high 1126 + xor %eax, %eax 1127 + vptest %xmm1, %xmm0 1128 + sete %al 1129 + .endif 1130 + // No need for vzeroupper here, since only xmm registers were used. 1131 + RET 1132 + .endm 1133 + 1134 + SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2) 1135 + _aes_gcm_update 1 1136 + SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2) 1137 + SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2) 1138 + _aes_gcm_update 0 1139 + SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2) 1140 + 1141 + SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2) 1142 + _aes_gcm_final 1 1143 + SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2) 1144 + SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2) 1145 + _aes_gcm_final 0 1146 + SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2)
+165 -99
arch/x86/crypto/aesni-intel_glue.c
··· 874 874 #define AES_GCM_KEY_AESNI_SIZE \ 875 875 (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) 876 876 877 - /* Key struct used by the VAES + AVX10 implementations of AES-GCM */ 878 - struct aes_gcm_key_avx10 { 877 + /* Key struct used by the VAES + AVX2 implementation of AES-GCM */ 878 + struct aes_gcm_key_vaes_avx2 { 879 + /* 880 + * Common part of the key. The assembly code prefers 16-byte alignment 881 + * for the round keys; we get this by them being located at the start of 882 + * the struct and the whole struct being 32-byte aligned. 883 + */ 884 + struct aes_gcm_key base; 885 + 886 + /* 887 + * Powers of the hash key H^8 through H^1. These are 128-bit values. 888 + * They all have an extra factor of x^-1 and are byte-reversed. 889 + * The assembly code prefers 32-byte alignment for this. 890 + */ 891 + u64 h_powers[8][2] __aligned(32); 892 + 893 + /* 894 + * Each entry in this array contains the two halves of an entry of 895 + * h_powers XOR'd together, in the following order: 896 + * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. 897 + * This is used for Karatsuba multiplication. 898 + */ 899 + u64 h_powers_xored[8]; 900 + }; 901 + 902 + #define AES_GCM_KEY_VAES_AVX2(key) \ 903 + container_of((key), struct aes_gcm_key_vaes_avx2, base) 904 + #define AES_GCM_KEY_VAES_AVX2_SIZE \ 905 + (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) 906 + 907 + /* Key struct used by the VAES + AVX512 implementation of AES-GCM */ 908 + struct aes_gcm_key_vaes_avx512 { 879 909 /* 880 910 * Common part of the key. 
The assembly code prefers 16-byte alignment 881 911 * for the round keys; we get this by them being located at the start of ··· 925 895 /* Three padding blocks required by the assembly code */ 926 896 u64 padding[3][2]; 927 897 }; 928 - #define AES_GCM_KEY_AVX10(key) \ 929 - container_of((key), struct aes_gcm_key_avx10, base) 930 - #define AES_GCM_KEY_AVX10_SIZE \ 931 - (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) 898 + #define AES_GCM_KEY_VAES_AVX512(key) \ 899 + container_of((key), struct aes_gcm_key_vaes_avx512, base) 900 + #define AES_GCM_KEY_VAES_AVX512_SIZE \ 901 + (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) 932 902 933 903 /* 934 904 * These flags are passed to the AES-GCM helper functions to specify the ··· 940 910 #define FLAG_RFC4106 BIT(0) 941 911 #define FLAG_ENC BIT(1) 942 912 #define FLAG_AVX BIT(2) 943 - #define FLAG_AVX10_256 BIT(3) 944 - #define FLAG_AVX10_512 BIT(4) 913 + #define FLAG_VAES_AVX2 BIT(3) 914 + #define FLAG_VAES_AVX512 BIT(4) 945 915 946 916 static inline struct aes_gcm_key * 947 917 aes_gcm_key_get(struct crypto_aead *tfm, int flags) 948 918 { 949 - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 919 + if (flags & FLAG_VAES_AVX512) 950 920 return PTR_ALIGN(crypto_aead_ctx(tfm), 64); 921 + else if (flags & FLAG_VAES_AVX2) 922 + return PTR_ALIGN(crypto_aead_ctx(tfm), 32); 951 923 else 952 924 return PTR_ALIGN(crypto_aead_ctx(tfm), 16); 953 925 } ··· 959 927 asmlinkage void 960 928 aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); 961 929 asmlinkage void 962 - aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); 930 + aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); 963 931 asmlinkage void 964 - aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); 932 + aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); 965 933 966 934 static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) 967 935 { 968 - /* 969 - * To 
make things a bit easier on the assembly side, the AVX10 970 - * implementations use the same key format. Therefore, a single 971 - * function using 256-bit vectors would suffice here. However, it's 972 - * straightforward to provide a 512-bit one because of how the assembly 973 - * code is structured, and it works nicely because the total size of the 974 - * key powers is a multiple of 512 bits. So we take advantage of that. 975 - * 976 - * A similar situation applies to the AES-NI implementations. 977 - */ 978 - if (flags & FLAG_AVX10_512) 979 - aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); 980 - else if (flags & FLAG_AVX10_256) 981 - aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); 936 + if (flags & FLAG_VAES_AVX512) 937 + aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); 938 + else if (flags & FLAG_VAES_AVX2) 939 + aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); 982 940 else if (flags & FLAG_AVX) 983 941 aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); 984 942 else ··· 982 960 aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, 983 961 u8 ghash_acc[16], const u8 *aad, int aadlen); 984 962 asmlinkage void 985 - aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, 986 - u8 ghash_acc[16], const u8 *aad, int aadlen); 963 + aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 964 + u8 ghash_acc[16], const u8 *aad, int aadlen); 965 + asmlinkage void 966 + aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 967 + u8 ghash_acc[16], const u8 *aad, int aadlen); 987 968 988 969 static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], 989 970 const u8 *aad, int aadlen, int flags) 990 971 { 991 - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 992 - aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, 993 - aad, aadlen); 972 + if (flags & FLAG_VAES_AVX512) 973 + aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), 974 
+ ghash_acc, aad, aadlen); 975 + else if (flags & FLAG_VAES_AVX2) 976 + aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), 977 + ghash_acc, aad, aadlen); 994 978 else if (flags & FLAG_AVX) 995 979 aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, 996 980 aad, aadlen); ··· 1014 986 const u32 le_ctr[4], u8 ghash_acc[16], 1015 987 const u8 *src, u8 *dst, int datalen); 1016 988 asmlinkage void 1017 - aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, 1018 - const u32 le_ctr[4], u8 ghash_acc[16], 1019 - const u8 *src, u8 *dst, int datalen); 989 + aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 990 + const u32 le_ctr[4], u8 ghash_acc[16], 991 + const u8 *src, u8 *dst, int datalen); 1020 992 asmlinkage void 1021 - aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, 1022 - const u32 le_ctr[4], u8 ghash_acc[16], 1023 - const u8 *src, u8 *dst, int datalen); 993 + aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 994 + const u32 le_ctr[4], u8 ghash_acc[16], 995 + const u8 *src, u8 *dst, int datalen); 1024 996 1025 997 asmlinkage void 1026 998 aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, ··· 1031 1003 const u32 le_ctr[4], u8 ghash_acc[16], 1032 1004 const u8 *src, u8 *dst, int datalen); 1033 1005 asmlinkage void 1034 - aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, 1035 - const u32 le_ctr[4], u8 ghash_acc[16], 1036 - const u8 *src, u8 *dst, int datalen); 1006 + aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 1007 + const u32 le_ctr[4], u8 ghash_acc[16], 1008 + const u8 *src, u8 *dst, int datalen); 1037 1009 asmlinkage void 1038 - aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, 1039 - const u32 le_ctr[4], u8 ghash_acc[16], 1040 - const u8 *src, u8 *dst, int datalen); 1010 + aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 1011 + const u32 le_ctr[4], u8 
ghash_acc[16], 1012 + const u8 *src, u8 *dst, int datalen); 1041 1013 1042 1014 /* __always_inline to optimize out the branches based on @flags */ 1043 1015 static __always_inline void ··· 1046 1018 const u8 *src, u8 *dst, int datalen, int flags) 1047 1019 { 1048 1020 if (flags & FLAG_ENC) { 1049 - if (flags & FLAG_AVX10_512) 1050 - aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), 1051 - le_ctr, ghash_acc, 1052 - src, dst, datalen); 1053 - else if (flags & FLAG_AVX10_256) 1054 - aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), 1055 - le_ctr, ghash_acc, 1056 - src, dst, datalen); 1021 + if (flags & FLAG_VAES_AVX512) 1022 + aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), 1023 + le_ctr, ghash_acc, 1024 + src, dst, datalen); 1025 + else if (flags & FLAG_VAES_AVX2) 1026 + aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), 1027 + le_ctr, ghash_acc, 1028 + src, dst, datalen); 1057 1029 else if (flags & FLAG_AVX) 1058 1030 aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), 1059 1031 le_ctr, ghash_acc, ··· 1062 1034 aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, 1063 1035 ghash_acc, src, dst, datalen); 1064 1036 } else { 1065 - if (flags & FLAG_AVX10_512) 1066 - aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), 1067 - le_ctr, ghash_acc, 1068 - src, dst, datalen); 1069 - else if (flags & FLAG_AVX10_256) 1070 - aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), 1071 - le_ctr, ghash_acc, 1072 - src, dst, datalen); 1037 + if (flags & FLAG_VAES_AVX512) 1038 + aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), 1039 + le_ctr, ghash_acc, 1040 + src, dst, datalen); 1041 + else if (flags & FLAG_VAES_AVX2) 1042 + aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), 1043 + le_ctr, ghash_acc, 1044 + src, dst, datalen); 1073 1045 else if (flags & FLAG_AVX) 1074 1046 aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), 1075 1047 le_ctr, ghash_acc, ··· 1090 1062 const u32 le_ctr[4], u8 ghash_acc[16], 1091 1063 
u64 total_aadlen, u64 total_datalen); 1092 1064 asmlinkage void 1093 - aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1094 - const u32 le_ctr[4], u8 ghash_acc[16], 1095 - u64 total_aadlen, u64 total_datalen); 1065 + aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 1066 + const u32 le_ctr[4], u8 ghash_acc[16], 1067 + u64 total_aadlen, u64 total_datalen); 1068 + asmlinkage void 1069 + aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 1070 + const u32 le_ctr[4], u8 ghash_acc[16], 1071 + u64 total_aadlen, u64 total_datalen); 1096 1072 1097 1073 /* __always_inline to optimize out the branches based on @flags */ 1098 1074 static __always_inline void ··· 1104 1072 const u32 le_ctr[4], u8 ghash_acc[16], 1105 1073 u64 total_aadlen, u64 total_datalen, int flags) 1106 1074 { 1107 - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1108 - aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), 1109 - le_ctr, ghash_acc, 1110 - total_aadlen, total_datalen); 1075 + if (flags & FLAG_VAES_AVX512) 1076 + aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), 1077 + le_ctr, ghash_acc, 1078 + total_aadlen, total_datalen); 1079 + else if (flags & FLAG_VAES_AVX2) 1080 + aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), 1081 + le_ctr, ghash_acc, 1082 + total_aadlen, total_datalen); 1111 1083 else if (flags & FLAG_AVX) 1112 1084 aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), 1113 1085 le_ctr, ghash_acc, ··· 1133 1097 u64 total_aadlen, u64 total_datalen, 1134 1098 const u8 tag[16], int taglen); 1135 1099 asmlinkage bool __must_check 1136 - aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, 1137 - const u32 le_ctr[4], const u8 ghash_acc[16], 1138 - u64 total_aadlen, u64 total_datalen, 1139 - const u8 tag[16], int taglen); 1100 + aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, 1101 + const u32 le_ctr[4], const u8 ghash_acc[16], 1102 + u64 total_aadlen, u64 total_datalen, 1103 + const 
u8 tag[16], int taglen); 1104 + asmlinkage bool __must_check 1105 + aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, 1106 + const u32 le_ctr[4], const u8 ghash_acc[16], 1107 + u64 total_aadlen, u64 total_datalen, 1108 + const u8 tag[16], int taglen); 1140 1109 1141 1110 /* __always_inline to optimize out the branches based on @flags */ 1142 1111 static __always_inline bool __must_check ··· 1149 1108 u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, 1150 1109 u8 tag[16], int taglen, int flags) 1151 1110 { 1152 - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) 1153 - return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), 1154 - le_ctr, ghash_acc, 1155 - total_aadlen, total_datalen, 1156 - tag, taglen); 1111 + if (flags & FLAG_VAES_AVX512) 1112 + return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), 1113 + le_ctr, ghash_acc, 1114 + total_aadlen, total_datalen, 1115 + tag, taglen); 1116 + else if (flags & FLAG_VAES_AVX2) 1117 + return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), 1118 + le_ctr, ghash_acc, 1119 + total_aadlen, total_datalen, 1120 + tag, taglen); 1157 1121 else if (flags & FLAG_AVX) 1158 1122 return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), 1159 1123 le_ctr, ghash_acc, ··· 1241 1195 BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); 1242 1196 BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); 1243 1197 BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); 1244 - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); 1245 - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); 1246 - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); 1247 - BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); 1198 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); 1199 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, 
base.aes_key.key_length) != 480); 1200 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); 1201 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); 1202 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); 1203 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); 1204 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); 1205 + BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); 1248 1206 1249 1207 if (likely(crypto_simd_usable())) { 1250 1208 err = aes_check_keylen(keylen); ··· 1281 1231 gf128mul_lle(&h, (const be128 *)x_to_the_minus1); 1282 1232 1283 1233 /* Compute the needed key powers */ 1284 - if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { 1285 - struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); 1234 + if (flags & FLAG_VAES_AVX512) { 1235 + struct aes_gcm_key_vaes_avx512 *k = 1236 + AES_GCM_KEY_VAES_AVX512(key); 1286 1237 1287 1238 for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { 1288 1239 k->h_powers[i][0] = be64_to_cpu(h.b); ··· 1291 1240 gf128mul_lle(&h, &h1); 1292 1241 } 1293 1242 memset(k->padding, 0, sizeof(k->padding)); 1243 + } else if (flags & FLAG_VAES_AVX2) { 1244 + struct aes_gcm_key_vaes_avx2 *k = 1245 + AES_GCM_KEY_VAES_AVX2(key); 1246 + static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; 1247 + 1248 + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { 1249 + k->h_powers[i][0] = be64_to_cpu(h.b); 1250 + k->h_powers[i][1] = be64_to_cpu(h.a); 1251 + gf128mul_lle(&h, &h1); 1252 + } 1253 + for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { 1254 + int j = indices[i]; 1255 + 1256 + k->h_powers_xored[i] = k->h_powers[j][0] ^ 1257 + k->h_powers[j][1]; 1258 + } 1294 1259 } else { 1295 1260 struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); 1296 1261 ··· 1575 1508 "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", 1576 1509 AES_GCM_KEY_AESNI_SIZE, 500); 1577 1510 1578 - /* 
aes_gcm_algs_vaes_avx10_256 */ 1579 - DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, 1580 - "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", 1581 - AES_GCM_KEY_AVX10_SIZE, 700); 1511 + /* aes_gcm_algs_vaes_avx2 */ 1512 + DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, 1513 + "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", 1514 + AES_GCM_KEY_VAES_AVX2_SIZE, 600); 1582 1515 1583 - /* aes_gcm_algs_vaes_avx10_512 */ 1584 - DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, 1585 - "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", 1586 - AES_GCM_KEY_AVX10_SIZE, 800); 1516 + /* aes_gcm_algs_vaes_avx512 */ 1517 + DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, 1518 + "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", 1519 + AES_GCM_KEY_VAES_AVX512_SIZE, 800); 1587 1520 1588 1521 static int __init register_avx_algs(void) 1589 1522 { ··· 1615 1548 ARRAY_SIZE(skcipher_algs_vaes_avx2)); 1616 1549 if (err) 1617 1550 return err; 1551 + err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, 1552 + ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); 1553 + if (err) 1554 + return err; 1618 1555 1619 1556 if (!boot_cpu_has(X86_FEATURE_AVX512BW) || 1620 1557 !boot_cpu_has(X86_FEATURE_AVX512VL) || ··· 1627 1556 XFEATURE_MASK_AVX512, NULL)) 1628 1557 return 0; 1629 1558 1630 - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, 1631 - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); 1632 - if (err) 1633 - return err; 1634 - 1635 1559 if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { 1636 1560 int i; 1637 1561 1638 1562 for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) 1639 1563 skcipher_algs_vaes_avx512[i].base.cra_priority = 1; 1640 - for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) 1641 - aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; 1564 + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) 1565 + aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; 1642 1566 } 1643 1567 1644 1568 err = crypto_register_skciphers(skcipher_algs_vaes_avx512, 1645 1569 
ARRAY_SIZE(skcipher_algs_vaes_avx512)); 1646 1570 if (err) 1647 1571 return err; 1648 - err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, 1649 - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); 1572 + err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, 1573 + ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); 1650 1574 if (err) 1651 1575 return err; 1652 1576 ··· 1661 1595 unregister_aeads(aes_gcm_algs_aesni_avx); 1662 1596 unregister_skciphers(skcipher_algs_vaes_avx2); 1663 1597 unregister_skciphers(skcipher_algs_vaes_avx512); 1664 - unregister_aeads(aes_gcm_algs_vaes_avx10_256); 1665 - unregister_aeads(aes_gcm_algs_vaes_avx10_512); 1598 + unregister_aeads(aes_gcm_algs_vaes_avx2); 1599 + unregister_aeads(aes_gcm_algs_vaes_avx512); 1666 1600 } 1667 1601 #else /* CONFIG_X86_64 */ 1668 1602 static struct aead_alg aes_gcm_algs_aesni[0];