crypto: x86/aes-ctr - rewrite AESNI+AVX optimized CTR and add VAES support

+1 -1

arch/x86/crypto/Makefile

··· 48 48 49 49 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 50 50 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 51 - aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ 51 + aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ 52 52 aes-gcm-aesni-x86_64.o \ 53 53 aes-xts-avx-x86_64.o 54 54 ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)

+592

arch/x86/crypto/aes-ctr-avx-x86_64.S

··· 1 + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ 2 + // 3 + // Copyright 2025 Google LLC 4 + // 5 + // Author: Eric Biggers <ebiggers@google.com> 6 + // 7 + // This file is dual-licensed, meaning that you can use it under your choice of 8 + // either of the following two licenses: 9 + // 10 + // Licensed under the Apache License 2.0 (the "License"). You may obtain a copy 11 + // of the License at 12 + // 13 + // http://www.apache.org/licenses/LICENSE-2.0 14 + // 15 + // Unless required by applicable law or agreed to in writing, software 16 + // distributed under the License is distributed on an "AS IS" BASIS, 17 + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 + // See the License for the specific language governing permissions and 19 + // limitations under the License. 20 + // 21 + // or 22 + // 23 + // Redistribution and use in source and binary forms, with or without 24 + // modification, are permitted provided that the following conditions are met: 25 + // 26 + // 1. Redistributions of source code must retain the above copyright notice, 27 + // this list of conditions and the following disclaimer. 28 + // 29 + // 2. Redistributions in binary form must reproduce the above copyright 30 + // notice, this list of conditions and the following disclaimer in the 31 + // documentation and/or other materials provided with the distribution. 32 + // 33 + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 34 + // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 + // ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
37 + // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
38 + // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
39 + // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40 + // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
41 + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
42 + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
43 + // POSSIBILITY OF SUCH DAMAGE.
44 + //
45 + //------------------------------------------------------------------------------
46 + //
47 + // This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
48 + // using the following sets of CPU features:
49 + // - AES-NI && AVX
50 + // - VAES && AVX2
51 + // - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
52 + // - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
53 + //
54 + // See the function definitions at the bottom of the file for more information.
55 +
56 + #include <linux/linkage.h>
57 + #include <linux/cfi_types.h>
58 +
59 + .section .rodata
60 + .p2align 4                // 2^4 = 16-byte alignment for the constants below
61 +
     // vpshufb control that reverses the 16 bytes of each 128-bit lane.
     // _aes_ctr_crypt loads it into BSWAP_MASK (regular CTR only) so that the
     // little endian counter blocks can be converted to big endian.
62 + .Lbswap_mask:
63 +     .octa 0x000102030405060708090a0b0c0d0e0f
64 +
     // Table of four consecutive 128-bit values 0, 1, 2, 3, each stored as a
     // low quadword followed by a zero high quadword.  _aes_ctr_crypt adds the
     // first 32 or 64 bytes of it (vpaddq) to a ymm/zmm counter vector so that
     // each 128-bit lane gets its own block index.  .Lone and .Ltwo label
     // entries inside the table; together with .Lfour they are broadcast as
     // the per-vector counter increment for VL == 16, 32 and 64 respectively.
65 + .Lctr_pattern:
66 +     .quad 0, 0
67 + .Lone:
68 +     .quad 1, 0
69 + .Ltwo:
70 +     .quad 2, 0
71 +     .quad 3, 0
72 +
73 + .Lfour:
74 +     .quad 4, 0
75 +
76 + .text
77 +
78 + // Move a vector between memory and a register.
79 + // The register operand must be in the first 16 vector registers.
     // Expands according to the VL symbol that is current at expansion time.
     // In this file it is only used to store AESDATA vectors to DST.
     // NOTE(review): the "first 16 registers" limit presumably exists because
     // the VL < 64 form is the non-EVEX instruction -- confirm.
80 + .macro _vmovdqu src, dst
81 + .if VL < 64               // xmm or ymm operands
82 +     vmovdqu \src, \dst
83 + .else                     // VL == 64: zmm operands
84 +     vmovdqu8 \src, \dst
85 + .endif
86 + .endm
87 +
88 + // Move a vector between registers.
89 + // The registers must be in the first 16 vector registers. 
90 + .macro _vmovdqa src, dst
91 + .if VL < 64               // xmm or ymm operands
92 +     vmovdqa \src, \dst
93 + .else                     // VL == 64: zmm operands
94 +     vmovdqa64 \src, \dst
95 + .endif
96 + .endm
97 +
98 + // Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
99 + // register. The register operand must be in the first 16 vector registers.
100 + .macro _vbroadcast128 src, dst
101 + .if VL == 16             // a single lane: a plain 128-bit load suffices
102 +     vmovdqu \src, \dst
103 + .elseif VL == 32         // two lanes
104 +     vbroadcasti128 \src, \dst
105 + .else                    // VL == 64: four lanes
106 +     vbroadcasti32x4 \src, \dst
107 + .endif
108 + .endm
109 +
110 + // XOR two vectors together.
111 + // Any register operands must be in the first 16 vector registers.
      // \src1 may be a memory operand; callers in this file pass \i*VL(SRC).
112 + .macro _vpxor src1, src2, dst
113 + .if VL < 64
114 +     vpxor \src1, \src2, \dst
115 + .else
116 +     vpxord \src1, \src2, \dst
117 + .endif
118 + .endm
119 +
120 + // Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
121 + // and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
      //
      // The data is fetched with at most two loads, at offsets 0 and
      // LEN - {8,4,2}.  They may overlap each other but both stay inside the
      // LEN-byte buffer; the overlap is then shifted out or merged with OR.
      // \tmp64 and \tmp32 must be the 64-bit and 32-bit names of the same
      // scratch register (the caller in this file passes KEY and KEY32).
      // The \@ suffix makes the labels unique per macro expansion.
122 + .macro _load_partial_block src, dst, tmp64, tmp32
123 +     sub $8, %ecx // LEN - 8
124 +     jle .Lle8\@
125 +
126 +     // Load 9 <= LEN <= 15 bytes.
127 +     vmovq (\src), \dst // Load first 8 bytes
128 +     mov (\src, %rcx), %rax // Load last 8 bytes
129 +     neg %ecx               // 8 - LEN
130 +     shl $3, %ecx           // 8*(8 - LEN); shr uses only the low 6 bits of %cl
                                 // for a 64-bit operand, i.e. 8*(16 - LEN)
131 +     shr %cl, %rax // Discard overlapping bytes
132 +     vpinsrq $1, %rax, \dst, \dst   // bytes 8..LEN-1 go into the high qword
133 +     jmp .Ldone\@
134 +
135 + .Lle8\@:
136 +     add $4, %ecx // LEN - 4
137 +     jl .Llt4\@
138 +
139 +     // Load 4 <= LEN <= 8 bytes.
140 +     mov (\src), %eax // Load first 4 bytes
141 +     mov (\src, %rcx), \tmp32 // Load last 4 bytes
142 +     jmp .Lcombine\@
143 +
144 + .Llt4\@:
145 +     // Load 1 <= LEN <= 3 bytes.
146 +     add $2, %ecx // LEN - 2
147 +     movzbl (\src), %eax // Load first byte
148 +     jl .Lmovq\@            // flags still come from the add: LEN == 1, so the
                                 // first byte is the whole block
149 +     movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
150 + .Lcombine\@:
151 +     shl $3, %ecx           // bit offset of the second load: 8*(LEN - 4) or
                                 // 8*(LEN - 2)
152 +     shl %cl, \tmp64        // move the second part to its byte position
153 +     or \tmp64, %rax // Combine the two parts
154 + .Lmovq\@:
155 +     vmovq %rax, \dst
156 + .Ldone\@:
157 + .endm
158 +
159 + // Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
160 + // Clobbers %rax, %rcx, and \tmp{64,32}.
      //
      // Mirror image of _load_partial_block: for LEN >= 4 two possibly
      // overlapping stores are issued, the one covering the end of the buffer
      // first and the one at offset 0 second, so the second store fixes up the
      // overlapping bytes.  Nothing is written at or beyond \dst + LEN.
      // \tmp64 and \tmp32 must name the same scratch register.
161 + .macro _store_partial_block src, dst, tmp64, tmp32
162 +     sub $8, %ecx // LEN - 8
163 +     jl .Llt8\@
164 +
165 +     // Store 8 <= LEN <= 15 bytes.
166 +     vpextrq $1, \src, %rax         // bytes 8..15 of \src
167 +     mov %ecx, \tmp32               // keep LEN - 8 as the store offset
168 +     shl $3, %ecx                   // 8*(LEN - 8) bits
169 +     ror %cl, %rax                  // rotate bytes 8..LEN-1 of \src into the
                                         // last LEN - 8 byte positions
170 +     mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes
171 +     vmovq \src, (\dst) // Store first 8 bytes
172 +     jmp .Ldone\@
173 +
174 + .Llt8\@:
175 +     add $4, %ecx // LEN - 4
176 +     jl .Llt4\@
177 +
178 +     // Store 4 <= LEN <= 7 bytes.
179 +     vpextrd $1, \src, %eax         // bytes 4..7 of \src
180 +     mov %ecx, \tmp32               // keep LEN - 4 as the store offset
181 +     shl $3, %ecx                   // 8*(LEN - 4) bits
182 +     ror %cl, %eax                  // same rotation trick on 32 bits
183 +     mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes
184 +     vmovd \src, (\dst) // Store first 4 bytes
185 +     jmp .Ldone\@
186 +
187 + .Llt4\@:
188 +     // Store 1 <= LEN <= 3 bytes.
189 +     vpextrb $0, \src, 0(\dst)
190 +     cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
191 +     jl .Ldone\@                    // LEN - 4 == -3, i.e. LEN == 1
192 +     vpextrb $1, \src, 1(\dst)
193 +     je .Ldone\@                    // still the flags of the cmp: LEN == 2
194 +     vpextrb $2, \src, 2(\dst)
195 + .Ldone\@:
196 + .endm
197 +
198 + // Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
199 + // XOR each with the zero-th round key. Also update LE_CTR if !\final. 
      // \is_xctr selects the XCTR form (counter ^ XCTR_IV) or the regular CTR
      // form (byte-swapped counter).  \i0 and \i1 are the AESDATA register
      // indices to fill.  LE_CTR is the counter for \i0; \i1 uses
      // LE_CTR + LE_CTR_INC1.  Unless \final is set, LE_CTR is then advanced by
      // LE_CTR_INC2, i.e. past both vectors.
200 + .macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
201 + .if \is_xctr
202 +   .if USE_AVX10
          // vpternlogd $0x96 computes dst ^ src1 ^ src2, so together with the
          // preceding copy this yields the same LE_CTR ^ XCTR_IV ^ RNDKEY0 as
          // the two vpxor in the .else branch.
203 +     _vmovdqa LE_CTR, AESDATA\i0
204 +     vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
205 +   .else
206 +     vpxor XCTR_IV, LE_CTR, AESDATA\i0
207 +     vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
208 +   .endif
209 +     vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1   // counter for the second vector
210 +
211 +   .if USE_AVX10
212 +     vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
213 +   .else
214 +     vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
215 +     vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
216 +   .endif
217 + .else
          // Regular CTR: byte-swap each little endian counter with BSWAP_MASK,
          // then XOR with round key 0.
218 +     vpshufb BSWAP_MASK, LE_CTR, AESDATA\i0
219 +     _vpxor RNDKEY0, AESDATA\i0, AESDATA\i0
220 +     vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
221 +     vpshufb BSWAP_MASK, AESDATA\i1, AESDATA\i1
222 +     _vpxor RNDKEY0, AESDATA\i1, AESDATA\i1
223 + .endif
224 + .if !\final
225 +     vpaddq LE_CTR_INC2, LE_CTR, LE_CTR       // skip past both vectors
226 + .endif
227 + .endm
228 +
229 + // Do all AES rounds on the data in the given AESDATA vectors, excluding the
230 + // zero-th and last rounds.
      // \vecs is a comma-separated list of AESDATA indices.  Round keys are read
      // from KEY (inclusive) up to RNDKEYLAST_PTR (exclusive), 16 bytes at a
      // time; _aes_ctr_crypt has already advanced KEY to round key 1.
      // Clobbers %rax and RNDKEY.
231 + .macro _aesenc_loop vecs:vararg
232 +     mov KEY, %rax
233 + 1:
234 +     _vbroadcast128 (%rax), RNDKEY          // same round key in every lane
235 + .irp i, \vecs
236 +     vaesenc RNDKEY, AESDATA\i, AESDATA\i
237 + .endr
238 +     add $16, %rax
239 +     cmp %rax, RNDKEYLAST_PTR
240 +     jne 1b
241 + .endm
242 +
243 + // Finalize the keystream blocks in the given AESDATA vectors by doing the last
244 + // AES round, then XOR those keystream blocks with the corresponding data.
245 + // Reduce latency by doing the XOR before the vaesenclast, utilizing the
246 + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
      // Vector \i is combined with the VL bytes at \i*VL(SRC) and the result is
      // stored to \i*VL(DST).  All stores are issued after all loads.
      // Clobbers RNDKEY.
247 + .macro _aesenclast_and_xor vecs:vararg
248 + .irp i, \vecs
249 +     _vpxor \i*VL(SRC), RNDKEYLAST, RNDKEY  // last round key ^ source data
250 +     vaesenclast RNDKEY, AESDATA\i, AESDATA\i
251 + .endr
252 + .irp i, \vecs
253 +     _vmovdqu AESDATA\i, \i*VL(DST)
254 + .endr
255 + .endm
256 +
257 + // XOR the keystream blocks in the specified AESDATA vectors with the
258 + // corresponding data. 
      // Vector \i is XORed with the VL bytes at \i*VL(SRC) and stored to
      // \i*VL(DST).  Only full vectors are handled here.
259 + .macro _xor_data vecs:vararg
260 + .irp i, \vecs
261 +     _vpxor \i*VL(SRC), AESDATA\i, AESDATA\i
262 + .endr
263 + .irp i, \vecs
264 +     _vmovdqu AESDATA\i, \i*VL(DST)
265 + .endr
266 + .endm
267 +
      // Body shared by all aes_ctr64_crypt_* and aes_xctr_crypt_* functions.
      // \is_xctr selects XCTR (1) or regular CTR (0).  The symbols VL and
      // USE_AVX10 must have been .set before the macro is expanded.
      // Outline: build the first counter vector and the increments, load round
      // keys 0 and last, run the 8-vector main loop, then handle a tail of
      // 1 <= LEN < 8*VL bytes, the last partial vector byte-exactly.
268 + .macro _aes_ctr_crypt is_xctr
269 +
270 +     // Define register aliases V0-V15 that map to the xmm, ymm, or zmm
271 +     // registers according to the selected Vector Length (VL).
272 + .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
273 +   .if VL == 16
274 +     .set V\i, %xmm\i
275 +   .elseif VL == 32
276 +     .set V\i, %ymm\i
277 +   .elseif VL == 64
278 +     .set V\i, %zmm\i
279 +   .else
280 +     .error "Unsupported Vector Length (VL)"
281 +   .endif
282 + .endr
283 +
284 +     // Function arguments
285 +     .set KEY, %rdi        // Initially points to the start of the
286 +                           // crypto_aes_ctx, then is advanced to
287 +                           // point to the index 1 round key
288 +     .set KEY32, %edi      // Available as temp register after all
289 +                           // keystream blocks have been generated
290 +     .set SRC, %rsi        // Pointer to next source data
291 +     .set DST, %rdx        // Pointer to next destination data
292 +     .set LEN, %ecx        // Remaining length in bytes.
293 +                           // Note: _load_partial_block relies on
294 +                           // this being in %ecx.
295 +     .set LEN64, %rcx      // Zero-extend LEN before using!
296 +     .set LEN8, %cl        // (not referenced anywhere in this file)
297 + .if \is_xctr
298 +     .set XCTR_IV_PTR, %r8 // const u8 iv[AES_BLOCK_SIZE];
299 +     .set XCTR_CTR, %r9    // u64 ctr;
300 + .else
301 +     .set LE_CTR_PTR, %r8  // const u64 le_ctr[2];
302 + .endif
303 +
304 +     // Additional local variables
305 +     .set RNDKEYLAST_PTR, %r10
306 +     .set AESDATA0, V0
307 +     .set AESDATA0_XMM, %xmm0
308 +     .set AESDATA1, V1
309 +     .set AESDATA1_XMM, %xmm1
310 +     .set AESDATA2, V2
311 +     .set AESDATA3, V3
312 +     .set AESDATA4, V4
313 +     .set AESDATA5, V5
314 +     .set AESDATA6, V6
315 +     .set AESDATA7, V7
316 + .if \is_xctr
317 +     .set XCTR_IV, V8      // XCTR and CTR never need both, so they share V8
318 + .else
319 +     .set BSWAP_MASK, V8
320 + .endif
321 +     .set LE_CTR, V9       // counters of the next vector of blocks
322 +     .set LE_CTR_XMM, %xmm9
323 +     .set LE_CTR_INC1, V10 // blocks per vector (VL/16) in every lane
324 +     .set LE_CTR_INC2, V11 // twice LE_CTR_INC1
325 +     .set RNDKEY0, V12
326 +     .set RNDKEYLAST, V13
327 +     .set RNDKEY, V14      // scratch: current round key / last key ^ data
328 +
329 +     // Create the first vector of counters.
330 + .if \is_xctr
331 +   .if VL == 16
332 +     vmovq XCTR_CTR, LE_CTR                 // {ctr, 0}
333 +   .elseif VL == 32
          // Low lane = {ctr, 0}, high lane = {ctr + 1, 0}.  AESDATA0_XMM is
          // only a temporary here.
334 +     vmovq XCTR_CTR, LE_CTR_XMM
335 +     inc XCTR_CTR
336 +     vmovq XCTR_CTR, AESDATA0_XMM
337 +     vinserti128 $1, AESDATA0_XMM, LE_CTR, LE_CTR
338 +   .else
          // Put ctr in both qwords of every lane, shift each lane right by 8
          // bytes to get {ctr, 0}, then add 0, 1, 2, 3 to the four lanes.
339 +     vpbroadcastq XCTR_CTR, LE_CTR
340 +     vpsrldq $8, LE_CTR, LE_CTR
341 +     vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR
342 +   .endif
343 +     _vbroadcast128 (XCTR_IV_PTR), XCTR_IV
344 + .else
345 +     _vbroadcast128 (LE_CTR_PTR), LE_CTR
346 +   .if VL > 16
347 +     vpaddq .Lctr_pattern(%rip), LE_CTR, LE_CTR   // lane n gets le_ctr + n
348 +   .endif
349 +     _vbroadcast128 .Lbswap_mask(%rip), BSWAP_MASK
350 + .endif
351 +
          // LE_CTR_INC1 = number of AES blocks in one vector (1, 2, or 4).
352 + .if VL == 16
353 +     _vbroadcast128 .Lone(%rip), LE_CTR_INC1
354 + .elseif VL == 32
355 +     _vbroadcast128 .Ltwo(%rip), LE_CTR_INC1
356 + .else
357 +     _vbroadcast128 .Lfour(%rip), LE_CTR_INC1
358 + .endif
359 +     vpsllq $1, LE_CTR_INC1, LE_CTR_INC2          // INC2 = 2 * INC1
360 +
361 +     // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
          // NOTE(review): 480 is presumably the offset of the key length field
          // in struct crypto_aes_ctx -- confirm against its definition.
362 +     movl 480(KEY), %eax
363 +
364 +     // Compute the pointer to the last round key.
          // KEY + 4*keylen + 6*16 = KEY + 16*(keylen/4 + 6), i.e. round key
          // index 10, 12, or 14 for key lengths 16, 24, or 32.
365 +     lea 6*16(KEY, %rax, 4), RNDKEYLAST_PTR
366 +
367 +     // Load the zero-th and last round keys.
368 +     _vbroadcast128 (KEY), RNDKEY0
369 +     _vbroadcast128 (RNDKEYLAST_PTR), RNDKEYLAST
370 +
371 +     // Make KEY point to the first round key.
372 +     add $16, KEY
373 +
374 +     // This is the main loop, which encrypts 8 vectors of data at a time.
          // LEN is kept biased by -8*VL while in the loop, so its sign says
          // whether another full group of 8 vectors remains.
          // NOTE(review): "sub $-N" / "add $-N" instead of "add $N" / "sub $N"
          // is presumably chosen so that N == 128 still fits a signed 8-bit
          // immediate -- confirm.
375 +     add $-8*VL, LEN
376 +     jl .Lloop_8x_done\@
377 + .Lloop_8x\@:
378 +     _prepare_2_ctr_vecs \is_xctr, 0, 1
379 +     _prepare_2_ctr_vecs \is_xctr, 2, 3
380 +     _prepare_2_ctr_vecs \is_xctr, 4, 5
381 +     _prepare_2_ctr_vecs \is_xctr, 6, 7
382 +     _aesenc_loop 0,1,2,3,4,5,6,7
383 +     _aesenclast_and_xor 0,1,2,3,4,5,6,7
384 +     sub $-8*VL, SRC
385 +     sub $-8*VL, DST
386 +     add $-8*VL, LEN
387 +     jge .Lloop_8x\@
388 + .Lloop_8x_done\@:
389 +     sub $-8*VL, LEN                        // undo the bias
390 +     jz .Ldone\@
391 +
392 +     // 1 <= LEN < 8*VL. Generate 2, 4, or 8 more vectors of keystream
393 +     // blocks, depending on the remaining LEN.
394 +
395 +     _prepare_2_ctr_vecs \is_xctr, 0, 1
396 +     _prepare_2_ctr_vecs \is_xctr, 2, 3
397 +     cmp $4*VL, LEN
398 +     jle .Lenc_tail_atmost4vecs\@
399 +
400 +     // 4*VL < LEN < 8*VL. Generate 8 vectors of keystream blocks. Use the
401 +     // first 4 to XOR 4 full vectors of data. Then XOR the remaining data.
402 +     _prepare_2_ctr_vecs \is_xctr, 4, 5
403 +     _prepare_2_ctr_vecs \is_xctr, 6, 7, final=1
404 +     _aesenc_loop 0,1,2,3,4,5,6,7
405 +     _aesenclast_and_xor 0,1,2,3
          // Finish keystream vectors 4-7 directly into AESDATA0-3, so the code
          // below can address the leftover keystream as vectors 0-3.
406 +     vaesenclast RNDKEYLAST, AESDATA4, AESDATA0
407 +     vaesenclast RNDKEYLAST, AESDATA5, AESDATA1
408 +     vaesenclast RNDKEYLAST, AESDATA6, AESDATA2
409 +     vaesenclast RNDKEYLAST, AESDATA7, AESDATA3
410 +     sub $-4*VL, SRC
411 +     sub $-4*VL, DST
412 +     add $-4*VL, LEN
          // Now 1 <= LEN < 4*VL.  XOR full vectors one at a time until the
          // vector holding the end of the data is reached.
413 +     cmp $1*VL-1, LEN
414 +     jle .Lxor_tail_partial_vec_0\@
415 +     _xor_data 0
416 +     cmp $2*VL-1, LEN
417 +     jle .Lxor_tail_partial_vec_1\@
418 +     _xor_data 1
419 +     cmp $3*VL-1, LEN
420 +     jle .Lxor_tail_partial_vec_2\@
421 +     _xor_data 2
          // NOTE(review): since LEN < 4*VL here, this jle appears to be always
          // taken, which would make the _xor_data 3 / jmp below unreachable --
          // confirm.
422 +     cmp $4*VL-1, LEN
423 +     jle .Lxor_tail_partial_vec_3\@
424 +     _xor_data 3
425 +     jmp .Ldone\@
426 +
427 + .Lenc_tail_atmost4vecs\@:
428 +     cmp $2*VL, LEN
429 +     jle .Lenc_tail_atmost2vecs\@
430 +
431 +     // 2*VL < LEN <= 4*VL. Generate 4 vectors of keystream blocks. Use the
432 +     // first 2 to XOR 2 full vectors of data. Then XOR the remaining data.
433 +     _aesenc_loop 0,1,2,3
434 +     _aesenclast_and_xor 0,1
435 +     vaesenclast RNDKEYLAST, AESDATA2, AESDATA0   // leftover keystream
436 +     vaesenclast RNDKEYLAST, AESDATA3, AESDATA1   // becomes vectors 0-1
437 +     sub $-2*VL, SRC
438 +     sub $-2*VL, DST
439 +     add $-2*VL, LEN
440 +     jmp .Lxor_tail_upto2vecs\@
441 +
442 + .Lenc_tail_atmost2vecs\@:
443 +     // 1 <= LEN <= 2*VL. Generate 2 vectors of keystream blocks. Then XOR
444 +     // the remaining data.
445 +     _aesenc_loop 0,1
446 +     vaesenclast RNDKEYLAST, AESDATA0, AESDATA0
447 +     vaesenclast RNDKEYLAST, AESDATA1, AESDATA1
448 +
449 + .Lxor_tail_upto2vecs\@:
          // 1 <= LEN <= 2*VL, keystream in AESDATA0 and AESDATA1.
450 +     cmp $1*VL-1, LEN
451 +     jle .Lxor_tail_partial_vec_0\@
452 +     _xor_data 0
453 +     cmp $2*VL-1, LEN
454 +     jle .Lxor_tail_partial_vec_1\@
455 +     _xor_data 1                            // LEN == 2*VL exactly
456 +     jmp .Ldone\@
457 +
          // .Lxor_tail_partial_vec_N: N full vectors have just been XORed.
          // Drop them from LEN; if nothing is left we are done.  Otherwise
          // advance SRC/DST past them and move keystream vector N into
          // AESDATA0 for the common partial-vector code.
458 + .Lxor_tail_partial_vec_1\@:
459 +     add $-1*VL, LEN
460 +     jz .Ldone\@
461 +     sub $-1*VL, SRC
462 +     sub $-1*VL, DST
463 +     _vmovdqa AESDATA1, AESDATA0
464 +     jmp .Lxor_tail_partial_vec_0\@
465 +
466 + .Lxor_tail_partial_vec_2\@:
467 +     add $-2*VL, LEN
468 +     jz .Ldone\@
469 +     sub $-2*VL, SRC
470 +     sub $-2*VL, DST
471 +     _vmovdqa AESDATA2, AESDATA0
472 +     jmp .Lxor_tail_partial_vec_0\@
473 +
474 + .Lxor_tail_partial_vec_3\@:
475 +     add $-3*VL, LEN
476 +     jz .Ldone\@
477 +     sub $-3*VL, SRC
478 +     sub $-3*VL, DST
479 +     _vmovdqa AESDATA3, AESDATA0
480 +
481 + .Lxor_tail_partial_vec_0\@:
482 +     // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
483 +     // loads/stores are available; otherwise it's a bit harder...
484 + .if USE_AVX10
          // Build a byte mask with the low LEN bits set, then do a zeroing
          // masked load, a full-width XOR, and a masked store.
485 +   .if VL <= 32
486 +     mov $-1, %eax
487 +     bzhi LEN, %eax, %eax
488 +     kmovd %eax, %k1
489 +   .else
          // LEN was last written by 32-bit arithmetic on %ecx, which leaves
          // the upper half of %rcx zero, so LEN64 is usable here.
490 +     mov $-1, %rax
491 +     bzhi LEN64, %rax, %rax
492 +     kmovq %rax, %k1
493 +   .endif
494 +     vmovdqu8 (SRC), AESDATA1{%k1}{z}
495 +     _vpxor AESDATA1, AESDATA0, AESDATA0
496 +     vmovdqu8 AESDATA0, (DST){%k1}
497 + .else
498 +   .if VL == 32
          // If at least 16 bytes remain, handle the low 128-bit half as a
          // full block, then continue with the high half of the keystream.
499 +     cmp $16, LEN
500 +     jl 1f
501 +     vpxor (SRC), AESDATA0_XMM, AESDATA1_XMM
502 +     vmovdqu AESDATA1_XMM, (DST)
503 +     add $16, SRC
504 +     add $16, DST
505 +     sub $16, LEN
506 +     jz .Ldone\@
507 +     vextracti128 $1, AESDATA0, AESDATA0_XMM
508 + 1:
509 +   .endif
          // 1 <= LEN <= 15.  _load_partial_block clobbers %rcx, so LEN is
          // saved in %r10d, whose use as RNDKEYLAST_PTR is over by now.
          // KEY/KEY32 serve as the scratch register of both helpers.
510 +     mov LEN, %r10d
511 +     _load_partial_block SRC, AESDATA1_XMM, KEY, KEY32
512 +     vpxor AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
513 +     mov %r10d, %ecx
514 +     _store_partial_block AESDATA0_XMM, DST, KEY, KEY32
515 + .endif
516 +
517 + .Ldone\@:
518 + .if VL > 16                // the VL == 16 variants use only xmm registers
519 +     vzeroupper
520 + .endif
521 +     RET
522 + .endm
523 +
524 + // Below are the definitions of the functions generated by the above macro.
525 + // They have the following prototypes:
526 + //
527 + //
528 + // void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
529 + //                               const u8 *src, u8 *dst, int len,
530 + //                               const u64 le_ctr[2]);
531 + //
532 + // void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
533 + //                              const u8 *src, u8 *dst, int len,
534 + //                              const u8 iv[AES_BLOCK_SIZE], u64 ctr);
535 + //
536 + // Both functions generate |len| bytes of keystream, XOR it with the data from
537 + // |src|, and write the result to |dst|. On non-final calls, |len| must be a
538 + // multiple of 16. On the final call, |len| can be any value.
539 + //
540 + // aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
541 + // from a 128-bit big endian counter that increments by 1 for each AES block.
542 + // HOWEVER, to keep the assembly code simple, some of the counter management is
543 + // left to the caller. 
aes_ctr64_crypt_* take the counter in little endian
544 + // form, only increment the low 64 bits internally, do the conversion to big
545 + // endian internally, and don't write the updated counter back to memory. The
546 + // caller is responsible for converting the starting IV to the little endian
547 + // le_ctr, detecting the (very rare) case of a carry out of the low 64 bits
548 + // being needed and splitting at that point with a carry done in between, and
549 + // updating le_ctr after each part if the message is multi-part.
550 + //
551 + // aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
552 + // with HCTR2" (https://eprint.iacr.org/2021/1441.pdf). XCTR is an
553 + // easier-to-implement variant of CTR that uses little endian byte order and
554 + // eliminates carries. |ctr| is the per-message block counter starting at 1.
      //
      // Each variant below sets VL (bytes per vector) and USE_AVX10 (whether
      // masked loads/stores and vpternlogd may be used), then expands
      // _aes_ctr_crypt once for CTR (argument 0) and once for XCTR (argument 1).
555 +
      // AES-NI && AVX: xmm registers.
556 + .set VL, 16
557 + .set USE_AVX10, 0
558 + SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
559 +     _aes_ctr_crypt 0
560 + SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
561 + SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
562 +     _aes_ctr_crypt 1
563 + SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
564 +
      // The remaining variants are only assembled when the assembler supports
      // the newer instructions.
      // NOTE(review): no VPCLMULQDQ instruction appears in this file; the guard
      // presumably mirrors a condition used elsewhere in the build -- confirm.
565 + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
      // VAES && AVX2: ymm registers, no masking.
566 + .set VL, 32
567 + .set USE_AVX10, 0
568 + SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
569 +     _aes_ctr_crypt 0
570 + SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
571 + SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
572 +     _aes_ctr_crypt 1
573 + SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
574 +
      // ymm registers with masking and vpternlogd.
575 + .set VL, 32
576 + .set USE_AVX10, 1
577 + SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
578 +     _aes_ctr_crypt 0
579 + SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
580 + SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
581 +     _aes_ctr_crypt 1
582 + SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)
583 +
      // zmm registers with masking and vpternlogd.
584 + .set VL, 64
585 + .set USE_AVX10, 1
586 + SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
587 +     _aes_ctr_crypt 0
588 +
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
589 + SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
590 +     _aes_ctr_crypt 1
591 + SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
592 + #endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ

-597

arch/x86/crypto/aes_ctrby8_avx-x86_64.S

··· 1 - /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
2 - /*
3 - * AES CTR mode by8 optimization with AVX instructions. (x86_64)
4 - *
5 - * Copyright(c) 2014 Intel Corporation.
6 - *
7 - * Contact Information:
8 - * James Guilford <james.guilford@intel.com>
9 - * Sean Gulley <sean.m.gulley@intel.com>
10 - * Chandramouli Narayanan <mouli@linux.intel.com>
11 - */
12 - /*
13 - * This is AES128/192/256 CTR mode optimization implementation. It requires
14 - * the support of Intel(R) AESNI and AVX instructions.
15 - *
16 - * This work was inspired by the AES CTR mode optimization published
17 - * in Intel Optimized IPSEC Cryptographic library.
18 - * Additional information on it can be found at:
19 - * https://github.com/intel/intel-ipsec-mb
20 - */
21 -
22 - #include <linux/linkage.h>
23 -
     /* Unaligned 128-bit move used for all accesses to the data buffers. */
24 - #define VMOVDQ vmovdqu
25 -
26 - /*
27 - * Note: the "x" prefix in these aliases means "this is an xmm register". The
28 - * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
29 - * counter".
     * xdata0-7 hold the up to eight blocks processed per do_aes expansion.
     * xkey0/4/8/12 hold round keys that stay loaded between expansions, while
     * xkeyA/xkeyB are reloaded with the other round keys as needed and also
     * serve as temporaries for the source data.
30 - */
31 - #define xdata0 %xmm0
32 - #define xdata1 %xmm1
33 - #define xdata2 %xmm2
34 - #define xdata3 %xmm3
35 - #define xdata4 %xmm4
36 - #define xdata5 %xmm5
37 - #define xdata6 %xmm6
38 - #define xdata7 %xmm7
39 - #define xcounter %xmm8 // CTR mode only
40 - #define xiv %xmm8 // XCTR mode only
41 - #define xbyteswap %xmm9 // CTR mode only
42 - #define xtmp %xmm9 // XCTR mode only
43 - #define xkey0 %xmm10
44 - #define xkey4 %xmm11
45 - #define xkey8 %xmm12
46 - #define xkey12 %xmm13
47 - #define xkeyA %xmm14
48 - #define xkeyB %xmm15
49 -
     /*
      * Argument registers, in the order of the prototypes given in the
      * comments above each function at the end of the file.
      * NOTE(review): those prototypes declare num_bytes and byte_ctr as
      * unsigned int, but the code uses the full 64-bit %r8 and %r9 --
      * confirm that callers guarantee zeroed upper halves.
      */
50 - #define p_in %rdi
51 - #define p_iv %rsi
52 - #define p_keys %rdx
53 - #define p_out %rcx
54 - #define num_bytes %r8
55 - #define counter %r9 // XCTR mode only
56 - #define tmp %r10
     /* Selector values for the club macro; DDQ_DATA is not referenced elsewhere in this file. */
57 - #define DDQ_DATA 0
58 - #define XDATA 1
     /* Values for the key_len macro parameter of do_aes / do_aes_ctrmain. */
59 - #define KEY_128 1
60 - #define KEY_192 2
61 - #define KEY_256 3
62 -
63 - .section .rodata
64 - .align 16
65 -
     /* vpshufb control that reverses the 16 bytes of a block. */
66 - byteswap_const:
67 -     .octa 0x000102030405060708090A0B0C0D0E0F
     /* Selects the low 64 bits; used with vptest to detect a low quadword of 0. */
68 - ddq_low_msk:
69 -     .octa 0x0000000000000000FFFFFFFFFFFFFFFF
     /* Adds 1 to the high 64 bits (the carry out of the low quadword). */
70 - ddq_high_add_1:
71 -     .octa 0x00000000000000010000000000000000
     /*
      * ddq_add_1 .. ddq_add_8 are contiguous, so do_aes addresses entry n as
      * ddq_add_1 + 16 * (n - 1).
      */
72 - ddq_add_1:
73 -     .octa 0x00000000000000000000000000000001
74 - ddq_add_2:
75 -     .octa 0x00000000000000000000000000000002
76 - ddq_add_3:
77 -     .octa 0x00000000000000000000000000000003
78 - ddq_add_4:
79 -     .octa 0x00000000000000000000000000000004
80 - ddq_add_5:
81 -     .octa 0x00000000000000000000000000000005
82 - ddq_add_6:
83 -     .octa 0x00000000000000000000000000000006
84 - ddq_add_7:
85 -     .octa 0x00000000000000000000000000000007
86 - ddq_add_8:
87 -     .octa 0x00000000000000000000000000000008
88 -
89 - .text
90 -
     /* NOTE(review): no macro follows the next comment in this file. */
91 - /* generate a unique variable for ddq_add_x */
92 -
93 - /* generate a unique variable for xmm register */
     /* Points the assembler symbol var_xdata at register %xmm<n>. */
94 - .macro setxdata n
95 -     var_xdata = %xmm\n
96 - .endm
97 -
98 - /* club the numeric 'id' to the symbol 'name' */
99 -
     /*
      * With \name == XDATA, makes var_xdata refer to %xmm<id>.  The macro
      * switches to .altmacro mode so that %\id passes the numeric value of
      * the expression (e.g. the current value of the symbol i) to setxdata.
      */
100 - .macro club name, id
101 - .altmacro
102 -     .if \name == XDATA
103 -         setxdata %\id
104 -     .endif
105 - .noaltmacro
106 - .endm
107 -
108 - /*
109 
- * do_aes num_in_par load_keys key_len
110 - * This increments p_in, but not p_out
      *
      * Parameters as declared below: b = number of blocks handled by this
      * expansion (1..8 in this file), k = 1 to (re)load the resident round
      * keys xkey0/4/8/12, 0 to assume they are already loaded,
      * key_len = KEY_128/KEY_192/KEY_256, xctr = 1 for XCTR, 0 for CTR.
      * The resident keys are round keys 0, 3, 6, 9 for KEY_128 and
      * 0, 4, 8, 12 otherwise; all other round keys are loaded through
      * xkeyA/xkeyB just before they are needed.
111 - */
112 - .macro do_aes b, k, key_len, xctr
113 -     .set by, \b
114 -     .set load_keys, \k
115 -     .set klen, \key_len
116 -
117 -     .if (load_keys)
118 -         vmovdqa 0*16(p_keys), xkey0
119 -     .endif
120 -
          /* Build the counter blocks for this expansion in xdata0..xdata(by-1). */
121 -     .if \xctr
              /* XCTR: block i is iv ^ (counter + i + 1); no byte swap, no carry. */
122 -         movq counter, xtmp
123 -         .set i, 0
124 -         .rept (by)
125 -             club XDATA, i
126 -             vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
127 -             .set i, (i +1)
128 -         .endr
129 -         .set i, 0
130 -         .rept (by)
131 -             club XDATA, i
132 -             vpxor xiv, var_xdata, var_xdata
133 -         .set i, (i +1)
134 -         .endr
135 -     .else
              /*
               * CTR: xcounter is kept byte-swapped so vpaddq can increment it.
               * Block i is xcounter + i; when the low quadword of a sum is 0,
               * a carry is added to the high quadword of both that block and
               * xcounter.
               */
136 -         vpshufb xbyteswap, xcounter, xdata0
137 -         .set i, 1
138 -         .rept (by - 1)
139 -             club XDATA, i
140 -             vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
141 -             vptest ddq_low_msk(%rip), var_xdata
142 -             jnz 1f
143 -             vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
144 -             vpaddq ddq_high_add_1(%rip), xcounter, xcounter
145 -             1:
146 -             vpshufb xbyteswap, var_xdata, var_xdata
147 -             .set i, (i +1)
148 -         .endr
149 -     .endif
150 -
151 -     vmovdqa 1*16(p_keys), xkeyA
152 -
          /* Round 0 (key whitening) for block 0; the other blocks follow below. */
153 -     vpxor xkey0, xdata0, xdata0
          /* Advance the running counter past the blocks of this expansion. */
154 -     .if \xctr
155 -         add $by, counter
156 -     .else
157 -         vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
158 -         vptest ddq_low_msk(%rip), xcounter
159 -         jnz 1f
160 -         vpaddq ddq_high_add_1(%rip), xcounter, xcounter
161 -         1:
162 -     .endif
163 -
164 -     .set i, 1
165 -     .rept (by - 1)
166 -         club XDATA, i
167 -         vpxor xkey0, var_xdata, var_xdata
168 -         .set i, (i +1)
169 -     .endr
170 -
171 -     vmovdqa 2*16(p_keys), xkeyB
172 -
173 -     .set i, 0
174 -     .rept by
175 -         club XDATA, i
176 -         vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
177 -         .set i, (i +1)
178 -     .endr
179 -
180 -     .if (klen == KEY_128)
181 -         .if (load_keys)
182 -             vmovdqa 3*16(p_keys), xkey4
183 -         .endif
184 -     .else
185 -         vmovdqa 3*16(p_keys), xkeyA
186 -     .endif
187 -
188 -     .set i, 0
189 -     .rept by
190 -         club XDATA, i
191 -         vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
192 -         .set i, (i +1)
193 -     .endr
194 -
195 
- add $(16*by), p_in
196 -
197 -     .if (klen == KEY_128)
198 -         vmovdqa 4*16(p_keys), xkeyB
199 -     .else
200 -         .if (load_keys)
201 -             vmovdqa 4*16(p_keys), xkey4
202 -         .endif
203 -     .endif
204 -
205 -     .set i, 0
206 -     .rept by
207 -         club XDATA, i
208 -         /* key 3 */
209 -         .if (klen == KEY_128)
210 -             vaesenc xkey4, var_xdata, var_xdata
211 -         .else
212 -             vaesenc xkeyA, var_xdata, var_xdata
213 -         .endif
214 -         .set i, (i +1)
215 -     .endr
216 -
217 -     vmovdqa 5*16(p_keys), xkeyA
218 -
219 -     .set i, 0
220 -     .rept by
221 -         club XDATA, i
222 -         /* key 4 */
223 -         .if (klen == KEY_128)
224 -             vaesenc xkeyB, var_xdata, var_xdata
225 -         .else
226 -             vaesenc xkey4, var_xdata, var_xdata
227 -         .endif
228 -         .set i, (i +1)
229 -     .endr
230 -
231 -     .if (klen == KEY_128)
232 -         .if (load_keys)
233 -             vmovdqa 6*16(p_keys), xkey8
234 -         .endif
235 -     .else
236 -         vmovdqa 6*16(p_keys), xkeyB
237 -     .endif
238 -
239 -     .set i, 0
240 -     .rept by
241 -         club XDATA, i
242 -         vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
243 -         .set i, (i +1)
244 -     .endr
245 -
246 -     vmovdqa 7*16(p_keys), xkeyA
247 -
248 -     .set i, 0
249 -     .rept by
250 -         club XDATA, i
251 -         /* key 6 */
252 -         .if (klen == KEY_128)
253 -             vaesenc xkey8, var_xdata, var_xdata
254 -         .else
255 -             vaesenc xkeyB, var_xdata, var_xdata
256 -         .endif
257 -         .set i, (i +1)
258 -     .endr
259 -
260 -     .if (klen == KEY_128)
261 -         vmovdqa 8*16(p_keys), xkeyB
262 -     .else
263 -         .if (load_keys)
264 -             vmovdqa 8*16(p_keys), xkey8
265 -         .endif
266 -     .endif
267 -
268 -     .set i, 0
269 -     .rept by
270 -         club XDATA, i
271 -         vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
272 -         .set i, (i +1)
273 -     .endr
274 -
275 -     .if (klen == KEY_128)
276 -         .if (load_keys)
277 -             vmovdqa 9*16(p_keys), xkey12
278 -         .endif
279 -     .else
280 -         vmovdqa 9*16(p_keys), xkeyA
281 -     .endif
282 -
283 -     .set i, 0
284 -     .rept by
285 -         club XDATA, i
286 -         /* key 8 */
287 -         .if (klen == KEY_128)
288 -             vaesenc xkeyB, var_xdata, var_xdata
289 -         .else
290 -             vaesenc xkey8, var_xdata, var_xdata
291 -         .endif
292 -         .set i,
(i +1)
293 -     .endr
294 -
295 -     vmovdqa 10*16(p_keys), xkeyB
296 -
297 -     .set i, 0
298 -     .rept by
299 -         club XDATA, i
300 -         /* key 9 */
301 -         .if (klen == KEY_128)
302 -             vaesenc xkey12, var_xdata, var_xdata
303 -         .else
304 -             vaesenc xkeyA, var_xdata, var_xdata
305 -         .endif
306 -         .set i, (i +1)
307 -     .endr
308 -
309 -     .if (klen != KEY_128)
310 -         vmovdqa 11*16(p_keys), xkeyA
311 -     .endif
312 -
313 -     .set i, 0
314 -     .rept by
315 -         club XDATA, i
316 -         /* key 10 */
              /* Final round for KEY_128, an ordinary round otherwise. */
317 -         .if (klen == KEY_128)
318 -             vaesenclast xkeyB, var_xdata, var_xdata
319 -         .else
320 -             vaesenc xkeyB, var_xdata, var_xdata
321 -         .endif
322 -         .set i, (i +1)
323 -     .endr
324 -
325 -     .if (klen != KEY_128)
326 -         .if (load_keys)
327 -             vmovdqa 12*16(p_keys), xkey12
328 -         .endif
329 -
330 -         .set i, 0
331 -         .rept by
332 -             club XDATA, i
333 -             vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
334 -             .set i, (i +1)
335 -         .endr
336 -
337 -         .if (klen == KEY_256)
338 -             vmovdqa 13*16(p_keys), xkeyA
339 -         .endif
340 -
341 -         .set i, 0
342 -         .rept by
343 -             club XDATA, i
                  /* Round 12: an ordinary round for KEY_256, the final round for KEY_192. */
344 -             .if (klen == KEY_256)
345 -                 /* key 12 */
346 -                 vaesenc xkey12, var_xdata, var_xdata
347 -             .else
348 -                 vaesenclast xkey12, var_xdata, var_xdata
349 -             .endif
350 -             .set i, (i +1)
351 -         .endr
352 -
353 -         .if (klen == KEY_256)
354 -             vmovdqa 14*16(p_keys), xkeyB
355 -
356 -             .set i, 0
357 -             .rept by
358 -                 club XDATA, i
359 -                 /* key 13 */
360 -                 vaesenc xkeyA, var_xdata, var_xdata
361 -                 .set i, (i +1)
362 -             .endr
363 -
364 -             .set i, 0
365 -             .rept by
366 -                 club XDATA, i
367 -                 /* key 14 */
368 -                 vaesenclast xkeyB, var_xdata, var_xdata
369 -                 .set i, (i +1)
370 -             .endr
371 -         .endif
372 -     .endif
373 -
          /*
           * XOR the keystream with the source data, two blocks per step.  p_in
           * was already advanced by 16*by above, hence the negative offsets.
           * xkeyA/xkeyB are free to use as temporaries here.
           */
374 -     .set i, 0
375 -     .rept (by / 2)
376 -         .set j, (i+1)
377 -         VMOVDQ (i*16 - 16*by)(p_in), xkeyA
378 -         VMOVDQ (j*16 - 16*by)(p_in), xkeyB
379 -         club XDATA, i
380 -         vpxor xkeyA, var_xdata, var_xdata
381 -         club XDATA, j
382 -         vpxor xkeyB, var_xdata, var_xdata
383 -         .set i, (i+2)
384 -     .endr
385 -
          /* Odd block count: one block is left over. */
386 -     .if (i < by)
387 -         VMOVDQ (i*16 - 16*by)(p_in), xkeyA
388 -         club
XDATA, i
389 -         vpxor xkeyA, var_xdata, var_xdata
390 -     .endif
391 -
          /* Write the results; p_out is advanced by the caller of the macro. */
392 -     .set i, 0
393 -     .rept by
394 -         club XDATA, i
395 -         VMOVDQ var_xdata, i*16(p_out)
396 -         .set i, (i+1)
397 -     .endr
398 - .endm
399 -
      /* do_aes with the resident round keys (re)loaded from p_keys. */
400 - .macro do_aes_load val, key_len, xctr
401 -     do_aes \val, 1, \key_len, \xctr
402 - .endm
403 -
      /* do_aes assuming xkey0/4/8/12 are already loaded. */
404 - .macro do_aes_noload val, key_len, xctr
405 -     do_aes \val, 0, \key_len, \xctr
406 - .endm
407 -
408 - /* main body of aes ctr load */
409 -
      /*
       * Complete function body.  Processes (num_bytes / 16) mod 8 blocks first
       * with one do_aes_load expansion, then the rest in groups of 8 blocks
       * with do_aes_noload.  Only whole 16-byte blocks are processed.
       * Labels are made unique by appending \xctr and \key_len.
       */
410 - .macro do_aes_ctrmain key_len, xctr
          /*
           * NOTE(review): in CTR mode this early exit reaches .Ldo_return2
           * before xbyteswap and xcounter have been loaded, yet that label
           * shuffles and stores xcounter to *p_iv -- presumably callers never
           * pass num_bytes < 16; confirm.
           */
411 -     cmp $16, num_bytes
412 -     jb .Ldo_return2\xctr\key_len
413 -
414 -     .if \xctr
415 -         shr $4, counter                /* byte counter -> block counter */
416 -         vmovdqu (p_iv), xiv
417 -     .else
418 -         vmovdqa byteswap_const(%rip), xbyteswap
419 -         vmovdqu (p_iv), xcounter
420 -         vpshufb xbyteswap, xcounter, xcounter
421 -     .endif
422 -
423 -     mov num_bytes, tmp
424 -     and $(7*16), tmp                   /* 16 * (number of blocks mod 8) */
425 -     jz .Lmult_of_8_blks\xctr\key_len
426 -
          /* The next comment counts in blocks: tmp itself is 16..112 bytes. */
427 -     /* 1 <= tmp <= 7 */
428 -     cmp $(4*16), tmp
429 -     jg .Lgt4\xctr\key_len
430 -     je .Leq4\xctr\key_len
431 -
432 - .Llt4\xctr\key_len:
433 -     cmp $(2*16), tmp
434 -     jg .Leq3\xctr\key_len
435 -     je .Leq2\xctr\key_len
436 -
          /*
           * Each .LeqN case handles N leading blocks, advances p_out, and
           * rounds num_bytes down to a multiple of 8 blocks (128 bytes).
           */
437 - .Leq1\xctr\key_len:
438 -     do_aes_load 1, \key_len, \xctr
439 -     add $(1*16), p_out
440 -     and $(~7*16), num_bytes
441 -     jz .Ldo_return2\xctr\key_len
442 -     jmp .Lmain_loop2\xctr\key_len
443 -
444 - .Leq2\xctr\key_len:
445 -     do_aes_load 2, \key_len, \xctr
446 -     add $(2*16), p_out
447 -     and $(~7*16), num_bytes
448 -     jz .Ldo_return2\xctr\key_len
449 -     jmp .Lmain_loop2\xctr\key_len
450 -
451 -
452 - .Leq3\xctr\key_len:
453 -     do_aes_load 3, \key_len, \xctr
454 -     add $(3*16), p_out
455 -     and $(~7*16), num_bytes
456 -     jz .Ldo_return2\xctr\key_len
457 -     jmp .Lmain_loop2\xctr\key_len
458 -
459 - .Leq4\xctr\key_len:
460 -     do_aes_load 4, \key_len, \xctr
461 -     add $(4*16), p_out
462 -     and $(~7*16), num_bytes
463 -     jz .Ldo_return2\xctr\key_len
464 -     jmp .Lmain_loop2\xctr\key_len
465 -
466 - .Lgt4\xctr\key_len:
467 -     cmp $(6*16), tmp
468 -     jg .Leq7\xctr\key_len
469 -     je .Leq6\xctr\key_len
470 -
471 -
.Leq5\xctr\key_len:
472 -     do_aes_load 5, \key_len, \xctr
473 -     add $(5*16), p_out
474 -     and $(~7*16), num_bytes
475 -     jz .Ldo_return2\xctr\key_len
476 -     jmp .Lmain_loop2\xctr\key_len
477 -
478 - .Leq6\xctr\key_len:
479 -     do_aes_load 6, \key_len, \xctr
480 -     add $(6*16), p_out
481 -     and $(~7*16), num_bytes
482 -     jz .Ldo_return2\xctr\key_len
483 -     jmp .Lmain_loop2\xctr\key_len
484 -
485 - .Leq7\xctr\key_len:
486 -     do_aes_load 7, \key_len, \xctr
487 -     add $(7*16), p_out
488 -     and $(~7*16), num_bytes
489 -     jz .Ldo_return2\xctr\key_len
490 -     jmp .Lmain_loop2\xctr\key_len
491 -
          /*
           * No leading blocks: the resident round keys have not been loaded by
           * a do_aes_load expansion, so load them here.
           * NOTE(review): unlike the .LeqN cases, this path does not clear the
           * low 4 bits of num_bytes, so the loop below reaches 0 only if
           * num_bytes is a multiple of 16 -- presumably guaranteed by callers;
           * confirm.
           */
492 - .Lmult_of_8_blks\xctr\key_len:
493 -     .if (\key_len != KEY_128)
494 -         vmovdqa 0*16(p_keys), xkey0
495 -         vmovdqa 4*16(p_keys), xkey4
496 -         vmovdqa 8*16(p_keys), xkey8
497 -         vmovdqa 12*16(p_keys), xkey12
498 -     .else
499 -         vmovdqa 0*16(p_keys), xkey0
500 -         vmovdqa 3*16(p_keys), xkey4
501 -         vmovdqa 6*16(p_keys), xkey8
502 -         vmovdqa 9*16(p_keys), xkey12
503 -     .endif
504 - .align 16
505 - .Lmain_loop2\xctr\key_len:
          /* i.e. a positive multiple of 8 blocks (128 bytes) */
506 -     /* num_bytes is a multiple of 8 and >0 */
507 -     do_aes_noload 8, \key_len, \xctr
508 -     add $(8*16), p_out
509 -     sub $(8*16), num_bytes
510 -     jne .Lmain_loop2\xctr\key_len
511 -
512 - .Ldo_return2\xctr\key_len:
513 -     .if !\xctr
514 -         /* return updated IV */
515 -         vpshufb xbyteswap, xcounter, xcounter
516 -         vmovdqu xcounter, (p_iv)
517 -     .endif
518 -     RET
519 - .endm
520 -
521 - /*
522 - * routine to do AES128 CTR enc/decrypt "by8"
523 - * XMM registers are clobbered.
524 - * Saving/restoring must be done at a higher level
525 - * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
526 - * unsigned int num_bytes)
      * The updated counter is written back through iv (CTR functions only).
527 - */
528 - SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
529 -     /* call the aes main loop */
530 -     do_aes_ctrmain KEY_128 0
531 -
532 - SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
533 -
534 - /*
535 - * routine to do AES192 CTR enc/decrypt "by8"
536 - * XMM registers are clobbered. 
537 - * Saving/restoring must be done at a higher level 538 - * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, 539 - * unsigned int num_bytes) 540 - */ 541 - SYM_FUNC_START(aes_ctr_enc_192_avx_by8) 542 - /* call the aes main loop */ 543 - do_aes_ctrmain KEY_192 0 544 - 545 - SYM_FUNC_END(aes_ctr_enc_192_avx_by8) 546 - 547 - /* 548 - * routine to do AES256 CTR enc/decrypt "by8" 549 - * XMM registers are clobbered. 550 - * Saving/restoring must be done at a higher level 551 - * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, 552 - * unsigned int num_bytes) 553 - */ 554 - SYM_FUNC_START(aes_ctr_enc_256_avx_by8) 555 - /* call the aes main loop */ 556 - do_aes_ctrmain KEY_256 0 557 - 558 - SYM_FUNC_END(aes_ctr_enc_256_avx_by8) 559 - 560 - /* 561 - * routine to do AES128 XCTR enc/decrypt "by8" 562 - * XMM registers are clobbered. 563 - * Saving/restoring must be done at a higher level 564 - * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, 565 - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) 566 - */ 567 - SYM_FUNC_START(aes_xctr_enc_128_avx_by8) 568 - /* call the aes main loop */ 569 - do_aes_ctrmain KEY_128 1 570 - 571 - SYM_FUNC_END(aes_xctr_enc_128_avx_by8) 572 - 573 - /* 574 - * routine to do AES192 XCTR enc/decrypt "by8" 575 - * XMM registers are clobbered. 576 - * Saving/restoring must be done at a higher level 577 - * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, 578 - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) 579 - */ 580 - SYM_FUNC_START(aes_xctr_enc_192_avx_by8) 581 - /* call the aes main loop */ 582 - do_aes_ctrmain KEY_192 1 583 - 584 - SYM_FUNC_END(aes_xctr_enc_192_avx_by8) 585 - 586 - /* 587 - * routine to do AES256 XCTR enc/decrypt "by8" 588 - * XMM registers are clobbered. 
589 - * Saving/restoring must be done at a higher level 590 - * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, 591 - * u8* out, unsigned int num_bytes, unsigned int byte_ctr) 592 - */ 593 - SYM_FUNC_START(aes_xctr_enc_256_avx_by8) 594 - /* call the aes main loop */ 595 - do_aes_ctrmain KEY_256 1 596 - 597 - SYM_FUNC_END(aes_xctr_enc_256_avx_by8)

+210 -194

arch/x86/crypto/aesni-intel_glue.c

··· 23 23 #include <linux/err.h> 24 24 #include <crypto/algapi.h> 25 25 #include <crypto/aes.h> 26 - #include <crypto/ctr.h> 27 26 #include <crypto/b128ops.h> 28 27 #include <crypto/gcm.h> 29 28 #include <crypto/xts.h> ··· 81 82 const u8 *in, unsigned int len, u8 *iv); 82 83 83 84 #ifdef CONFIG_X86_64 84 - 85 85 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 86 86 const u8 *in, unsigned int len, u8 *iv); 87 - DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); 88 - 89 - asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, 90 - void *keys, u8 *out, unsigned int num_bytes); 91 - asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, 92 - void *keys, u8 *out, unsigned int num_bytes); 93 - asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, 94 - void *keys, u8 *out, unsigned int num_bytes); 95 - 96 - 97 - asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, 98 - const void *keys, u8 *out, unsigned int num_bytes, 99 - unsigned int byte_ctr); 100 - 101 - asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, 102 - const void *keys, u8 *out, unsigned int num_bytes, 103 - unsigned int byte_ctr); 104 - 105 - asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, 106 - const void *keys, u8 *out, unsigned int num_bytes, 107 - unsigned int byte_ctr); 108 87 #endif 109 88 110 89 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) ··· 353 376 } 354 377 355 378 #ifdef CONFIG_X86_64 356 - static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, 357 - const u8 *in, unsigned int len, u8 *iv) 358 - { 359 - /* 360 - * based on key length, override with the by8 version 361 - * of ctr mode encryption/decryption for improved performance 362 - * aes_set_key_common() ensures that key length is one of 363 - * {128,192,256} 364 - */ 365 - if (ctx->key_length == AES_KEYSIZE_128) 366 - aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len); 367 - else if (ctx->key_length == 
AES_KEYSIZE_192) 368 - aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len); 369 - else 370 - aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len); 371 - } 372 - 373 - static int ctr_crypt(struct skcipher_request *req) 379 + /* This is the non-AVX version. */ 380 + static int ctr_crypt_aesni(struct skcipher_request *req) 374 381 { 375 382 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 376 383 struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); ··· 368 407 while ((nbytes = walk.nbytes) > 0) { 369 408 kernel_fpu_begin(); 370 409 if (nbytes & AES_BLOCK_MASK) 371 - static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr, 372 - walk.src.virt.addr, 373 - nbytes & AES_BLOCK_MASK, 374 - walk.iv); 410 + aesni_ctr_enc(ctx, walk.dst.virt.addr, 411 + walk.src.virt.addr, 412 + nbytes & AES_BLOCK_MASK, walk.iv); 375 413 nbytes &= ~AES_BLOCK_MASK; 376 414 377 415 if (walk.nbytes == walk.total && nbytes > 0) { ··· 379 419 walk.src.virt.addr + walk.nbytes - nbytes, 380 420 keystream, nbytes); 381 421 crypto_inc(walk.iv, AES_BLOCK_SIZE); 382 - nbytes = 0; 383 - } 384 - kernel_fpu_end(); 385 - err = skcipher_walk_done(&walk, nbytes); 386 - } 387 - return err; 388 - } 389 - 390 - static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, 391 - const u8 *in, unsigned int len, u8 *iv, 392 - unsigned int byte_ctr) 393 - { 394 - if (ctx->key_length == AES_KEYSIZE_128) 395 - aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, 396 - byte_ctr); 397 - else if (ctx->key_length == AES_KEYSIZE_192) 398 - aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, 399 - byte_ctr); 400 - else 401 - aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, 402 - byte_ctr); 403 - } 404 - 405 - static int xctr_crypt(struct skcipher_request *req) 406 - { 407 - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 408 - struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); 409 - u8 keystream[AES_BLOCK_SIZE]; 410 - struct skcipher_walk walk; 411 - 
unsigned int nbytes; 412 - unsigned int byte_ctr = 0; 413 - int err; 414 - __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; 415 - 416 - err = skcipher_walk_virt(&walk, req, false); 417 - 418 - while ((nbytes = walk.nbytes) > 0) { 419 - kernel_fpu_begin(); 420 - if (nbytes & AES_BLOCK_MASK) 421 - aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, 422 - walk.src.virt.addr, nbytes & AES_BLOCK_MASK, 423 - walk.iv, byte_ctr); 424 - nbytes &= ~AES_BLOCK_MASK; 425 - byte_ctr += walk.nbytes - nbytes; 426 - 427 - if (walk.nbytes == walk.total && nbytes > 0) { 428 - memcpy(block, walk.iv, AES_BLOCK_SIZE); 429 - block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); 430 - aesni_enc(ctx, keystream, (u8 *)block); 431 - crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - 432 - nbytes, walk.src.virt.addr + walk.nbytes 433 - - nbytes, keystream, nbytes); 434 - byte_ctr += nbytes; 435 422 nbytes = 0; 436 423 } 437 424 kernel_fpu_end(); ··· 628 721 .ivsize = AES_BLOCK_SIZE, 629 722 .chunksize = AES_BLOCK_SIZE, 630 723 .setkey = aesni_skcipher_setkey, 631 - .encrypt = ctr_crypt, 632 - .decrypt = ctr_crypt, 724 + .encrypt = ctr_crypt_aesni, 725 + .decrypt = ctr_crypt_aesni, 633 726 #endif 634 727 }, { 635 728 .base = { ··· 655 748 struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; 656 749 657 750 #ifdef CONFIG_X86_64 658 - /* 659 - * XCTR does not have a non-AVX implementation, so it must be enabled 660 - * conditionally. 
661 - */ 662 - static struct skcipher_alg aesni_xctr = { 663 - .base = { 664 - .cra_name = "__xctr(aes)", 665 - .cra_driver_name = "__xctr-aes-aesni", 666 - .cra_priority = 400, 667 - .cra_flags = CRYPTO_ALG_INTERNAL, 668 - .cra_blocksize = 1, 669 - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, 670 - .cra_module = THIS_MODULE, 671 - }, 672 - .min_keysize = AES_MIN_KEY_SIZE, 673 - .max_keysize = AES_MAX_KEY_SIZE, 674 - .ivsize = AES_BLOCK_SIZE, 675 - .chunksize = AES_BLOCK_SIZE, 676 - .setkey = aesni_skcipher_setkey, 677 - .encrypt = xctr_crypt, 678 - .decrypt = xctr_crypt, 679 - }; 680 - 681 - static struct simd_skcipher_alg *aesni_simd_xctr; 682 - 683 751 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, 684 752 u8 iv[AES_BLOCK_SIZE]); 685 753 686 - #define DEFINE_XTS_ALG(suffix, driver_name, priority) \ 754 + /* __always_inline to avoid indirect call */ 755 + static __always_inline int 756 + ctr_crypt(struct skcipher_request *req, 757 + void (*ctr64_func)(const struct crypto_aes_ctx *key, 758 + const u8 *src, u8 *dst, int len, 759 + const u64 le_ctr[2])) 760 + { 761 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 762 + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); 763 + unsigned int nbytes, p1_nbytes, nblocks; 764 + struct skcipher_walk walk; 765 + u64 le_ctr[2]; 766 + u64 ctr64; 767 + int err; 768 + 769 + ctr64 = le_ctr[0] = get_unaligned_be64(&req->iv[8]); 770 + le_ctr[1] = get_unaligned_be64(&req->iv[0]); 771 + 772 + err = skcipher_walk_virt(&walk, req, false); 773 + 774 + while ((nbytes = walk.nbytes) != 0) { 775 + if (nbytes < walk.total) { 776 + /* Not the end yet, so keep the length block-aligned. */ 777 + nbytes = round_down(nbytes, AES_BLOCK_SIZE); 778 + nblocks = nbytes / AES_BLOCK_SIZE; 779 + } else { 780 + /* It's the end, so include any final partial block. 
*/ 781 + nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); 782 + } 783 + ctr64 += nblocks; 784 + 785 + kernel_fpu_begin(); 786 + if (likely(ctr64 >= nblocks)) { 787 + /* The low 64 bits of the counter won't overflow. */ 788 + (*ctr64_func)(key, walk.src.virt.addr, 789 + walk.dst.virt.addr, nbytes, le_ctr); 790 + } else { 791 + /* 792 + * The low 64 bits of the counter will overflow. The 793 + * assembly doesn't handle this case, so split the 794 + * operation into two at the point where the overflow 795 + * will occur. After the first part, add the carry bit. 796 + */ 797 + p1_nbytes = min_t(unsigned int, nbytes, 798 + (nblocks - ctr64) * AES_BLOCK_SIZE); 799 + (*ctr64_func)(key, walk.src.virt.addr, 800 + walk.dst.virt.addr, p1_nbytes, le_ctr); 801 + le_ctr[0] = 0; 802 + le_ctr[1]++; 803 + (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes, 804 + walk.dst.virt.addr + p1_nbytes, 805 + nbytes - p1_nbytes, le_ctr); 806 + } 807 + kernel_fpu_end(); 808 + le_ctr[0] = ctr64; 809 + 810 + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); 811 + } 812 + 813 + put_unaligned_be64(ctr64, &req->iv[8]); 814 + put_unaligned_be64(le_ctr[1], &req->iv[0]); 815 + 816 + return err; 817 + } 818 + 819 + /* __always_inline to avoid indirect call */ 820 + static __always_inline int 821 + xctr_crypt(struct skcipher_request *req, 822 + void (*xctr_func)(const struct crypto_aes_ctx *key, 823 + const u8 *src, u8 *dst, int len, 824 + const u8 iv[AES_BLOCK_SIZE], u64 ctr)) 825 + { 826 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 827 + const struct crypto_aes_ctx *key = aes_ctx(crypto_skcipher_ctx(tfm)); 828 + struct skcipher_walk walk; 829 + unsigned int nbytes; 830 + u64 ctr = 1; 831 + int err; 832 + 833 + err = skcipher_walk_virt(&walk, req, false); 834 + while ((nbytes = walk.nbytes) != 0) { 835 + if (nbytes < walk.total) 836 + nbytes = round_down(nbytes, AES_BLOCK_SIZE); 837 + 838 + kernel_fpu_begin(); 839 + (*xctr_func)(key, walk.src.virt.addr, walk.dst.virt.addr, 840 + 
nbytes, req->iv, ctr); 841 + kernel_fpu_end(); 842 + 843 + ctr += DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); 844 + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); 845 + } 846 + return err; 847 + } 848 + 849 + #define DEFINE_AVX_SKCIPHER_ALGS(suffix, driver_name_suffix, priority) \ 687 850 \ 688 851 asmlinkage void \ 689 852 aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ ··· 772 795 return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \ 773 796 } \ 774 797 \ 775 - static struct skcipher_alg aes_xts_alg_##suffix = { \ 776 - .base = { \ 777 - .cra_name = "__xts(aes)", \ 778 - .cra_driver_name = "__" driver_name, \ 779 - .cra_priority = priority, \ 780 - .cra_flags = CRYPTO_ALG_INTERNAL, \ 781 - .cra_blocksize = AES_BLOCK_SIZE, \ 782 - .cra_ctxsize = XTS_AES_CTX_SIZE, \ 783 - .cra_module = THIS_MODULE, \ 784 - }, \ 785 - .min_keysize = 2 * AES_MIN_KEY_SIZE, \ 786 - .max_keysize = 2 * AES_MAX_KEY_SIZE, \ 787 - .ivsize = AES_BLOCK_SIZE, \ 788 - .walksize = 2 * AES_BLOCK_SIZE, \ 789 - .setkey = xts_setkey_aesni, \ 790 - .encrypt = xts_encrypt_##suffix, \ 791 - .decrypt = xts_decrypt_##suffix, \ 792 - }; \ 798 + asmlinkage void \ 799 + aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key, \ 800 + const u8 *src, u8 *dst, int len, const u64 le_ctr[2]);\ 793 801 \ 794 - static struct simd_skcipher_alg *aes_xts_simdalg_##suffix 802 + static int ctr_crypt_##suffix(struct skcipher_request *req) \ 803 + { \ 804 + return ctr_crypt(req, aes_ctr64_crypt_##suffix); \ 805 + } \ 806 + \ 807 + asmlinkage void \ 808 + aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key, \ 809 + const u8 *src, u8 *dst, int len, \ 810 + const u8 iv[AES_BLOCK_SIZE], u64 ctr); \ 811 + \ 812 + static int xctr_crypt_##suffix(struct skcipher_request *req) \ 813 + { \ 814 + return xctr_crypt(req, aes_xctr_crypt_##suffix); \ 815 + } \ 816 + \ 817 + static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ 818 + .base.cra_name = "__xts(aes)", \ 819 + 
.base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ 820 + .base.cra_priority = priority, \ 821 + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ 822 + .base.cra_blocksize = AES_BLOCK_SIZE, \ 823 + .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ 824 + .base.cra_module = THIS_MODULE, \ 825 + .min_keysize = 2 * AES_MIN_KEY_SIZE, \ 826 + .max_keysize = 2 * AES_MAX_KEY_SIZE, \ 827 + .ivsize = AES_BLOCK_SIZE, \ 828 + .walksize = 2 * AES_BLOCK_SIZE, \ 829 + .setkey = xts_setkey_aesni, \ 830 + .encrypt = xts_encrypt_##suffix, \ 831 + .decrypt = xts_decrypt_##suffix, \ 832 + }, { \ 833 + .base.cra_name = "__ctr(aes)", \ 834 + .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ 835 + .base.cra_priority = priority, \ 836 + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ 837 + .base.cra_blocksize = 1, \ 838 + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ 839 + .base.cra_module = THIS_MODULE, \ 840 + .min_keysize = AES_MIN_KEY_SIZE, \ 841 + .max_keysize = AES_MAX_KEY_SIZE, \ 842 + .ivsize = AES_BLOCK_SIZE, \ 843 + .chunksize = AES_BLOCK_SIZE, \ 844 + .setkey = aesni_skcipher_setkey, \ 845 + .encrypt = ctr_crypt_##suffix, \ 846 + .decrypt = ctr_crypt_##suffix, \ 847 + }, { \ 848 + .base.cra_name = "__xctr(aes)", \ 849 + .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ 850 + .base.cra_priority = priority, \ 851 + .base.cra_flags = CRYPTO_ALG_INTERNAL, \ 852 + .base.cra_blocksize = 1, \ 853 + .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ 854 + .base.cra_module = THIS_MODULE, \ 855 + .min_keysize = AES_MIN_KEY_SIZE, \ 856 + .max_keysize = AES_MAX_KEY_SIZE, \ 857 + .ivsize = AES_BLOCK_SIZE, \ 858 + .chunksize = AES_BLOCK_SIZE, \ 859 + .setkey = aesni_skcipher_setkey, \ 860 + .encrypt = xctr_crypt_##suffix, \ 861 + .decrypt = xctr_crypt_##suffix, \ 862 + }}; \ 863 + \ 864 + static struct simd_skcipher_alg * \ 865 + simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] 795 866 796 - DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); 867 + DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, 
"aesni-avx", 500); 797 868 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 798 - DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); 799 - DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); 800 - DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); 869 + DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); 870 + DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); 871 + DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); 801 872 #endif 802 873 803 874 /* The common part of the x86_64 AES-GCM key struct */ ··· 1577 1552 1578 1553 if (!boot_cpu_has(X86_FEATURE_AVX)) 1579 1554 return 0; 1580 - err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, 1581 - &aes_xts_simdalg_aesni_avx); 1555 + err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, 1556 + ARRAY_SIZE(skcipher_algs_aesni_avx), 1557 + simd_skcipher_algs_aesni_avx); 1582 1558 if (err) 1583 1559 return err; 1584 1560 err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, ··· 1587 1561 aes_gcm_simdalgs_aesni_avx); 1588 1562 if (err) 1589 1563 return err; 1564 + /* 1565 + * Note: not all the algorithms registered below actually require 1566 + * VPCLMULQDQ. But in practice every CPU with VAES also has VPCLMULQDQ. 1567 + * Similarly, the assembler support was added at about the same time. 1568 + * For simplicity, just always check for VAES and VPCLMULQDQ together. 
1569 + */ 1590 1570 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 1591 1571 if (!boot_cpu_has(X86_FEATURE_AVX2) || 1592 1572 !boot_cpu_has(X86_FEATURE_VAES) || ··· 1600 1568 !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || 1601 1569 !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) 1602 1570 return 0; 1603 - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, 1604 - &aes_xts_simdalg_vaes_avx2); 1571 + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, 1572 + ARRAY_SIZE(skcipher_algs_vaes_avx2), 1573 + simd_skcipher_algs_vaes_avx2); 1605 1574 if (err) 1606 1575 return err; 1607 1576 ··· 1613 1580 XFEATURE_MASK_AVX512, NULL)) 1614 1581 return 0; 1615 1582 1616 - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, 1617 - &aes_xts_simdalg_vaes_avx10_256); 1583 + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, 1584 + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), 1585 + simd_skcipher_algs_vaes_avx10_256); 1618 1586 if (err) 1619 1587 return err; 1620 1588 err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, ··· 1627 1593 if (x86_match_cpu(zmm_exclusion_list)) { 1628 1594 int i; 1629 1595 1630 - aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; 1596 + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) 1597 + skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; 1631 1598 for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) 1632 1599 aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; 1633 1600 } 1634 1601 1635 - err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, 1636 - &aes_xts_simdalg_vaes_avx10_512); 1602 + err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, 1603 + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), 1604 + simd_skcipher_algs_vaes_avx10_512); 1637 1605 if (err) 1638 1606 return err; 1639 1607 err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, ··· 1649 1613 1650 1614 static void unregister_avx_algs(void) 1651 1615 
{ 1652 - if (aes_xts_simdalg_aesni_avx) 1653 - simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, 1654 - &aes_xts_simdalg_aesni_avx); 1616 + if (simd_skcipher_algs_aesni_avx[0]) 1617 + simd_unregister_skciphers(skcipher_algs_aesni_avx, 1618 + ARRAY_SIZE(skcipher_algs_aesni_avx), 1619 + simd_skcipher_algs_aesni_avx); 1655 1620 if (aes_gcm_simdalgs_aesni_avx[0]) 1656 1621 simd_unregister_aeads(aes_gcm_algs_aesni_avx, 1657 1622 ARRAY_SIZE(aes_gcm_algs_aesni_avx), 1658 1623 aes_gcm_simdalgs_aesni_avx); 1659 1624 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) 1660 - if (aes_xts_simdalg_vaes_avx2) 1661 - simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, 1662 - &aes_xts_simdalg_vaes_avx2); 1663 - if (aes_xts_simdalg_vaes_avx10_256) 1664 - simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, 1665 - &aes_xts_simdalg_vaes_avx10_256); 1625 + if (simd_skcipher_algs_vaes_avx2[0]) 1626 + simd_unregister_skciphers(skcipher_algs_vaes_avx2, 1627 + ARRAY_SIZE(skcipher_algs_vaes_avx2), 1628 + simd_skcipher_algs_vaes_avx2); 1629 + if (simd_skcipher_algs_vaes_avx10_256[0]) 1630 + simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, 1631 + ARRAY_SIZE(skcipher_algs_vaes_avx10_256), 1632 + simd_skcipher_algs_vaes_avx10_256); 1666 1633 if (aes_gcm_simdalgs_vaes_avx10_256[0]) 1667 1634 simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, 1668 1635 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), 1669 1636 aes_gcm_simdalgs_vaes_avx10_256); 1670 - if (aes_xts_simdalg_vaes_avx10_512) 1671 - simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, 1672 - &aes_xts_simdalg_vaes_avx10_512); 1637 + if (simd_skcipher_algs_vaes_avx10_512[0]) 1638 + simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, 1639 + ARRAY_SIZE(skcipher_algs_vaes_avx10_512), 1640 + simd_skcipher_algs_vaes_avx10_512); 1673 1641 if (aes_gcm_simdalgs_vaes_avx10_512[0]) 1674 1642 simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, 1675 1643 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), ··· 1706 1666 1707 1667 if 
(!x86_match_cpu(aesni_cpu_id)) 1708 1668 return -ENODEV; 1709 - #ifdef CONFIG_X86_64 1710 - if (boot_cpu_has(X86_FEATURE_AVX)) { 1711 - /* optimize performance of ctr mode encryption transform */ 1712 - static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); 1713 - pr_info("AES CTR mode by8 optimization enabled\n"); 1714 - } 1715 - #endif /* CONFIG_X86_64 */ 1716 1669 1717 1670 err = crypto_register_alg(&aesni_cipher_alg); 1718 1671 if (err) ··· 1723 1690 if (err) 1724 1691 goto unregister_skciphers; 1725 1692 1726 - #ifdef CONFIG_X86_64 1727 - if (boot_cpu_has(X86_FEATURE_AVX)) 1728 - err = simd_register_skciphers_compat(&aesni_xctr, 1, 1729 - &aesni_simd_xctr); 1730 - if (err) 1731 - goto unregister_aeads; 1732 - #endif /* CONFIG_X86_64 */ 1733 - 1734 1693 err = register_avx_algs(); 1735 1694 if (err) 1736 1695 goto unregister_avx; ··· 1731 1706 1732 1707 unregister_avx: 1733 1708 unregister_avx_algs(); 1734 - #ifdef CONFIG_X86_64 1735 - if (aesni_simd_xctr) 1736 - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); 1737 - unregister_aeads: 1738 - #endif /* CONFIG_X86_64 */ 1739 1709 simd_unregister_aeads(aes_gcm_algs_aesni, 1740 1710 ARRAY_SIZE(aes_gcm_algs_aesni), 1741 1711 aes_gcm_simdalgs_aesni); ··· 1750 1730 simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), 1751 1731 aesni_simd_skciphers); 1752 1732 crypto_unregister_alg(&aesni_cipher_alg); 1753 - #ifdef CONFIG_X86_64 1754 - if (boot_cpu_has(X86_FEATURE_AVX)) 1755 - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); 1756 - #endif /* CONFIG_X86_64 */ 1757 1733 unregister_avx_algs(); 1758 1734 } 1759 1735

Configure Feed

Configure Feed