Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

crypto: x86/aegis128 - optimize partial block handling using SSE4.1

Optimize the code that loads and stores partial blocks, taking advantage
of SSE4.1. The code is adapted from that in aes-gcm-aesni-x86_64.S.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Eric Biggers and committed by Herbert Xu
933e8974 8da94b30

+87 -133
+87 -133
arch/x86/crypto/aegis128-aesni-asm.S
··· 4 4 * 5 5 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> 6 6 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 7 + * Copyright 2024 Google LLC 7 8 */ 8 9 9 10 #include <linux/linkage.h> ··· 29 28 .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 30 29 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd 31 30 32 - .section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 33 - .align 16 34 - .Laegis128_counter: 35 - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 36 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f 31 + .section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 32 + .align 32 33 + .Lzeropad_mask: 34 + .octa 0xffffffffffffffffffffffffffffffff 35 + .octa 0 37 36 38 37 .text 39 38 ··· 56 55 .endm 57 56 58 57 /* 59 - * __load_partial: internal ABI 60 - * input: 61 - * LEN - bytes 62 - * SRC - src 63 - * output: 64 - * MSG - message block 65 - * changed: 66 - * T0 67 - * %r8 68 - * %r9 58 + * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register 59 + * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. 
69 60 */ 70 - SYM_FUNC_START_LOCAL(__load_partial) 71 - .set LEN, %ecx 72 - .set SRC, %rsi 73 - xor %r9d, %r9d 74 - pxor MSG, MSG 61 + .macro load_partial 62 + sub $8, %ecx /* LEN - 8 */ 63 + jle .Lle8\@ 75 64 76 - mov LEN, %r8d 77 - and $0x1, %r8 78 - jz .Lld_partial_1 65 + /* Load 9 <= LEN <= 15 bytes: */ 66 + movq (SRC), MSG /* Load first 8 bytes */ 67 + mov (SRC, %rcx), %rax /* Load last 8 bytes */ 68 + neg %ecx 69 + shl $3, %ecx 70 + shr %cl, %rax /* Discard overlapping bytes */ 71 + pinsrq $1, %rax, MSG 72 + jmp .Ldone\@ 79 73 80 - mov LEN, %r8d 81 - and $0x1E, %r8 82 - add SRC, %r8 83 - mov (%r8), %r9b 74 + .Lle8\@: 75 + add $4, %ecx /* LEN - 4 */ 76 + jl .Llt4\@ 84 77 85 - .Lld_partial_1: 86 - mov LEN, %r8d 87 - and $0x2, %r8 88 - jz .Lld_partial_2 78 + /* Load 4 <= LEN <= 8 bytes: */ 79 + mov (SRC), %eax /* Load first 4 bytes */ 80 + mov (SRC, %rcx), %r8d /* Load last 4 bytes */ 81 + jmp .Lcombine\@ 89 82 90 - mov LEN, %r8d 91 - and $0x1C, %r8 92 - add SRC, %r8 93 - shl $0x10, %r9 94 - mov (%r8), %r9w 95 - 96 - .Lld_partial_2: 97 - mov LEN, %r8d 98 - and $0x4, %r8 99 - jz .Lld_partial_4 100 - 101 - mov LEN, %r8d 102 - and $0x18, %r8 103 - add SRC, %r8 104 - shl $32, %r9 105 - mov (%r8), %r8d 106 - xor %r8, %r9 107 - 108 - .Lld_partial_4: 109 - movq %r9, MSG 110 - 111 - mov LEN, %r8d 112 - and $0x8, %r8 113 - jz .Lld_partial_8 114 - 115 - mov LEN, %r8d 116 - and $0x10, %r8 117 - add SRC, %r8 118 - pslldq $8, MSG 119 - movq (%r8), T0 120 - pxor T0, MSG 121 - 122 - .Lld_partial_8: 123 - RET 124 - SYM_FUNC_END(__load_partial) 83 + .Llt4\@: 84 + /* Load 1 <= LEN <= 3 bytes: */ 85 + add $2, %ecx /* LEN - 2 */ 86 + movzbl (SRC), %eax /* Load first byte */ 87 + jl .Lmovq\@ 88 + movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ 89 + .Lcombine\@: 90 + shl $3, %ecx 91 + shl %cl, %r8 92 + or %r8, %rax /* Combine the two parts */ 93 + .Lmovq\@: 94 + movq %rax, MSG 95 + .Ldone\@: 96 + .endm 125 97 126 98 /* 127 - * __store_partial: internal ABI 128 - * input: 129 - * LEN 
- bytes 130 - * DST - dst 131 - * output: 132 - * T0 - message block 133 - * changed: 134 - * %r8 135 - * %r9 136 - * %r10 99 + * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer 100 + * DST. Clobbers %rax, %rcx, and %r8. 137 101 */ 138 - SYM_FUNC_START_LOCAL(__store_partial) 139 - .set LEN, %ecx 140 - .set DST, %rdx 141 - mov LEN, %r8d 142 - mov DST, %r9 102 + .macro store_partial msg 103 + sub $8, %ecx /* LEN - 8 */ 104 + jl .Llt8\@ 143 105 144 - movq T0, %r10 106 + /* Store 8 <= LEN <= 15 bytes: */ 107 + pextrq $1, \msg, %rax 108 + mov %ecx, %r8d 109 + shl $3, %ecx 110 + ror %cl, %rax 111 + mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ 112 + movq \msg, (DST) /* Store first 8 bytes */ 113 + jmp .Ldone\@ 145 114 146 - cmp $8, %r8 147 - jl .Lst_partial_8 115 + .Llt8\@: 116 + add $4, %ecx /* LEN - 4 */ 117 + jl .Llt4\@ 148 118 149 - mov %r10, (%r9) 150 - psrldq $8, T0 151 - movq T0, %r10 119 + /* Store 4 <= LEN <= 7 bytes: */ 120 + pextrd $1, \msg, %eax 121 + mov %ecx, %r8d 122 + shl $3, %ecx 123 + ror %cl, %eax 124 + mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ 125 + movd \msg, (DST) /* Store first 4 bytes */ 126 + jmp .Ldone\@ 152 127 153 - sub $8, %r8 154 - add $8, %r9 155 - 156 - .Lst_partial_8: 157 - cmp $4, %r8 158 - jl .Lst_partial_4 159 - 160 - mov %r10d, (%r9) 161 - shr $32, %r10 162 - 163 - sub $4, %r8 164 - add $4, %r9 165 - 166 - .Lst_partial_4: 167 - cmp $2, %r8 168 - jl .Lst_partial_2 169 - 170 - mov %r10w, (%r9) 171 - shr $0x10, %r10 172 - 173 - sub $2, %r8 174 - add $2, %r9 175 - 176 - .Lst_partial_2: 177 - cmp $1, %r8 178 - jl .Lst_partial_1 179 - 180 - mov %r10b, (%r9) 181 - 182 - .Lst_partial_1: 183 - RET 184 - SYM_FUNC_END(__store_partial) 128 + .Llt4\@: 129 + /* Store 1 <= LEN <= 3 bytes: */ 130 + pextrb $0, \msg, 0(DST) 131 + cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? 
*/ 132 + jl .Ldone\@ 133 + pextrb $1, \msg, 1(DST) 134 + je .Ldone\@ 135 + pextrb $2, \msg, 2(DST) 136 + .Ldone\@: 137 + .endm 185 138 186 139 /* 187 140 * void aegis128_aesni_init(struct aegis_state *state, ··· 408 453 .set STATEP, %rdi 409 454 .set SRC, %rsi 410 455 .set DST, %rdx 411 - .set LEN, %ecx 456 + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ 412 457 FRAME_BEGIN 413 458 414 459 /* load the state: */ ··· 419 464 movdqu 0x40(STATEP), STATE4 420 465 421 466 /* encrypt message: */ 422 - call __load_partial 467 + mov LEN, %r9d 468 + load_partial 423 469 424 470 movdqa MSG, T0 425 471 pxor STATE1, T0 ··· 429 473 pand STATE3, T1 430 474 pxor T1, T0 431 475 432 - call __store_partial 476 + mov %r9d, LEN 477 + store_partial T0 433 478 434 479 aegis128_update 435 480 pxor MSG, STATE4 ··· 555 598 .set STATEP, %rdi 556 599 .set SRC, %rsi 557 600 .set DST, %rdx 558 - .set LEN, %ecx 601 + .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ 559 602 FRAME_BEGIN 560 603 561 604 /* load the state: */ ··· 566 609 movdqu 0x40(STATEP), STATE4 567 610 568 611 /* decrypt message: */ 569 - call __load_partial 612 + mov LEN, %r9d 613 + load_partial 570 614 571 615 pxor STATE1, MSG 572 616 pxor STATE4, MSG ··· 575 617 pand STATE3, T1 576 618 pxor T1, MSG 577 619 578 - movdqa MSG, T0 579 - call __store_partial 620 + mov %r9d, LEN 621 + store_partial MSG 580 622 581 623 /* mask with byte count: */ 582 - movd LEN, T0 583 - punpcklbw T0, T0 584 - punpcklbw T0, T0 585 - punpcklbw T0, T0 586 - punpcklbw T0, T0 587 - movdqa .Laegis128_counter(%rip), T1 588 - pcmpgtb T1, T0 624 + lea .Lzeropad_mask+16(%rip), %rax 625 + sub %r9, %rax 626 + movdqu (%rax), T0 589 627 pand T0, MSG 590 628 591 629 aegis128_update