Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

crypto: arm64/sm4 - refactor and simplify NEON implementation

This patch does not add new features. It refactors and simplifies the
SM4 NEON implementation, which is reflected in the following aspects:

The accelerated implementation supports an arbitrary number of blocks,
not just multiples of 8, which simplifies the implementation and speeds
up processing of data whose length is not a multiple of 8 blocks.

When loading the input data, use the ld4 instruction instead of the
original ld1 instruction wherever possible; ld4 de-interleaves the data
as it loads, which saves the cost of transposing the input matrix.

Use 8-block parallelism whenever possible, instead of at most 4-block
parallelism, to speed up the matrix transpose and rotation operations.

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Tianjia Zhang and committed by
Herbert Xu
62508017 a41b2129

+447 -337
+398 -206
arch/arm64/crypto/sm4-neon-core.S
··· 18 18 #define RTMP2 v10 19 19 #define RTMP3 v11 20 20 21 + #define RTMP4 v12 22 + #define RTMP5 v13 23 + #define RTMP6 v14 24 + #define RTMP7 v15 25 + 21 26 #define RX0 v12 22 27 #define RX1 v13 23 28 #define RKEY v14 ··· 30 25 31 26 /* Helper macros. */ 32 27 33 - #define PREPARE \ 28 + #define SM4_PREPARE() \ 34 29 adr_l x5, crypto_sm4_sbox; \ 35 30 ld1 {v16.16b-v19.16b}, [x5], #64; \ 36 31 ld1 {v20.16b-v23.16b}, [x5], #64; \ ··· 47 42 zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ 48 43 zip2 s3.2d, RTMP2.2d, RTMP3.2d; 49 44 50 - #define rotate_clockwise_90(s0, s1, s2, s3) \ 45 + #define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \ 46 + zip1 RTMP0.4s, s0.4s, s1.4s; \ 47 + zip1 RTMP1.4s, s2.4s, s3.4s; \ 48 + zip2 RTMP2.4s, s0.4s, s1.4s; \ 49 + zip2 RTMP3.4s, s2.4s, s3.4s; \ 50 + zip1 RTMP4.4s, s4.4s, s5.4s; \ 51 + zip1 RTMP5.4s, s6.4s, s7.4s; \ 52 + zip2 RTMP6.4s, s4.4s, s5.4s; \ 53 + zip2 RTMP7.4s, s6.4s, s7.4s; \ 54 + zip1 s0.2d, RTMP0.2d, RTMP1.2d; \ 55 + zip2 s1.2d, RTMP0.2d, RTMP1.2d; \ 56 + zip1 s2.2d, RTMP2.2d, RTMP3.2d; \ 57 + zip2 s3.2d, RTMP2.2d, RTMP3.2d; \ 58 + zip1 s4.2d, RTMP4.2d, RTMP5.2d; \ 59 + zip2 s5.2d, RTMP4.2d, RTMP5.2d; \ 60 + zip1 s6.2d, RTMP6.2d, RTMP7.2d; \ 61 + zip2 s7.2d, RTMP6.2d, RTMP7.2d; 62 + 63 + #define rotate_clockwise_4x4(s0, s1, s2, s3) \ 51 64 zip1 RTMP0.4s, s1.4s, s0.4s; \ 52 65 zip2 RTMP1.4s, s1.4s, s0.4s; \ 53 66 zip1 RTMP2.4s, s3.4s, s2.4s; \ ··· 74 51 zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ 75 52 zip1 s2.2d, RTMP3.2d, RTMP1.2d; \ 76 53 zip2 s3.2d, RTMP3.2d, RTMP1.2d; 54 + 55 + #define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \ 56 + zip1 RTMP0.4s, s1.4s, s0.4s; \ 57 + zip1 RTMP2.4s, s3.4s, s2.4s; \ 58 + zip2 RTMP1.4s, s1.4s, s0.4s; \ 59 + zip2 RTMP3.4s, s3.4s, s2.4s; \ 60 + zip1 RTMP4.4s, s5.4s, s4.4s; \ 61 + zip1 RTMP6.4s, s7.4s, s6.4s; \ 62 + zip2 RTMP5.4s, s5.4s, s4.4s; \ 63 + zip2 RTMP7.4s, s7.4s, s6.4s; \ 64 + zip1 s0.2d, RTMP2.2d, RTMP0.2d; \ 65 + zip2 s1.2d, RTMP2.2d, RTMP0.2d; \ 66 + zip1 s2.2d, RTMP3.2d, 
RTMP1.2d; \ 67 + zip2 s3.2d, RTMP3.2d, RTMP1.2d; \ 68 + zip1 s4.2d, RTMP6.2d, RTMP4.2d; \ 69 + zip2 s5.2d, RTMP6.2d, RTMP4.2d; \ 70 + zip1 s6.2d, RTMP7.2d, RTMP5.2d; \ 71 + zip2 s7.2d, RTMP7.2d, RTMP5.2d; 77 72 78 73 #define ROUND4(round, s0, s1, s2, s3) \ 79 74 dup RX0.4s, RKEY.s[round]; \ ··· 128 87 /* s0 ^= RTMP3 */ \ 129 88 eor s0.16b, s0.16b, RTMP3.16b; 130 89 131 - #define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ 132 - rev32 b0.16b, b0.16b; \ 133 - rev32 b1.16b, b1.16b; \ 134 - rev32 b2.16b, b2.16b; \ 135 - rev32 b3.16b, b3.16b; \ 136 - \ 137 - transpose_4x4(b0, b1, b2, b3); \ 138 - \ 90 + #define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \ 139 91 mov x6, 8; \ 140 92 4: \ 141 93 ld1 {RKEY.4s}, [x0], #16; \ ··· 141 107 \ 142 108 bne 4b; \ 143 109 \ 144 - rotate_clockwise_90(b0, b1, b2, b3); \ 145 110 rev32 b0.16b, b0.16b; \ 146 111 rev32 b1.16b, b1.16b; \ 147 112 rev32 b2.16b, b2.16b; \ 148 113 rev32 b3.16b, b3.16b; \ 149 114 \ 115 + rotate_clockwise_4x4(b0, b1, b2, b3); \ 116 + \ 150 117 /* repoint to rkey */ \ 151 118 sub x0, x0, #128; 119 + 120 + #define SM4_CRYPT_BLK4(b0, b1, b2, b3) \ 121 + rev32 b0.16b, b0.16b; \ 122 + rev32 b1.16b, b1.16b; \ 123 + rev32 b2.16b, b2.16b; \ 124 + rev32 b3.16b, b3.16b; \ 125 + SM4_CRYPT_BLK4_BE(b0, b1, b2, b3); 152 126 153 127 #define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3) \ 154 128 /* rk ^ s1 ^ s2 ^ s3 */ \ ··· 217 175 eor s0.16b, s0.16b, RTMP0.16b; \ 218 176 eor t0.16b, t0.16b, RTMP1.16b; 219 177 220 - #define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ 178 + #define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \ 221 179 rev32 b0.16b, b0.16b; \ 222 180 rev32 b1.16b, b1.16b; \ 223 181 rev32 b2.16b, b2.16b; \ ··· 226 184 rev32 b5.16b, b5.16b; \ 227 185 rev32 b6.16b, b6.16b; \ 228 186 rev32 b7.16b, b7.16b; \ 229 - \ 230 - transpose_4x4(b0, b1, b2, b3); \ 231 - transpose_4x4(b4, b5, b6, b7); \ 232 187 \ 233 188 mov x6, 8; \ 234 189 8: \ ··· 239 200 \ 240 201 bne 8b; \ 241 202 \ 242 - rotate_clockwise_90(b0, b1, b2, 
b3); \ 243 - rotate_clockwise_90(b4, b5, b6, b7); \ 244 203 rev32 b0.16b, b0.16b; \ 245 204 rev32 b1.16b, b1.16b; \ 246 205 rev32 b2.16b, b2.16b; \ ··· 251 214 /* repoint to rkey */ \ 252 215 sub x0, x0, #128; 253 216 217 + #define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \ 218 + SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \ 219 + rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7); \ 220 + 254 221 255 222 .align 3 256 - SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4) 223 + SYM_FUNC_START(sm4_neon_crypt) 257 224 /* input: 258 225 * x0: round key array, CTX 259 226 * x1: dst 260 227 * x2: src 261 - * w3: num blocks (1..4) 228 + * w3: nblocks 262 229 */ 263 - PREPARE; 230 + SM4_PREPARE() 264 231 265 - ld1 {v0.16b}, [x2], #16; 266 - mov v1.16b, v0.16b; 267 - mov v2.16b, v0.16b; 268 - mov v3.16b, v0.16b; 269 - cmp w3, #2; 270 - blt .Lblk4_load_input_done; 271 - ld1 {v1.16b}, [x2], #16; 272 - beq .Lblk4_load_input_done; 273 - ld1 {v2.16b}, [x2], #16; 274 - cmp w3, #3; 275 - beq .Lblk4_load_input_done; 276 - ld1 {v3.16b}, [x2]; 232 + .Lcrypt_loop_8x: 233 + sub w3, w3, #8 234 + tbnz w3, #31, .Lcrypt_4x 277 235 278 - .Lblk4_load_input_done: 279 - SM4_CRYPT_BLK4(v0, v1, v2, v3); 236 + ld4 {v0.4s-v3.4s}, [x2], #64 237 + ld4 {v4.4s-v7.4s}, [x2], #64 280 238 281 - st1 {v0.16b}, [x1], #16; 282 - cmp w3, #2; 283 - blt .Lblk4_store_output_done; 284 - st1 {v1.16b}, [x1], #16; 285 - beq .Lblk4_store_output_done; 286 - st1 {v2.16b}, [x1], #16; 287 - cmp w3, #3; 288 - beq .Lblk4_store_output_done; 289 - st1 {v3.16b}, [x1]; 239 + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) 290 240 291 - .Lblk4_store_output_done: 292 - ret; 293 - SYM_FUNC_END(__sm4_neon_crypt_blk1_4) 241 + st1 {v0.16b-v3.16b}, [x1], #64 242 + st1 {v4.16b-v7.16b}, [x1], #64 294 243 295 - .align 3 296 - SYM_FUNC_START(sm4_neon_crypt_blk1_8) 297 - /* input: 298 - * x0: round key array, CTX 299 - * x1: dst 300 - * x2: src 301 - * w3: num blocks (1..8) 302 - */ 303 - cmp w3, #5; 304 - blt 
__sm4_neon_crypt_blk1_4; 244 + cbz w3, .Lcrypt_end 245 + b .Lcrypt_loop_8x 305 246 306 - PREPARE; 247 + .Lcrypt_4x: 248 + add w3, w3, #8 249 + cmp w3, #4 250 + blt .Lcrypt_tail 307 251 308 - ld1 {v0.16b-v3.16b}, [x2], #64; 309 - ld1 {v4.16b}, [x2], #16; 310 - mov v5.16b, v4.16b; 311 - mov v6.16b, v4.16b; 312 - mov v7.16b, v4.16b; 313 - beq .Lblk8_load_input_done; 314 - ld1 {v5.16b}, [x2], #16; 315 - cmp w3, #7; 316 - blt .Lblk8_load_input_done; 317 - ld1 {v6.16b}, [x2], #16; 318 - beq .Lblk8_load_input_done; 319 - ld1 {v7.16b}, [x2]; 252 + sub w3, w3, #4 320 253 321 - .Lblk8_load_input_done: 322 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); 254 + ld4 {v0.4s-v3.4s}, [x2], #64 323 255 324 - cmp w3, #6; 325 - st1 {v0.16b-v3.16b}, [x1], #64; 326 - st1 {v4.16b}, [x1], #16; 327 - blt .Lblk8_store_output_done; 328 - st1 {v5.16b}, [x1], #16; 329 - beq .Lblk8_store_output_done; 330 - st1 {v6.16b}, [x1], #16; 331 - cmp w3, #7; 332 - beq .Lblk8_store_output_done; 333 - st1 {v7.16b}, [x1]; 256 + SM4_CRYPT_BLK4(v0, v1, v2, v3) 334 257 335 - .Lblk8_store_output_done: 336 - ret; 337 - SYM_FUNC_END(sm4_neon_crypt_blk1_8) 258 + st1 {v0.16b-v3.16b}, [x1], #64 338 259 339 - .align 3 340 - SYM_FUNC_START(sm4_neon_crypt_blk8) 341 - /* input: 342 - * x0: round key array, CTX 343 - * x1: dst 344 - * x2: src 345 - * w3: nblocks (multiples of 8) 346 - */ 347 - PREPARE; 260 + cbz w3, .Lcrypt_end 348 261 349 - .Lcrypt_loop_blk: 350 - subs w3, w3, #8; 351 - bmi .Lcrypt_end; 262 + .Lcrypt_tail: 263 + cmp w3, #2 264 + ld1 {v0.16b}, [x2], #16 265 + blt .Lcrypt_tail_load_done 266 + ld1 {v1.16b}, [x2], #16 267 + beq .Lcrypt_tail_load_done 268 + ld1 {v2.16b}, [x2], #16 352 269 353 - ld1 {v0.16b-v3.16b}, [x2], #64; 354 - ld1 {v4.16b-v7.16b}, [x2], #64; 270 + .Lcrypt_tail_load_done: 271 + transpose_4x4(v0, v1, v2, v3) 355 272 356 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); 273 + SM4_CRYPT_BLK4(v0, v1, v2, v3) 357 274 358 - st1 {v0.16b-v3.16b}, [x1], #64; 359 - st1 {v4.16b-v7.16b}, [x1], 
#64; 360 - 361 - b .Lcrypt_loop_blk; 275 + cmp w3, #2 276 + st1 {v0.16b}, [x1], #16 277 + blt .Lcrypt_end 278 + st1 {v1.16b}, [x1], #16 279 + beq .Lcrypt_end 280 + st1 {v2.16b}, [x1], #16 362 281 363 282 .Lcrypt_end: 364 - ret; 365 - SYM_FUNC_END(sm4_neon_crypt_blk8) 283 + ret 284 + SYM_FUNC_END(sm4_neon_crypt) 366 285 367 286 .align 3 368 - SYM_FUNC_START(sm4_neon_cbc_dec_blk8) 287 + SYM_FUNC_START(sm4_neon_cbc_dec) 369 288 /* input: 370 289 * x0: round key array, CTX 371 290 * x1: dst 372 291 * x2: src 373 292 * x3: iv (big endian, 128 bit) 374 - * w4: nblocks (multiples of 8) 293 + * w4: nblocks 375 294 */ 376 - PREPARE; 295 + SM4_PREPARE() 377 296 378 - ld1 {RIV.16b}, [x3]; 297 + ld1 {RIV.16b}, [x3] 379 298 380 - .Lcbc_loop_blk: 381 - subs w4, w4, #8; 382 - bmi .Lcbc_end; 299 + .Lcbc_dec_loop_8x: 300 + sub w4, w4, #8 301 + tbnz w4, #31, .Lcbc_dec_4x 383 302 384 - ld1 {v0.16b-v3.16b}, [x2], #64; 385 - ld1 {v4.16b-v7.16b}, [x2]; 303 + ld4 {v0.4s-v3.4s}, [x2], #64 304 + ld4 {v4.4s-v7.4s}, [x2] 386 305 387 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); 306 + SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7) 388 307 389 - sub x2, x2, #64; 390 - eor v0.16b, v0.16b, RIV.16b; 391 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 392 - eor v1.16b, v1.16b, RTMP0.16b; 393 - eor v2.16b, v2.16b, RTMP1.16b; 394 - eor v3.16b, v3.16b, RTMP2.16b; 395 - st1 {v0.16b-v3.16b}, [x1], #64; 308 + /* Avoid overwriting the RIV register */ 309 + rotate_clockwise_4x4(v0, v1, v2, v3) 310 + rotate_clockwise_4x4(v4, v5, v6, v7) 396 311 397 - eor v4.16b, v4.16b, RTMP3.16b; 398 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 399 - eor v5.16b, v5.16b, RTMP0.16b; 400 - eor v6.16b, v6.16b, RTMP1.16b; 401 - eor v7.16b, v7.16b, RTMP2.16b; 312 + sub x2, x2, #64 402 313 403 - mov RIV.16b, RTMP3.16b; 404 - st1 {v4.16b-v7.16b}, [x1], #64; 314 + eor v0.16b, v0.16b, RIV.16b 405 315 406 - b .Lcbc_loop_blk; 316 + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 317 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 407 318 408 - 
.Lcbc_end: 319 + eor v1.16b, v1.16b, RTMP0.16b 320 + eor v2.16b, v2.16b, RTMP1.16b 321 + eor v3.16b, v3.16b, RTMP2.16b 322 + eor v4.16b, v4.16b, RTMP3.16b 323 + eor v5.16b, v5.16b, RTMP4.16b 324 + eor v6.16b, v6.16b, RTMP5.16b 325 + eor v7.16b, v7.16b, RTMP6.16b 326 + 327 + mov RIV.16b, RTMP7.16b 328 + 329 + st1 {v0.16b-v3.16b}, [x1], #64 330 + st1 {v4.16b-v7.16b}, [x1], #64 331 + 332 + cbz w4, .Lcbc_dec_end 333 + b .Lcbc_dec_loop_8x 334 + 335 + .Lcbc_dec_4x: 336 + add w4, w4, #8 337 + cmp w4, #4 338 + blt .Lcbc_dec_tail 339 + 340 + sub w4, w4, #4 341 + 342 + ld1 {v0.16b-v3.16b}, [x2], #64 343 + 344 + rev32 v4.16b, v0.16b 345 + rev32 v5.16b, v1.16b 346 + rev32 v6.16b, v2.16b 347 + rev32 v7.16b, v3.16b 348 + 349 + transpose_4x4(v4, v5, v6, v7) 350 + 351 + SM4_CRYPT_BLK4_BE(v4, v5, v6, v7) 352 + 353 + eor v4.16b, v4.16b, RIV.16b 354 + eor v5.16b, v5.16b, v0.16b 355 + eor v6.16b, v6.16b, v1.16b 356 + eor v7.16b, v7.16b, v2.16b 357 + 358 + mov RIV.16b, v3.16b 359 + 360 + st1 {v4.16b-v7.16b}, [x1], #64 361 + 362 + cbz w4, .Lcbc_dec_end 363 + 364 + .Lcbc_dec_tail: 365 + cmp w4, #2 366 + ld1 {v0.16b}, [x2], #16 367 + blt .Lcbc_dec_tail_load_done 368 + ld1 {v1.16b}, [x2], #16 369 + beq .Lcbc_dec_tail_load_done 370 + ld1 {v2.16b}, [x2], #16 371 + 372 + .Lcbc_dec_tail_load_done: 373 + rev32 v4.16b, v0.16b 374 + rev32 v5.16b, v1.16b 375 + rev32 v6.16b, v2.16b 376 + 377 + transpose_4x4(v4, v5, v6, v7) 378 + 379 + SM4_CRYPT_BLK4_BE(v4, v5, v6, v7) 380 + 381 + cmp w4, #2 382 + eor v4.16b, v4.16b, RIV.16b 383 + mov RIV.16b, v0.16b 384 + st1 {v4.16b}, [x1], #16 385 + blt .Lcbc_dec_end 386 + 387 + eor v5.16b, v5.16b, v0.16b 388 + mov RIV.16b, v1.16b 389 + st1 {v5.16b}, [x1], #16 390 + beq .Lcbc_dec_end 391 + 392 + eor v6.16b, v6.16b, v1.16b 393 + mov RIV.16b, v2.16b 394 + st1 {v6.16b}, [x1], #16 395 + 396 + .Lcbc_dec_end: 409 397 /* store new IV */ 410 - st1 {RIV.16b}, [x3]; 398 + st1 {RIV.16b}, [x3] 411 399 412 - ret; 413 - SYM_FUNC_END(sm4_neon_cbc_dec_blk8) 400 + ret 401 + 
SYM_FUNC_END(sm4_neon_cbc_dec) 414 402 415 403 .align 3 416 - SYM_FUNC_START(sm4_neon_cfb_dec_blk8) 404 + SYM_FUNC_START(sm4_neon_cfb_dec) 417 405 /* input: 418 406 * x0: round key array, CTX 419 407 * x1: dst 420 408 * x2: src 421 409 * x3: iv (big endian, 128 bit) 422 - * w4: nblocks (multiples of 8) 410 + * w4: nblocks 423 411 */ 424 - PREPARE; 412 + SM4_PREPARE() 425 413 426 - ld1 {v0.16b}, [x3]; 414 + ld1 {v0.16b}, [x3] 427 415 428 - .Lcfb_loop_blk: 429 - subs w4, w4, #8; 430 - bmi .Lcfb_end; 416 + .Lcfb_dec_loop_8x: 417 + sub w4, w4, #8 418 + tbnz w4, #31, .Lcfb_dec_4x 431 419 432 - ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48; 433 - ld1 {v4.16b-v7.16b}, [x2]; 420 + ld1 {v1.16b-v3.16b}, [x2], #48 421 + ld4 {v4.4s-v7.4s}, [x2] 434 422 435 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); 423 + transpose_4x4(v0, v1, v2, v3) 436 424 437 - sub x2, x2, #48; 438 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 439 - eor v0.16b, v0.16b, RTMP0.16b; 440 - eor v1.16b, v1.16b, RTMP1.16b; 441 - eor v2.16b, v2.16b, RTMP2.16b; 442 - eor v3.16b, v3.16b, RTMP3.16b; 443 - st1 {v0.16b-v3.16b}, [x1], #64; 425 + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) 444 426 445 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 446 - eor v4.16b, v4.16b, RTMP0.16b; 447 - eor v5.16b, v5.16b, RTMP1.16b; 448 - eor v6.16b, v6.16b, RTMP2.16b; 449 - eor v7.16b, v7.16b, RTMP3.16b; 450 - st1 {v4.16b-v7.16b}, [x1], #64; 427 + sub x2, x2, #48 428 + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 429 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 451 430 452 - mov v0.16b, RTMP3.16b; 431 + eor v0.16b, v0.16b, RTMP0.16b 432 + eor v1.16b, v1.16b, RTMP1.16b 433 + eor v2.16b, v2.16b, RTMP2.16b 434 + eor v3.16b, v3.16b, RTMP3.16b 435 + eor v4.16b, v4.16b, RTMP4.16b 436 + eor v5.16b, v5.16b, RTMP5.16b 437 + eor v6.16b, v6.16b, RTMP6.16b 438 + eor v7.16b, v7.16b, RTMP7.16b 453 439 454 - b .Lcfb_loop_blk; 440 + st1 {v0.16b-v3.16b}, [x1], #64 441 + st1 {v4.16b-v7.16b}, [x1], #64 455 442 456 - .Lcfb_end: 443 + mov v0.16b, RTMP7.16b 444 + 445 + cbz w4, 
.Lcfb_dec_end 446 + b .Lcfb_dec_loop_8x 447 + 448 + .Lcfb_dec_4x: 449 + add w4, w4, #8 450 + cmp w4, #4 451 + blt .Lcfb_dec_tail 452 + 453 + sub w4, w4, #4 454 + 455 + ld1 {v4.16b-v7.16b}, [x2], #64 456 + 457 + rev32 v0.16b, v0.16b /* v0 is IV register */ 458 + rev32 v1.16b, v4.16b 459 + rev32 v2.16b, v5.16b 460 + rev32 v3.16b, v6.16b 461 + 462 + transpose_4x4(v0, v1, v2, v3) 463 + 464 + SM4_CRYPT_BLK4_BE(v0, v1, v2, v3) 465 + 466 + eor v0.16b, v0.16b, v4.16b 467 + eor v1.16b, v1.16b, v5.16b 468 + eor v2.16b, v2.16b, v6.16b 469 + eor v3.16b, v3.16b, v7.16b 470 + 471 + st1 {v0.16b-v3.16b}, [x1], #64 472 + 473 + mov v0.16b, v7.16b 474 + 475 + cbz w4, .Lcfb_dec_end 476 + 477 + .Lcfb_dec_tail: 478 + cmp w4, #2 479 + ld1 {v4.16b}, [x2], #16 480 + blt .Lcfb_dec_tail_load_done 481 + ld1 {v5.16b}, [x2], #16 482 + beq .Lcfb_dec_tail_load_done 483 + ld1 {v6.16b}, [x2], #16 484 + 485 + .Lcfb_dec_tail_load_done: 486 + rev32 v0.16b, v0.16b /* v0 is IV register */ 487 + rev32 v1.16b, v4.16b 488 + rev32 v2.16b, v5.16b 489 + 490 + transpose_4x4(v0, v1, v2, v3) 491 + 492 + SM4_CRYPT_BLK4_BE(v0, v1, v2, v3) 493 + 494 + cmp w4, #2 495 + eor v0.16b, v0.16b, v4.16b 496 + st1 {v0.16b}, [x1], #16 497 + mov v0.16b, v4.16b 498 + blt .Lcfb_dec_end 499 + 500 + eor v1.16b, v1.16b, v5.16b 501 + st1 {v1.16b}, [x1], #16 502 + mov v0.16b, v5.16b 503 + beq .Lcfb_dec_end 504 + 505 + eor v2.16b, v2.16b, v6.16b 506 + st1 {v2.16b}, [x1], #16 507 + mov v0.16b, v6.16b 508 + 509 + .Lcfb_dec_end: 457 510 /* store new IV */ 458 - st1 {v0.16b}, [x3]; 511 + st1 {v0.16b}, [x3] 459 512 460 - ret; 461 - SYM_FUNC_END(sm4_neon_cfb_dec_blk8) 513 + ret 514 + SYM_FUNC_END(sm4_neon_cfb_dec) 462 515 463 516 .align 3 464 - SYM_FUNC_START(sm4_neon_ctr_enc_blk8) 517 + SYM_FUNC_START(sm4_neon_ctr_crypt) 465 518 /* input: 466 519 * x0: round key array, CTX 467 520 * x1: dst 468 521 * x2: src 469 522 * x3: ctr (big endian, 128 bit) 470 - * w4: nblocks (multiples of 8) 523 + * w4: nblocks 471 524 */ 472 - PREPARE; 525 + 
SM4_PREPARE() 473 526 474 - ldp x7, x8, [x3]; 475 - rev x7, x7; 476 - rev x8, x8; 527 + ldp x7, x8, [x3] 528 + rev x7, x7 529 + rev x8, x8 477 530 478 - .Lctr_loop_blk: 479 - subs w4, w4, #8; 480 - bmi .Lctr_end; 531 + .Lctr_crypt_loop_8x: 532 + sub w4, w4, #8 533 + tbnz w4, #31, .Lctr_crypt_4x 481 534 482 - #define inc_le128(vctr) \ 483 - mov vctr.d[1], x8; \ 484 - mov vctr.d[0], x7; \ 485 - adds x8, x8, #1; \ 486 - adc x7, x7, xzr; \ 487 - rev64 vctr.16b, vctr.16b; 535 + #define inc_le128(vctr) \ 536 + mov vctr.d[1], x8; \ 537 + mov vctr.d[0], x7; \ 538 + adds x8, x8, #1; \ 539 + rev64 vctr.16b, vctr.16b; \ 540 + adc x7, x7, xzr; 488 541 489 542 /* construct CTRs */ 490 - inc_le128(v0); /* +0 */ 491 - inc_le128(v1); /* +1 */ 492 - inc_le128(v2); /* +2 */ 493 - inc_le128(v3); /* +3 */ 494 - inc_le128(v4); /* +4 */ 495 - inc_le128(v5); /* +5 */ 496 - inc_le128(v6); /* +6 */ 497 - inc_le128(v7); /* +7 */ 543 + inc_le128(v0) /* +0 */ 544 + inc_le128(v1) /* +1 */ 545 + inc_le128(v2) /* +2 */ 546 + inc_le128(v3) /* +3 */ 547 + inc_le128(v4) /* +4 */ 548 + inc_le128(v5) /* +5 */ 549 + inc_le128(v6) /* +6 */ 550 + inc_le128(v7) /* +7 */ 498 551 499 - SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7); 552 + transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7) 500 553 501 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 502 - eor v0.16b, v0.16b, RTMP0.16b; 503 - eor v1.16b, v1.16b, RTMP1.16b; 504 - eor v2.16b, v2.16b, RTMP2.16b; 505 - eor v3.16b, v3.16b, RTMP3.16b; 506 - st1 {v0.16b-v3.16b}, [x1], #64; 554 + SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7) 507 555 508 - ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64; 509 - eor v4.16b, v4.16b, RTMP0.16b; 510 - eor v5.16b, v5.16b, RTMP1.16b; 511 - eor v6.16b, v6.16b, RTMP2.16b; 512 - eor v7.16b, v7.16b, RTMP3.16b; 513 - st1 {v4.16b-v7.16b}, [x1], #64; 556 + ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64 557 + ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64 514 558 515 - b .Lctr_loop_blk; 559 + eor v0.16b, v0.16b, RTMP0.16b 560 + eor v1.16b, v1.16b, RTMP1.16b 561 + eor 
v2.16b, v2.16b, RTMP2.16b 562 + eor v3.16b, v3.16b, RTMP3.16b 563 + eor v4.16b, v4.16b, RTMP4.16b 564 + eor v5.16b, v5.16b, RTMP5.16b 565 + eor v6.16b, v6.16b, RTMP6.16b 566 + eor v7.16b, v7.16b, RTMP7.16b 516 567 517 - .Lctr_end: 568 + st1 {v0.16b-v3.16b}, [x1], #64 569 + st1 {v4.16b-v7.16b}, [x1], #64 570 + 571 + cbz w4, .Lctr_crypt_end 572 + b .Lctr_crypt_loop_8x 573 + 574 + .Lctr_crypt_4x: 575 + add w4, w4, #8 576 + cmp w4, #4 577 + blt .Lctr_crypt_tail 578 + 579 + sub w4, w4, #4 580 + 581 + /* construct CTRs */ 582 + inc_le128(v0) /* +0 */ 583 + inc_le128(v1) /* +1 */ 584 + inc_le128(v2) /* +2 */ 585 + inc_le128(v3) /* +3 */ 586 + 587 + ld1 {v4.16b-v7.16b}, [x2], #64 588 + 589 + transpose_4x4(v0, v1, v2, v3) 590 + 591 + SM4_CRYPT_BLK4(v0, v1, v2, v3) 592 + 593 + eor v0.16b, v0.16b, v4.16b 594 + eor v1.16b, v1.16b, v5.16b 595 + eor v2.16b, v2.16b, v6.16b 596 + eor v3.16b, v3.16b, v7.16b 597 + 598 + st1 {v0.16b-v3.16b}, [x1], #64 599 + 600 + cbz w4, .Lctr_crypt_end 601 + 602 + .Lctr_crypt_tail: 603 + /* inc_le128 will change the sign bit */ 604 + ld1 {v4.16b}, [x2], #16 605 + inc_le128(v0) 606 + cmp w4, #2 607 + blt .Lctr_crypt_tail_load_done 608 + 609 + ld1 {v5.16b}, [x2], #16 610 + inc_le128(v1) 611 + cmp w4, #2 612 + beq .Lctr_crypt_tail_load_done 613 + 614 + ld1 {v6.16b}, [x2], #16 615 + inc_le128(v2) 616 + 617 + .Lctr_crypt_tail_load_done: 618 + transpose_4x4(v0, v1, v2, v3) 619 + 620 + SM4_CRYPT_BLK4(v0, v1, v2, v3) 621 + 622 + cmp w4, #2 623 + 624 + eor v0.16b, v0.16b, v4.16b 625 + st1 {v0.16b}, [x1], #16 626 + blt .Lctr_crypt_end 627 + 628 + eor v1.16b, v1.16b, v5.16b 629 + st1 {v1.16b}, [x1], #16 630 + beq .Lctr_crypt_end 631 + 632 + eor v2.16b, v2.16b, v6.16b 633 + st1 {v2.16b}, [x1], #16 634 + 635 + .Lctr_crypt_end: 518 636 /* store new CTR */ 519 - rev x7, x7; 520 - rev x8, x8; 521 - stp x7, x8, [x3]; 637 + rev x7, x7 638 + rev x8, x8 639 + stp x7, x8, [x3] 522 640 523 - ret; 524 - SYM_FUNC_END(sm4_neon_ctr_enc_blk8) 641 + ret 642 + 
SYM_FUNC_END(sm4_neon_ctr_crypt)
+49 -131
arch/arm64/crypto/sm4-neon-glue.c
··· 18 18 #include <crypto/internal/skcipher.h> 19 19 #include <crypto/sm4.h> 20 20 21 - #define BYTES2BLKS(nbytes) ((nbytes) >> 4) 22 - #define BYTES2BLK8(nbytes) (((nbytes) >> 4) & ~(8 - 1)) 23 - 24 - asmlinkage void sm4_neon_crypt_blk1_8(const u32 *rkey, u8 *dst, const u8 *src, 25 - unsigned int nblks); 26 - asmlinkage void sm4_neon_crypt_blk8(const u32 *rkey, u8 *dst, const u8 *src, 27 - unsigned int nblks); 28 - asmlinkage void sm4_neon_cbc_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, 29 - u8 *iv, unsigned int nblks); 30 - asmlinkage void sm4_neon_cfb_dec_blk8(const u32 *rkey, u8 *dst, const u8 *src, 31 - u8 *iv, unsigned int nblks); 32 - asmlinkage void sm4_neon_ctr_enc_blk8(const u32 *rkey, u8 *dst, const u8 *src, 33 - u8 *iv, unsigned int nblks); 21 + asmlinkage void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src, 22 + unsigned int nblocks); 23 + asmlinkage void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst, const u8 *src, 24 + u8 *iv, unsigned int nblocks); 25 + asmlinkage void sm4_neon_cfb_dec(const u32 *rkey_enc, u8 *dst, const u8 *src, 26 + u8 *iv, unsigned int nblocks); 27 + asmlinkage void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst, const u8 *src, 28 + u8 *iv, unsigned int nblocks); 34 29 35 30 static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, 36 31 unsigned int key_len) ··· 46 51 while ((nbytes = walk.nbytes) > 0) { 47 52 const u8 *src = walk.src.virt.addr; 48 53 u8 *dst = walk.dst.virt.addr; 49 - unsigned int nblks; 54 + unsigned int nblocks; 50 55 51 - kernel_neon_begin(); 56 + nblocks = nbytes / SM4_BLOCK_SIZE; 57 + if (nblocks) { 58 + kernel_neon_begin(); 52 59 53 - nblks = BYTES2BLK8(nbytes); 54 - if (nblks) { 55 - sm4_neon_crypt_blk8(rkey, dst, src, nblks); 56 - dst += nblks * SM4_BLOCK_SIZE; 57 - src += nblks * SM4_BLOCK_SIZE; 58 - nbytes -= nblks * SM4_BLOCK_SIZE; 60 + sm4_neon_crypt(rkey, dst, src, nblocks); 61 + 62 + kernel_neon_end(); 59 63 } 60 64 61 - nblks = BYTES2BLKS(nbytes); 62 - if (nblks) { 63 - 
sm4_neon_crypt_blk1_8(rkey, dst, src, nblks); 64 - nbytes -= nblks * SM4_BLOCK_SIZE; 65 - } 66 - 67 - kernel_neon_end(); 68 - 69 - err = skcipher_walk_done(&walk, nbytes); 65 + err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); 70 66 } 71 67 72 68 return err; ··· 124 138 while ((nbytes = walk.nbytes) > 0) { 125 139 const u8 *src = walk.src.virt.addr; 126 140 u8 *dst = walk.dst.virt.addr; 127 - unsigned int nblks; 141 + unsigned int nblocks; 128 142 129 - kernel_neon_begin(); 143 + nblocks = nbytes / SM4_BLOCK_SIZE; 144 + if (nblocks) { 145 + kernel_neon_begin(); 130 146 131 - nblks = BYTES2BLK8(nbytes); 132 - if (nblks) { 133 - sm4_neon_cbc_dec_blk8(ctx->rkey_dec, dst, src, 134 - walk.iv, nblks); 135 - dst += nblks * SM4_BLOCK_SIZE; 136 - src += nblks * SM4_BLOCK_SIZE; 137 - nbytes -= nblks * SM4_BLOCK_SIZE; 147 + sm4_neon_cbc_dec(ctx->rkey_dec, dst, src, 148 + walk.iv, nblocks); 149 + 150 + kernel_neon_end(); 138 151 } 139 152 140 - nblks = BYTES2BLKS(nbytes); 141 - if (nblks) { 142 - u8 keystream[SM4_BLOCK_SIZE * 8]; 143 - u8 iv[SM4_BLOCK_SIZE]; 144 - int i; 145 - 146 - sm4_neon_crypt_blk1_8(ctx->rkey_dec, keystream, 147 - src, nblks); 148 - 149 - src += ((int)nblks - 2) * SM4_BLOCK_SIZE; 150 - dst += (nblks - 1) * SM4_BLOCK_SIZE; 151 - memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); 152 - 153 - for (i = nblks - 1; i > 0; i--) { 154 - crypto_xor_cpy(dst, src, 155 - &keystream[i * SM4_BLOCK_SIZE], 156 - SM4_BLOCK_SIZE); 157 - src -= SM4_BLOCK_SIZE; 158 - dst -= SM4_BLOCK_SIZE; 159 - } 160 - crypto_xor_cpy(dst, walk.iv, 161 - keystream, SM4_BLOCK_SIZE); 162 - memcpy(walk.iv, iv, SM4_BLOCK_SIZE); 163 - nbytes -= nblks * SM4_BLOCK_SIZE; 164 - } 165 - 166 - kernel_neon_end(); 167 - 168 - err = skcipher_walk_done(&walk, nbytes); 153 + err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE); 169 154 } 170 155 171 156 return err; ··· 195 238 while ((nbytes = walk.nbytes) > 0) { 196 239 const u8 *src = walk.src.virt.addr; 197 240 u8 *dst = walk.dst.virt.addr; 
198 - unsigned int nblks; 241 + unsigned int nblocks; 199 242 200 - kernel_neon_begin(); 243 + nblocks = nbytes / SM4_BLOCK_SIZE; 244 + if (nblocks) { 245 + kernel_neon_begin(); 201 246 202 - nblks = BYTES2BLK8(nbytes); 203 - if (nblks) { 204 - sm4_neon_cfb_dec_blk8(ctx->rkey_enc, dst, src, 205 - walk.iv, nblks); 206 - dst += nblks * SM4_BLOCK_SIZE; 207 - src += nblks * SM4_BLOCK_SIZE; 208 - nbytes -= nblks * SM4_BLOCK_SIZE; 247 + sm4_neon_cfb_dec(ctx->rkey_enc, dst, src, 248 + walk.iv, nblocks); 249 + 250 + kernel_neon_end(); 251 + 252 + dst += nblocks * SM4_BLOCK_SIZE; 253 + src += nblocks * SM4_BLOCK_SIZE; 254 + nbytes -= nblocks * SM4_BLOCK_SIZE; 209 255 } 210 - 211 - nblks = BYTES2BLKS(nbytes); 212 - if (nblks) { 213 - u8 keystream[SM4_BLOCK_SIZE * 8]; 214 - 215 - memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); 216 - if (nblks > 1) 217 - memcpy(&keystream[SM4_BLOCK_SIZE], src, 218 - (nblks - 1) * SM4_BLOCK_SIZE); 219 - memcpy(walk.iv, src + (nblks - 1) * SM4_BLOCK_SIZE, 220 - SM4_BLOCK_SIZE); 221 - 222 - sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, 223 - keystream, nblks); 224 - 225 - crypto_xor_cpy(dst, src, keystream, 226 - nblks * SM4_BLOCK_SIZE); 227 - dst += nblks * SM4_BLOCK_SIZE; 228 - src += nblks * SM4_BLOCK_SIZE; 229 - nbytes -= nblks * SM4_BLOCK_SIZE; 230 - } 231 - 232 - kernel_neon_end(); 233 256 234 257 /* tail */ 235 258 if (walk.nbytes == walk.total && nbytes > 0) { ··· 239 302 while ((nbytes = walk.nbytes) > 0) { 240 303 const u8 *src = walk.src.virt.addr; 241 304 u8 *dst = walk.dst.virt.addr; 242 - unsigned int nblks; 305 + unsigned int nblocks; 243 306 244 - kernel_neon_begin(); 307 + nblocks = nbytes / SM4_BLOCK_SIZE; 308 + if (nblocks) { 309 + kernel_neon_begin(); 245 310 246 - nblks = BYTES2BLK8(nbytes); 247 - if (nblks) { 248 - sm4_neon_ctr_enc_blk8(ctx->rkey_enc, dst, src, 249 - walk.iv, nblks); 250 - dst += nblks * SM4_BLOCK_SIZE; 251 - src += nblks * SM4_BLOCK_SIZE; 252 - nbytes -= nblks * SM4_BLOCK_SIZE; 311 + 
sm4_neon_ctr_crypt(ctx->rkey_enc, dst, src, 312 + walk.iv, nblocks); 313 + 314 + kernel_neon_end(); 315 + 316 + dst += nblocks * SM4_BLOCK_SIZE; 317 + src += nblocks * SM4_BLOCK_SIZE; 318 + nbytes -= nblocks * SM4_BLOCK_SIZE; 253 319 } 254 - 255 - nblks = BYTES2BLKS(nbytes); 256 - if (nblks) { 257 - u8 keystream[SM4_BLOCK_SIZE * 8]; 258 - int i; 259 - 260 - for (i = 0; i < nblks; i++) { 261 - memcpy(&keystream[i * SM4_BLOCK_SIZE], 262 - walk.iv, SM4_BLOCK_SIZE); 263 - crypto_inc(walk.iv, SM4_BLOCK_SIZE); 264 - } 265 - sm4_neon_crypt_blk1_8(ctx->rkey_enc, keystream, 266 - keystream, nblks); 267 - 268 - crypto_xor_cpy(dst, src, keystream, 269 - nblks * SM4_BLOCK_SIZE); 270 - dst += nblks * SM4_BLOCK_SIZE; 271 - src += nblks * SM4_BLOCK_SIZE; 272 - nbytes -= nblks * SM4_BLOCK_SIZE; 273 - } 274 - 275 - kernel_neon_end(); 276 320 277 321 /* tail */ 278 322 if (walk.nbytes == walk.total && nbytes > 0) {