Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

crypto: arm64/sm4 - add CE implementation for CTS-CBC mode

This patch adds a CE-optimized assembly implementation of CTS-CBC mode.

Benchmarked on a T-Head Yitian-710 at 2.75 GHz. The data comes from mode 218
of tcrypt and compares the performance before and after this patch (the driver
used before this patch is cts(cbc-sm4-ce)). The column headers are the lengths
of the blocks processed. The data is tabulated and the unit is Mb/s:

Before:

cts(cbc-sm4-ce) |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
CTS-CBC enc     |  286.09   297.17   457.97   627.75   868.58   900.80   957.69
CTS-CBC dec     |  286.67   285.63   538.35   947.08  2241.03  2577.32  3391.14

After:

cts-cbc-sm4-ce  |      16       64      128      256     1024     1420     4096
----------------+--------------------------------------------------------------
CTS-CBC enc     |  288.19   428.80   593.57   741.04   911.73   931.80   950.00
CTS-CBC dec     |  292.22   468.99   838.23  1380.76  2741.17  3036.42  3409.62

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Tianjia Zhang and committed by
Herbert Xu
b1863fd0 45089dbe

+196
+102
arch/arm64/crypto/sm4-ce-core.S
··· 307 307 SYM_FUNC_END(sm4_ce_cbc_dec) 308 308 309 309 .align 3 310 + SYM_FUNC_START(sm4_ce_cbc_cts_enc) 311 + /* input: 312 + * x0: round key array, CTX 313 + * x1: dst 314 + * x2: src 315 + * x3: iv (big endian, 128 bit) 316 + * w4: nbytes 317 + */ 318 + SM4_PREPARE(x0) 319 + 320 + sub w5, w4, #16 321 + uxtw x5, w5 322 + 323 + ld1 {RIV.16b}, [x3] 324 + 325 + ld1 {v0.16b}, [x2] 326 + eor RIV.16b, RIV.16b, v0.16b 327 + SM4_CRYPT_BLK(RIV) 328 + 329 + /* load permute table */ 330 + adr_l x6, .Lcts_permute_table 331 + add x7, x6, #32 332 + add x6, x6, x5 333 + sub x7, x7, x5 334 + ld1 {v3.16b}, [x6] 335 + ld1 {v4.16b}, [x7] 336 + 337 + /* overlapping loads */ 338 + add x2, x2, x5 339 + ld1 {v1.16b}, [x2] 340 + 341 + /* create Cn from En-1 */ 342 + tbl v0.16b, {RIV.16b}, v3.16b 343 + /* padding Pn with zeros */ 344 + tbl v1.16b, {v1.16b}, v4.16b 345 + 346 + eor v1.16b, v1.16b, RIV.16b 347 + SM4_CRYPT_BLK(v1) 348 + 349 + /* overlapping stores */ 350 + add x5, x1, x5 351 + st1 {v0.16b}, [x5] 352 + st1 {v1.16b}, [x1] 353 + 354 + ret 355 + SYM_FUNC_END(sm4_ce_cbc_cts_enc) 356 + 357 + .align 3 358 + SYM_FUNC_START(sm4_ce_cbc_cts_dec) 359 + /* input: 360 + * x0: round key array, CTX 361 + * x1: dst 362 + * x2: src 363 + * x3: iv (big endian, 128 bit) 364 + * w4: nbytes 365 + */ 366 + SM4_PREPARE(x0) 367 + 368 + sub w5, w4, #16 369 + uxtw x5, w5 370 + 371 + ld1 {RIV.16b}, [x3] 372 + 373 + /* load permute table */ 374 + adr_l x6, .Lcts_permute_table 375 + add x7, x6, #32 376 + add x6, x6, x5 377 + sub x7, x7, x5 378 + ld1 {v3.16b}, [x6] 379 + ld1 {v4.16b}, [x7] 380 + 381 + /* overlapping loads */ 382 + ld1 {v0.16b}, [x2], x5 383 + ld1 {v1.16b}, [x2] 384 + 385 + SM4_CRYPT_BLK(v0) 386 + /* select the first Ln bytes of Xn to create Pn */ 387 + tbl v2.16b, {v0.16b}, v3.16b 388 + eor v2.16b, v2.16b, v1.16b 389 + 390 + /* overwrite the first Ln bytes with Cn to create En-1 */ 391 + tbx v0.16b, {v1.16b}, v4.16b 392 + SM4_CRYPT_BLK(v0) 393 + eor v0.16b, v0.16b, RIV.16b 394 + 395 + 
/* overlapping stores */ 396 + add x5, x1, x5 397 + st1 {v2.16b}, [x5] 398 + st1 {v0.16b}, [x1] 399 + 400 + ret 401 + SYM_FUNC_END(sm4_ce_cbc_cts_dec) 402 + 403 + .align 3 310 404 SYM_FUNC_START(sm4_ce_cfb_enc) 311 405 /* input: 312 406 * x0: round key array, CTX ··· 670 576 .Lbswap128_mask: 671 577 .byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b 672 578 .byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 579 + 580 + .Lcts_permute_table: 581 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 582 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 583 + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 584 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf 585 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff 586 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+94
arch/arm64/crypto/sm4-ce-glue.c
··· 16 16 #include <asm/simd.h> 17 17 #include <crypto/internal/simd.h> 18 18 #include <crypto/internal/skcipher.h> 19 + #include <crypto/scatterwalk.h> 19 20 #include <crypto/sm4.h> 20 21 21 22 #define BYTES2BLKS(nbytes) ((nbytes) >> 4) ··· 30 29 u8 *iv, unsigned int nblocks); 31 30 asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src, 32 31 u8 *iv, unsigned int nblocks); 32 + asmlinkage void sm4_ce_cbc_cts_enc(const u32 *rkey, u8 *dst, const u8 *src, 33 + u8 *iv, unsigned int nbytes); 34 + asmlinkage void sm4_ce_cbc_cts_dec(const u32 *rkey, u8 *dst, const u8 *src, 35 + u8 *iv, unsigned int nbytes); 33 36 asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src, 34 37 u8 *iv, unsigned int nblks); 35 38 asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src, ··· 156 151 struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); 157 152 158 153 return sm4_cbc_crypt(req, ctx, false); 154 + } 155 + 156 + static int sm4_cbc_cts_crypt(struct skcipher_request *req, bool encrypt) 157 + { 158 + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); 159 + struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); 160 + struct scatterlist *src = req->src; 161 + struct scatterlist *dst = req->dst; 162 + struct scatterlist sg_src[2], sg_dst[2]; 163 + struct skcipher_request subreq; 164 + struct skcipher_walk walk; 165 + int cbc_blocks; 166 + int err; 167 + 168 + if (req->cryptlen < SM4_BLOCK_SIZE) 169 + return -EINVAL; 170 + 171 + if (req->cryptlen == SM4_BLOCK_SIZE) 172 + return sm4_cbc_crypt(req, ctx, encrypt); 173 + 174 + skcipher_request_set_tfm(&subreq, tfm); 175 + skcipher_request_set_callback(&subreq, skcipher_request_flags(req), 176 + NULL, NULL); 177 + 178 + /* handle the CBC cryption part */ 179 + cbc_blocks = DIV_ROUND_UP(req->cryptlen, SM4_BLOCK_SIZE) - 2; 180 + if (cbc_blocks) { 181 + skcipher_request_set_crypt(&subreq, src, dst, 182 + cbc_blocks * SM4_BLOCK_SIZE, 183 + req->iv); 184 + 185 + err = sm4_cbc_crypt(&subreq, ctx, encrypt); 
186 + if (err) 187 + return err; 188 + 189 + dst = src = scatterwalk_ffwd(sg_src, src, subreq.cryptlen); 190 + if (req->dst != req->src) 191 + dst = scatterwalk_ffwd(sg_dst, req->dst, 192 + subreq.cryptlen); 193 + } 194 + 195 + /* handle ciphertext stealing */ 196 + skcipher_request_set_crypt(&subreq, src, dst, 197 + req->cryptlen - cbc_blocks * SM4_BLOCK_SIZE, 198 + req->iv); 199 + 200 + err = skcipher_walk_virt(&walk, &subreq, false); 201 + if (err) 202 + return err; 203 + 204 + kernel_neon_begin(); 205 + 206 + if (encrypt) 207 + sm4_ce_cbc_cts_enc(ctx->rkey_enc, walk.dst.virt.addr, 208 + walk.src.virt.addr, walk.iv, walk.nbytes); 209 + else 210 + sm4_ce_cbc_cts_dec(ctx->rkey_dec, walk.dst.virt.addr, 211 + walk.src.virt.addr, walk.iv, walk.nbytes); 212 + 213 + kernel_neon_end(); 214 + 215 + return skcipher_walk_done(&walk, 0); 216 + } 217 + 218 + static int sm4_cbc_cts_encrypt(struct skcipher_request *req) 219 + { 220 + return sm4_cbc_cts_crypt(req, true); 221 + } 222 + 223 + static int sm4_cbc_cts_decrypt(struct skcipher_request *req) 224 + { 225 + return sm4_cbc_cts_crypt(req, false); 159 226 } 160 227 161 228 static int sm4_cfb_encrypt(struct skcipher_request *req) ··· 419 342 .setkey = sm4_setkey, 420 343 .encrypt = sm4_ctr_crypt, 421 344 .decrypt = sm4_ctr_crypt, 345 + }, { 346 + .base = { 347 + .cra_name = "cts(cbc(sm4))", 348 + .cra_driver_name = "cts-cbc-sm4-ce", 349 + .cra_priority = 400, 350 + .cra_blocksize = SM4_BLOCK_SIZE, 351 + .cra_ctxsize = sizeof(struct sm4_ctx), 352 + .cra_module = THIS_MODULE, 353 + }, 354 + .min_keysize = SM4_KEY_SIZE, 355 + .max_keysize = SM4_KEY_SIZE, 356 + .ivsize = SM4_BLOCK_SIZE, 357 + .walksize = SM4_BLOCK_SIZE * 2, 358 + .setkey = sm4_setkey, 359 + .encrypt = sm4_cbc_cts_encrypt, 360 + .decrypt = sm4_cbc_cts_decrypt, 422 361 } 423 362 }; 424 363 ··· 458 365 MODULE_ALIAS_CRYPTO("cbc(sm4)"); 459 366 MODULE_ALIAS_CRYPTO("cfb(sm4)"); 460 367 MODULE_ALIAS_CRYPTO("ctr(sm4)"); 368 + MODULE_ALIAS_CRYPTO("cts(cbc(sm4))"); 461 
369 MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); 462 370 MODULE_LICENSE("GPL v2");