Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac

This patch adds a CE-optimized assembly implementation of cmac/xcbc/cbcmac.

Benchmarked on a T-Head Yitian-710 at 2.75 GHz. The data comes from mode 300
of tcrypt and compares the performance before and after this patch (the driver
used before this patch is XXXmac(sm4-ce)). The columns correspond to different
update sizes. The data is tabulated and the unit is Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Tianjia Zhang and committed by
Herbert Xu
6b5360a5 01f63311

+336 -1
+70
arch/arm64/crypto/sm4-ce-core.S
··· 35 35 #define RTMP3 v19 36 36 37 37 #define RIV v20 38 + #define RMAC v20 38 39 #define RMASK v21 39 40 40 41 ··· 1007 1006 .Lxts_dec_ret: 1008 1007 ret 1009 1008 SYM_FUNC_END(sm4_ce_xts_dec) 1009 + 1010 + .align 3 1011 + SYM_FUNC_START(sm4_ce_mac_update) 1012 + /* input: 1013 + * x0: round key array, CTX 1014 + * x1: digest 1015 + * x2: src 1016 + * w3: nblocks 1017 + * w4: enc_before 1018 + * w5: enc_after 1019 + */ 1020 + SM4_PREPARE(x0) 1021 + 1022 + ld1 {RMAC.16b}, [x1] 1023 + 1024 + cbz w4, .Lmac_update 1025 + 1026 + SM4_CRYPT_BLK(RMAC) 1027 + 1028 + .Lmac_update: 1029 + cbz w3, .Lmac_ret 1030 + 1031 + sub w6, w3, #1 1032 + cmp w5, wzr 1033 + csel w3, w3, w6, ne 1034 + 1035 + cbz w3, .Lmac_end 1036 + 1037 + .Lmac_loop_4x: 1038 + cmp w3, #4 1039 + blt .Lmac_loop_1x 1040 + 1041 + sub w3, w3, #4 1042 + 1043 + ld1 {v0.16b-v3.16b}, [x2], #64 1044 + 1045 + eor RMAC.16b, RMAC.16b, v0.16b 1046 + SM4_CRYPT_BLK(RMAC) 1047 + eor RMAC.16b, RMAC.16b, v1.16b 1048 + SM4_CRYPT_BLK(RMAC) 1049 + eor RMAC.16b, RMAC.16b, v2.16b 1050 + SM4_CRYPT_BLK(RMAC) 1051 + eor RMAC.16b, RMAC.16b, v3.16b 1052 + SM4_CRYPT_BLK(RMAC) 1053 + 1054 + cbz w3, .Lmac_end 1055 + b .Lmac_loop_4x 1056 + 1057 + .Lmac_loop_1x: 1058 + sub w3, w3, #1 1059 + 1060 + ld1 {v0.16b}, [x2], #16 1061 + 1062 + eor RMAC.16b, RMAC.16b, v0.16b 1063 + SM4_CRYPT_BLK(RMAC) 1064 + 1065 + cbnz w3, .Lmac_loop_1x 1066 + 1067 + 1068 + .Lmac_end: 1069 + cbnz w5, .Lmac_ret 1070 + 1071 + ld1 {v0.16b}, [x2], #16 1072 + eor RMAC.16b, RMAC.16b, v0.16b 1073 + 1074 + .Lmac_ret: 1075 + st1 {RMAC.16b}, [x1] 1076 + ret 1077 + SYM_FUNC_END(sm4_ce_mac_update) 1010 1078 1011 1079 1012 1080 .section ".rodata", "a"
+266 -1
arch/arm64/crypto/sm4-ce-glue.c
··· 14 14 #include <linux/cpufeature.h> 15 15 #include <asm/neon.h> 16 16 #include <asm/simd.h> 17 + #include <crypto/b128ops.h> 17 18 #include <crypto/internal/simd.h> 18 19 #include <crypto/internal/skcipher.h> 20 + #include <crypto/internal/hash.h> 19 21 #include <crypto/scatterwalk.h> 20 22 #include <crypto/xts.h> 21 23 #include <crypto/sm4.h> ··· 49 47 asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src, 50 48 u8 *tweak, unsigned int nbytes, 51 49 const u32 *rkey2_enc); 50 + asmlinkage void sm4_ce_mac_update(const u32 *rkey_enc, u8 *digest, 51 + const u8 *src, unsigned int nblocks, 52 + bool enc_before, bool enc_after); 52 53 53 54 EXPORT_SYMBOL(sm4_ce_expand_key); 54 55 EXPORT_SYMBOL(sm4_ce_crypt_block); ··· 61 56 struct sm4_xts_ctx { 62 57 struct sm4_ctx key1; 63 58 struct sm4_ctx key2; 59 + }; 60 + 61 + struct sm4_mac_tfm_ctx { 62 + struct sm4_ctx key; 63 + u8 __aligned(8) consts[]; 64 + }; 65 + 66 + struct sm4_mac_desc_ctx { 67 + unsigned int len; 68 + u8 digest[SM4_BLOCK_SIZE]; 64 69 }; 65 70 66 71 static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key, ··· 609 594 } 610 595 }; 611 596 597 + static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key, 598 + unsigned int key_len) 599 + { 600 + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); 601 + 602 + if (key_len != SM4_KEY_SIZE) 603 + return -EINVAL; 604 + 605 + kernel_neon_begin(); 606 + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, 607 + crypto_sm4_fk, crypto_sm4_ck); 608 + kernel_neon_end(); 609 + 610 + return 0; 611 + } 612 + 613 + static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key, 614 + unsigned int key_len) 615 + { 616 + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); 617 + be128 *consts = (be128 *)ctx->consts; 618 + u64 a, b; 619 + 620 + if (key_len != SM4_KEY_SIZE) 621 + return -EINVAL; 622 + 623 + memset(consts, 0, SM4_BLOCK_SIZE); 624 + 625 + kernel_neon_begin(); 626 + 627 + sm4_ce_expand_key(key, 
ctx->key.rkey_enc, ctx->key.rkey_dec, 628 + crypto_sm4_fk, crypto_sm4_ck); 629 + 630 + /* encrypt the zero block */ 631 + sm4_ce_crypt_block(ctx->key.rkey_enc, (u8 *)consts, (const u8 *)consts); 632 + 633 + kernel_neon_end(); 634 + 635 + /* gf(2^128) multiply zero-ciphertext with u and u^2 */ 636 + a = be64_to_cpu(consts[0].a); 637 + b = be64_to_cpu(consts[0].b); 638 + consts[0].a = cpu_to_be64((a << 1) | (b >> 63)); 639 + consts[0].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); 640 + 641 + a = be64_to_cpu(consts[0].a); 642 + b = be64_to_cpu(consts[0].b); 643 + consts[1].a = cpu_to_be64((a << 1) | (b >> 63)); 644 + consts[1].b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0)); 645 + 646 + return 0; 647 + } 648 + 649 + static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key, 650 + unsigned int key_len) 651 + { 652 + struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm); 653 + u8 __aligned(8) key2[SM4_BLOCK_SIZE]; 654 + static u8 const ks[3][SM4_BLOCK_SIZE] = { 655 + { [0 ... SM4_BLOCK_SIZE - 1] = 0x1}, 656 + { [0 ... SM4_BLOCK_SIZE - 1] = 0x2}, 657 + { [0 ... 
SM4_BLOCK_SIZE - 1] = 0x3}, 658 + }; 659 + 660 + if (key_len != SM4_KEY_SIZE) 661 + return -EINVAL; 662 + 663 + kernel_neon_begin(); 664 + 665 + sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec, 666 + crypto_sm4_fk, crypto_sm4_ck); 667 + 668 + sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]); 669 + sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2); 670 + 671 + sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec, 672 + crypto_sm4_fk, crypto_sm4_ck); 673 + 674 + kernel_neon_end(); 675 + 676 + return 0; 677 + } 678 + 679 + static int sm4_mac_init(struct shash_desc *desc) 680 + { 681 + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); 682 + 683 + memset(ctx->digest, 0, SM4_BLOCK_SIZE); 684 + ctx->len = 0; 685 + 686 + return 0; 687 + } 688 + 689 + static int sm4_mac_update(struct shash_desc *desc, const u8 *p, 690 + unsigned int len) 691 + { 692 + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); 693 + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); 694 + unsigned int l, nblocks; 695 + 696 + if (len == 0) 697 + return 0; 698 + 699 + if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) { 700 + l = min(len, SM4_BLOCK_SIZE - ctx->len); 701 + 702 + crypto_xor(ctx->digest + ctx->len, p, l); 703 + ctx->len += l; 704 + len -= l; 705 + p += l; 706 + } 707 + 708 + if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) { 709 + kernel_neon_begin(); 710 + 711 + if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) { 712 + sm4_ce_crypt_block(tctx->key.rkey_enc, 713 + ctx->digest, ctx->digest); 714 + ctx->len = 0; 715 + } else { 716 + nblocks = len / SM4_BLOCK_SIZE; 717 + len %= SM4_BLOCK_SIZE; 718 + 719 + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, 720 + nblocks, (ctx->len == SM4_BLOCK_SIZE), 721 + (len != 0)); 722 + 723 + p += nblocks * SM4_BLOCK_SIZE; 724 + 725 + if (len == 0) 726 + ctx->len = SM4_BLOCK_SIZE; 727 + } 728 + 729 + kernel_neon_end(); 730 + 731 + if (len) { 732 + crypto_xor(ctx->digest, p, len); 733 + ctx->len = len; 734 + } 
735 + } 736 + 737 + return 0; 738 + } 739 + 740 + static int sm4_cmac_final(struct shash_desc *desc, u8 *out) 741 + { 742 + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); 743 + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); 744 + const u8 *consts = tctx->consts; 745 + 746 + if (ctx->len != SM4_BLOCK_SIZE) { 747 + ctx->digest[ctx->len] ^= 0x80; 748 + consts += SM4_BLOCK_SIZE; 749 + } 750 + 751 + kernel_neon_begin(); 752 + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, 753 + false, true); 754 + kernel_neon_end(); 755 + 756 + memcpy(out, ctx->digest, SM4_BLOCK_SIZE); 757 + 758 + return 0; 759 + } 760 + 761 + static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out) 762 + { 763 + struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); 764 + struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); 765 + 766 + if (ctx->len) { 767 + kernel_neon_begin(); 768 + sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, 769 + ctx->digest); 770 + kernel_neon_end(); 771 + } 772 + 773 + memcpy(out, ctx->digest, SM4_BLOCK_SIZE); 774 + 775 + return 0; 776 + } 777 + 778 + static struct shash_alg sm4_mac_algs[] = { 779 + { 780 + .base = { 781 + .cra_name = "cmac(sm4)", 782 + .cra_driver_name = "cmac-sm4-ce", 783 + .cra_priority = 400, 784 + .cra_blocksize = SM4_BLOCK_SIZE, 785 + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) 786 + + SM4_BLOCK_SIZE * 2, 787 + .cra_module = THIS_MODULE, 788 + }, 789 + .digestsize = SM4_BLOCK_SIZE, 790 + .init = sm4_mac_init, 791 + .update = sm4_mac_update, 792 + .final = sm4_cmac_final, 793 + .setkey = sm4_cmac_setkey, 794 + .descsize = sizeof(struct sm4_mac_desc_ctx), 795 + }, { 796 + .base = { 797 + .cra_name = "xcbc(sm4)", 798 + .cra_driver_name = "xcbc-sm4-ce", 799 + .cra_priority = 400, 800 + .cra_blocksize = SM4_BLOCK_SIZE, 801 + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) 802 + + SM4_BLOCK_SIZE * 2, 803 + .cra_module = THIS_MODULE, 804 + }, 805 + .digestsize = SM4_BLOCK_SIZE, 806 + .init = sm4_mac_init, 807 + 
.update = sm4_mac_update, 808 + .final = sm4_cmac_final, 809 + .setkey = sm4_xcbc_setkey, 810 + .descsize = sizeof(struct sm4_mac_desc_ctx), 811 + }, { 812 + .base = { 813 + .cra_name = "cbcmac(sm4)", 814 + .cra_driver_name = "cbcmac-sm4-ce", 815 + .cra_priority = 400, 816 + .cra_blocksize = 1, 817 + .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx), 818 + .cra_module = THIS_MODULE, 819 + }, 820 + .digestsize = SM4_BLOCK_SIZE, 821 + .init = sm4_mac_init, 822 + .update = sm4_mac_update, 823 + .final = sm4_cbcmac_final, 824 + .setkey = sm4_cbcmac_setkey, 825 + .descsize = sizeof(struct sm4_mac_desc_ctx), 826 + } 827 + }; 828 + 612 829 static int __init sm4_init(void) 613 830 { 614 - return crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); 831 + int err; 832 + 833 + err = crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); 834 + if (err) 835 + return err; 836 + 837 + err = crypto_register_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs)); 838 + if (err) 839 + goto out_err; 840 + 841 + return 0; 842 + 843 + out_err: 844 + crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); 845 + return err; 615 846 } 616 847 617 848 static void __exit sm4_exit(void) 618 849 { 850 + crypto_unregister_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs)); 619 851 crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs)); 620 852 } 621 853 ··· 878 616 MODULE_ALIAS_CRYPTO("ctr(sm4)"); 879 617 MODULE_ALIAS_CRYPTO("cts(cbc(sm4))"); 880 618 MODULE_ALIAS_CRYPTO("xts(sm4)"); 619 + MODULE_ALIAS_CRYPTO("cmac(sm4)"); 620 + MODULE_ALIAS_CRYPTO("xcbc(sm4)"); 621 + MODULE_ALIAS_CRYPTO("cbcmac(sm4)"); 881 622 MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); 882 623 MODULE_LICENSE("GPL v2");