Rename sexpt get_mem→mem, get_nth→nth · gazagnaire.org/ocaml-crypto@5b6650d

+1

config/cfg.ml

··· 34 34 match (arch, ccomp_type_opt) with 35 35 | `x86_64, Some "msvc" -> [ "-DACCELERATE" ] 36 36 | `x86_64, _ -> [ "-DACCELERATE"; "-mssse3"; "-maes"; "-mpclmul" ] 37 + | `arm64, _ -> [ "-DACCELERATE" ] 37 38 | _ -> [] 38 39 in 39 40 let ent_flags =

+224

src/c/aes_aesni.c

··· 362 362 363 363 #endif /* __mc_ACCELERATE__ */ 364 364 365 + /* ------------------------------------------------------------------ */ 366 + /* ARM64 Cryptography Extensions (FEAT_AES) */ 367 + /* */ 368 + /* Uses ARM NEON Crypto intrinsics (arm_neon.h): */ 369 + /* vaeseq_u8 - AES single round encrypt (AddRoundKey+SubBytes+ */ 370 + /* ShiftRows) */ 371 + /* vaesmcq_u8 - AES MixColumns */ 372 + /* vaesdq_u8 - AES single round decrypt */ 373 + /* vaesimcq_u8 - AES InverseMixColumns */ 374 + /* */ 375 + /* Reference: ARM Architecture Reference Manual, FEAT_AES. */ 376 + /* Same pattern as Linux kernel aes-ce-glue.c / OpenSSL aes-armv8.c. */ 377 + /* ------------------------------------------------------------------ */ 378 + 379 + #if defined (__mc_ARM64CE__) 380 + 381 + static int _mc_arm_aes_rk_size (uint8_t rounds) { 382 + return (rounds + 1) * 16; 383 + } 384 + 385 + /* 386 + * ARM vaeseq_u8 performs: AddRoundKey XOR, then SubBytes, then ShiftRows. 387 + * So the encryption sequence per round is: 388 + * block = vaesmcq_u8(vaeseq_u8(block, rk[i])) 389 + * Final round omits MixColumns: 390 + * block = vaeseq_u8(block, rk[Nr-1]) ^ rk[Nr] 391 + */ 392 + static inline void _mc_arm_aes_enc (const uint8_t src[16], uint8_t dst[16], 393 + const uint8_t *rk0, uint8_t rounds) { 394 + uint8x16_t block = vld1q_u8(src); 395 + 396 + for (uint8_t i = 0; i < rounds - 1; i++) { 397 + uint8x16_t rk = vld1q_u8(rk0 + i * 16); 398 + block = vaesmcq_u8(vaeseq_u8(block, rk)); 399 + } 400 + /* Final round: SubBytes + ShiftRows + AddRoundKey (no MixColumns) */ 401 + block = vaeseq_u8(block, vld1q_u8(rk0 + (rounds - 1) * 16)); 402 + block = veorq_u8(block, vld1q_u8(rk0 + rounds * 16)); 403 + vst1q_u8(dst, block); 404 + } 405 + 406 + static inline void _mc_arm_aes_dec (const uint8_t src[16], uint8_t dst[16], 407 + const uint8_t *rk0, uint8_t rounds) { 408 + uint8x16_t block = vld1q_u8(src); 409 + 410 + for (uint8_t i = 0; i < rounds - 1; i++) { 411 + uint8x16_t rk = vld1q_u8(rk0 + i * 16); 412 + block = vaesimcq_u8(vaesdq_u8(block, rk)); 413 + } 414 + block = vaesdq_u8(block, vld1q_u8(rk0 + (rounds - 1) * 16)); 415 + block = veorq_u8(block, vld1q_u8(rk0 + rounds * 16)); 416 + vst1q_u8(dst, block); 417 + } 418 + 419 + /* 4-block parallel encrypt for throughput */ 420 + static inline void _mc_arm_aes_enc4 (const uint8_t src[64], uint8_t dst[64], 421 + const uint8_t *rk0, uint8_t rounds) { 422 + uint8x16_t b0 = vld1q_u8(src); 423 + uint8x16_t b1 = vld1q_u8(src + 16); 424 + uint8x16_t b2 = vld1q_u8(src + 32); 425 + uint8x16_t b3 = vld1q_u8(src + 48); 426 + 427 + for (uint8_t i = 0; i < rounds - 1; i++) { 428 + uint8x16_t rk = vld1q_u8(rk0 + i * 16); 429 + b0 = vaesmcq_u8(vaeseq_u8(b0, rk)); 430 + b1 = vaesmcq_u8(vaeseq_u8(b1, rk)); 431 + b2 = vaesmcq_u8(vaeseq_u8(b2, rk)); 432 + b3 = vaesmcq_u8(vaeseq_u8(b3, rk)); 433 + } 434 + uint8x16_t rk2 = vld1q_u8(rk0 + (rounds - 1) * 16); 435 + uint8x16_t rkl = vld1q_u8(rk0 + rounds * 16); 436 + b0 = veorq_u8(vaeseq_u8(b0, rk2), rkl); 437 + b1 = veorq_u8(vaeseq_u8(b1, rk2), rkl); 438 + b2 = veorq_u8(vaeseq_u8(b2, rk2), rkl); 439 + b3 = veorq_u8(vaeseq_u8(b3, rk2), rkl); 440 + 441 + vst1q_u8(dst, b0); 442 + vst1q_u8(dst + 16, b1); 443 + vst1q_u8(dst + 32, b2); 444 + vst1q_u8(dst + 48, b3); 445 + } 446 + 447 + static inline void _mc_arm_aes_dec4 (const uint8_t src[64], uint8_t dst[64], 448 + const uint8_t *rk0, uint8_t rounds) { 449 + uint8x16_t b0 = vld1q_u8(src); 450 + uint8x16_t b1 = vld1q_u8(src + 16); 451 + uint8x16_t b2 = vld1q_u8(src + 32); 452 + uint8x16_t b3 = vld1q_u8(src + 48); 453 + 454 + for (uint8_t i = 0; i < rounds - 1; i++) { 455 + uint8x16_t rk = vld1q_u8(rk0 + i * 16); 456 + b0 = vaesimcq_u8(vaesdq_u8(b0, rk)); 457 + b1 = vaesimcq_u8(vaesdq_u8(b1, rk)); 458 + b2 = vaesimcq_u8(vaesdq_u8(b2, rk)); 459 + b3 = vaesimcq_u8(vaesdq_u8(b3, rk)); 460 + } 461 + uint8x16_t rk2 = vld1q_u8(rk0 + (rounds - 1) * 16); 462 + uint8x16_t rkl = vld1q_u8(rk0 + rounds * 16); 463 + b0 = veorq_u8(vaesdq_u8(b0, rk2), rkl); 464 + b1 = veorq_u8(vaesdq_u8(b1, rk2), rkl); 465 + b2 = veorq_u8(vaesdq_u8(b2, rk2), rkl); 466 + b3 = veorq_u8(vaesdq_u8(b3, rk2), rkl); 467 + 468 + vst1q_u8(dst, b0); 469 + vst1q_u8(dst + 16, b1); 470 + vst1q_u8(dst + 32, b2); 471 + vst1q_u8(dst + 48, b3); 472 + } 473 + 474 + static inline void _mc_arm_aes_enc_blocks (const uint8_t *src, uint8_t *dst, 475 + const uint8_t *rk, uint8_t rounds, 476 + size_t blocks) { 477 + while (blocks >= 4) { 478 + _mc_arm_aes_enc4(src, dst, rk, rounds); 479 + src += 64; dst += 64; blocks -= 4; 480 + } 481 + while (blocks--) { 482 + _mc_arm_aes_enc(src, dst, rk, rounds); 483 + src += 16; dst += 16; 484 + } 485 + } 486 + 487 + static inline void _mc_arm_aes_dec_blocks (const uint8_t *src, uint8_t *dst, 488 + const uint8_t *rk, uint8_t rounds, 489 + size_t blocks) { 490 + while (blocks >= 4) { 491 + _mc_arm_aes_dec4(src, dst, rk, rounds); 492 + src += 64; dst += 64; blocks -= 4; 493 + } 494 + while (blocks--) { 495 + _mc_arm_aes_dec(src, dst, rk, rounds); 496 + src += 16; dst += 16; 497 + } 498 + } 499 + 500 + /* 501 + * ARM64 key schedule helpers. 502 + * 503 + * The generic key expansion stores round keys as uint32_t in host byte 504 + * order (little-endian on ARM64). ARM AES instructions (vaeseq_u8 etc.) 505 + * operate on byte arrays where each 4-byte column is in big-endian order 506 + * (matching the AES standard's byte layout). We byte-swap each uint32_t 507 + * after the generic expansion. 508 + */ 509 + static void _mc_arm_fixup_rk (uint8_t *rk_bytes, int rounds) { 510 + uint32_t *rk = (uint32_t *)rk_bytes; 511 + int nwords = (rounds + 1) * 4; 512 + for (int i = 0; i < nwords; i++) 513 + rk[i] = __builtin_bswap32(rk[i]); 514 + } 515 + 516 + /* 517 + * ARM64 decrypt key inversion: apply InvMixColumns to middle round keys 518 + * using vaesimcq_u8 (equivalent of x86 _mm_aesimc_si128). 519 + */ 520 + static void _mc_arm_invert_e_key (const uint8_t *ek, uint8_t *dk, 521 + uint8_t rounds) { 522 + /* Copy last encryption round key → first decrypt round key */ 523 + vst1q_u8(dk, vld1q_u8(ek + rounds * 16)); 524 + /* InvMixColumns on middle round keys */ 525 + for (uint8_t i = 1; i < rounds; i++) 526 + vst1q_u8(dk + i * 16, vaesimcq_u8(vld1q_u8(ek + (rounds - i) * 16))); 527 + /* Copy first encryption round key → last decrypt round key */ 528 + vst1q_u8(dk + rounds * 16, vld1q_u8(ek)); 529 + } 530 + 531 + #endif /* __mc_ARM64CE__ */ 532 + 533 + /* ------------------------------------------------------------------ */ 534 + /* Dispatch (platform-aware) */ 535 + /* ------------------------------------------------------------------ */ 536 + 365 537 CAMLprim value 366 538 mc_aes_rk_size (value rounds) { 367 539 value s; 540 + #if defined(__mc_ARM64CE__) 541 + _mc_switch_accel(arm_aes, 542 + s = mc_aes_rk_size_generic(rounds), 543 + s = Val_int (_mc_arm_aes_rk_size (Int_val (rounds)))) 544 + #else 368 545 _mc_switch_accel(aesni, 369 546 s = mc_aes_rk_size_generic(rounds), 370 547 s = Val_int (_mc_aesni_rk_size (Int_val (rounds)))) 548 + #endif 371 549 return s; 372 550 } 373 551 374 552 CAMLprim value 375 553 mc_aes_derive_e_key (value key, value rk, value rounds) { 554 + #if defined(__mc_ARM64CE__) 555 + mc_aes_derive_e_key_generic(key, rk, rounds); 556 + if (mc_detected_cpu_features.arm_aes) 557 + _mc_arm_fixup_rk(_bp_uint8(rk), Int_val(rounds)); 558 + #else 376 559 _mc_switch_accel(aesni, 377 560 mc_aes_derive_e_key_generic(key, rk, rounds), 378 561 _mc_aesni_derive_e_key (_st_uint8 (key), 379 562 _bp_uint8 (rk), 380 563 Int_val (rounds))) 564 + #endif 381 565 return Val_unit; 382 566 } 383 567 384 568 CAMLprim value 385 569 mc_aes_derive_d_key (value key, value kr, value rounds, value rk) { 570 + #if defined(__mc_ARM64CE__) 571 + if (mc_detected_cpu_features.arm_aes) { 572 + /* First derive encryption keys (byte-swapped for ARM) */ 573 + mc_aes_derive_e_key_generic(key, kr, rounds); 574 + _mc_arm_fixup_rk(_bp_uint8(kr), Int_val(rounds)); 575 + /* Then invert for decryption using vaesimcq_u8 */ 576 + uint8_t tmp[15 * 16]; /* max AES-256: 15 round keys */ 577 + memcpy(tmp, _bp_uint8(kr), (Int_val(rounds) + 1) * 16); 578 + _mc_arm_invert_e_key(tmp, _bp_uint8(kr), Int_val(rounds)); 579 + } else { 580 + mc_aes_derive_d_key_generic(key, kr, rounds, rk); 581 + } 582 + #else 386 583 _mc_switch_accel(aesni, 387 584 mc_aes_derive_d_key_generic(key, kr, rounds, rk), 388 585 _mc_aesni_derive_d_key (_st_uint8 (key), 389 586 _bp_uint8 (kr), 390 587 Int_val (rounds), 391 588 Is_block(rk) ? _bp_uint8(Field(rk, 0)) : 0)) 589 + #endif 392 590 return Val_unit; 393 591 } 394 592 395 593 CAMLprim value 396 594 mc_aes_enc (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 595 + #if defined(__mc_ARM64CE__) 596 + _mc_switch_accel(arm_aes, 597 + mc_aes_enc_generic(src, off1, dst, off2, rk, rounds, blocks), 598 + _mc_arm_aes_enc_blocks ( _st_uint8_off (src, off1), 599 + _bp_uint8_off (dst, off2), 600 + _st_uint8 (rk), 601 + Int_val (rounds), 602 + Int_val (blocks) )) 603 + #else 397 604 _mc_switch_accel(aesni, 398 605 mc_aes_enc_generic(src, off1, dst, off2, rk, rounds, blocks), 399 606 _mc_aesni_enc_blocks ( _st_uint8_off (src, off1), ··· 401 608 _st_uint8 (rk), 402 609 Int_val (rounds), 403 610 Int_val (blocks) )) 611 + #endif 404 612 return Val_unit; 405 613 } 406 614 407 615 CAMLprim value 408 616 mc_aes_dec (value src, value off1, value dst, value off2, value rk, value rounds, value blocks) { 617 + #if defined(__mc_ARM64CE__) 618 + _mc_switch_accel(arm_aes, 619 + mc_aes_dec_generic(src, off1, dst, off2, rk, rounds, blocks), 620 + _mc_arm_aes_dec_blocks ( _st_uint8_off (src, off1), 621 + _bp_uint8_off (dst, off2), 622 + _st_uint8 (rk), 623 + Int_val (rounds), 624 + Int_val (blocks) )) 625 + #else 409 626 _mc_switch_accel(aesni, 410 627 mc_aes_dec_generic(src, off1, dst, off2, rk, rounds, blocks), 411 628 _mc_aesni_dec_blocks ( _st_uint8_off (src, off1), ··· 413 630 _st_uint8 (rk), 414 631 Int_val (rounds), 415 632 Int_val (blocks) )) 633 + #endif 416 634 return Val_unit; 417 635 } 418 636 419 637 CAMLprim value mc_aes_mode (__unit ()) { 420 638 value enabled = 0; 639 + #if defined(__mc_ARM64CE__) 640 + _mc_switch_accel(arm_aes, 641 + enabled = 0, 642 + enabled = 1) 643 + #else 421 644 _mc_switch_accel(aesni, 422 645 enabled = 0, 423 646 enabled = 1) 647 + #endif 424 648 return Val_int (enabled); 425 649 } 426 650

+20 -6

src/c/crypto.h

··· 8 8 #include <caml/mlvalues.h> 9 9 10 10 #ifdef ACCELERATE 11 - # ifdef _MSC_VER 11 + # if defined(_MSC_VER) 12 12 # include <intrin.h> 13 - # else 13 + # define __mc_ACCELERATE__ 14 + # elif defined(__x86_64__) || defined(__i386__) 14 15 # include <x86intrin.h> 16 + # define __mc_ACCELERATE__ 17 + # elif defined(__aarch64__) 18 + # include <arm_neon.h> 19 + # define __mc_ARM64CE__ 15 20 # endif 16 - #define __mc_ACCELERATE__ 17 21 #define __mc_detect_features__ 18 22 #endif 19 23 ··· 25 29 #ifdef __mc_detect_features__ 26 30 27 31 struct _mc_cpu_features { 32 + /* x86 */ 28 33 int aesni; 29 34 int pclmul; 30 35 int ssse3; 31 36 int rdrand; 32 37 int rdseed; 38 + /* ARM64 Cryptography Extensions */ 39 + int arm_aes; 40 + int arm_pmull; 33 41 }; 34 42 35 43 /* Supported accelerations */ ··· 37 45 38 46 #endif /* __mc_detect_features__ */ 39 47 40 - #ifdef __mc_ACCELERATE__ 48 + #if defined(__mc_ACCELERATE__) 49 + 50 + #define _mc_switch_accel(FEATURE, GENERIC_CALL, ACCELERATED_CALL) \ 51 + if (!(mc_detected_cpu_features.FEATURE)) { GENERIC_CALL; } \ 52 + else { ACCELERATED_CALL; } 53 + 54 + #elif defined(__mc_ARM64CE__) 41 55 42 56 #define _mc_switch_accel(FEATURE, GENERIC_CALL, ACCELERATED_CALL) \ 43 57 if (!(mc_detected_cpu_features.FEATURE)) { GENERIC_CALL; } \ 44 58 else { ACCELERATED_CALL; } 45 59 46 - #else /* __mc_ACCELERATE__ */ 60 + #else 47 61 48 62 #define _mc_switch_accel(_FEATURE, GENERIC_CALL, _ACCELERATED_CALL) \ 49 63 GENERIC_CALL; 50 64 51 - #endif /* __mc_ACCELERATE__ */ 65 + #endif 52 66 53 67 #if defined (__x86_64__) || defined (__aarch64__) || defined (__powerpc64__) || defined (__ppc64__) || (64 == __riscv_xlen) || defined (__s390x__) || (defined (__mips__) && _MIPS_SIM==_ABI64) || defined (__loongarch_lp64) || (1 == _WIN64) 54 68 #define ARCH_64BIT

+47 -1

src/c/detect_cpu_features.c

··· 2 2 3 3 #ifdef __mc_detect_features__ 4 4 5 + #if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER) 5 6 #ifndef _MSC_VER 6 7 # include <cpuid.h> 8 + #endif 9 + #endif 10 + 11 + #if defined(__aarch64__) && defined(__linux__) 12 + # include <sys/auxval.h> 13 + # ifndef HWCAP_AES 14 + # define HWCAP_AES (1 << 3) 15 + # endif 16 + # ifndef HWCAP_PMULL 17 + # define HWCAP_PMULL (1 << 4) 18 + # endif 19 + #elif defined(__aarch64__) && defined(__FreeBSD__) 20 + # include <sys/auxval.h> 7 21 #endif 8 22 9 23 struct _mc_cpu_features mc_detected_cpu_features = { 0 }; ··· 77 91 } 78 92 #endif /* _MSC_VER */ 79 93 80 - #else /* __mc_detect_features__ */ 94 + #elif defined(__aarch64__) 95 + 96 + /* 97 + * ARM64 Cryptography Extensions detection. 98 + * 99 + * On Linux: getauxval(AT_HWCAP) returns HWCAP_AES and HWCAP_PMULL bits. 100 + * On macOS/iOS: Apple Silicon always has crypto extensions. 101 + * On FreeBSD: elf_aux_info(AT_HWCAP, ...) provides the same. 102 + */ 103 + CAMLprim value 104 + mc_detect_cpu_features (__unit ()) { 105 + #if defined(__APPLE__) 106 + /* Apple Silicon always supports AES and PMULL */ 107 + mc_detected_cpu_features.arm_aes = 1; 108 + mc_detected_cpu_features.arm_pmull = 1; 109 + #elif defined(__linux__) 110 + unsigned long hwcap = getauxval(AT_HWCAP); 111 + if (hwcap & HWCAP_AES) 112 + mc_detected_cpu_features.arm_aes = 1; 113 + if (hwcap & HWCAP_PMULL) 114 + mc_detected_cpu_features.arm_pmull = 1; 115 + #elif defined(__FreeBSD__) 116 + unsigned long hwcap = 0; 117 + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); 118 + if (hwcap & HWCAP_AES) 119 + mc_detected_cpu_features.arm_aes = 1; 120 + if (hwcap & HWCAP_PMULL) 121 + mc_detected_cpu_features.arm_pmull = 1; 122 + #endif 123 + return Val_unit; 124 + } 125 + 126 + #else /* __mc_detect_features__ but unknown arch */ 81 127 82 128 CAMLprim value 83 129 mc_detect_cpu_features (__unit ()) {

+147

src/c/ghash_pclmul.c

··· 188 188 189 189 #endif /* __mc_ACCELERATE__ */ 190 190 191 + /* ------------------------------------------------------------------ */ 192 + /* ARM64 PMULL-based GHASH */ 193 + /* */ 194 + /* Uses vmull_p64 for carry-less multiplication (same algorithm as */ 195 + /* the PCLMULQDQ path, different intrinsics). */ 196 + /* */ 197 + /* Reference: ARM Architecture Reference Manual, FEAT_PMULL. */ 198 + /* Same GF(2^128) reduction as Intel CLMUL white paper. */ 199 + /* ------------------------------------------------------------------ */ 200 + 201 + #if defined(__mc_ARM64CE__) 202 + 203 + #include <string.h> 204 + 205 + /* 206 + * Carry-less multiply two 64-bit values, producing a 128-bit result. 207 + */ 208 + static inline poly128_t __pmull (uint64_t a, uint64_t b) { 209 + return vmull_p64 ((poly64_t)a, (poly64_t)b); 210 + } 211 + 212 + /* Extract high/low 64 bits of a uint8x16_t (treating as 128-bit value) */ 213 + static inline uint64_t __lo64 (uint8x16_t v) { 214 + return vgetq_lane_u64(vreinterpretq_u64_u8(v), 0); 215 + } 216 + static inline uint64_t __hi64 (uint8x16_t v) { 217 + return vgetq_lane_u64(vreinterpretq_u64_u8(v), 1); 218 + } 219 + 220 + /* Construct uint8x16_t from two uint64_t */ 221 + static inline uint8x16_t __from64 (uint64_t hi, uint64_t lo) { 222 + uint64x2_t v = vcombine_u64(vcreate_u64(lo), vcreate_u64(hi)); 223 + return vreinterpretq_u8_u64(v); 224 + } 225 + 226 + static inline uint64_t __p128_lo (poly128_t p) { 227 + return (uint64_t)p; 228 + } 229 + static inline uint64_t __p128_hi (poly128_t p) { 230 + return (uint64_t)(p >> 64); 231 + } 232 + 233 + /* Byte-reverse a 128-bit value (GHASH uses reflected bit order) */ 234 + static inline uint8x16_t __bswap128 (uint8x16_t v) { 235 + static const uint8_t rev_idx[16] = { 236 + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 237 + }; 238 + return vqtbl1q_u8(v, vld1q_u8(rev_idx)); 239 + } 240 + 241 + /* 242 + * GF(2^128) multiply with reflected reduction (x^128 + x^7 + x^2 + x + 1). 243 + * 244 + * Reflected representation means the polynomial is bit-reversed compared to 245 + * the standard representation. The reduction constant is 0xc200000000000000 246 + * (same as the PCLMULQDQ path). 247 + */ 248 + static inline uint8x16_t __arm_gfmul (uint8x16_t H, uint8x16_t X) { 249 + uint64_t h0 = __lo64(H), h1 = __hi64(H); 250 + uint64_t x0 = __lo64(X), x1 = __hi64(X); 251 + 252 + /* Schoolbook multiplication: H * X = (h1:h0) * (x1:x0) 253 + * Product = (z3:z2:z1:z0) 254 + * z0 = h0 * x0 (low) 255 + * z3 = h1 * x1 (high) 256 + * z1,z2 = h0*x1 + h1*x0 (cross terms) */ 257 + poly128_t p_lo = __pmull(h0, x0); 258 + poly128_t p_hi = __pmull(h1, x1); 259 + poly128_t p_m1 = __pmull(h0, x1); 260 + poly128_t p_m2 = __pmull(h1, x0); 261 + 262 + uint64_t z0 = __p128_lo(p_lo); 263 + uint64_t z1 = __p128_hi(p_lo) ^ __p128_lo(p_m1) ^ __p128_lo(p_m2); 264 + uint64_t z2 = __p128_lo(p_hi) ^ __p128_hi(p_m1) ^ __p128_hi(p_m2); 265 + uint64_t z3 = __p128_hi(p_hi); 266 + 267 + /* Reflected reduction modulo x^128 + x^7 + x^2 + x + 1. 268 + * Reduce (z3:z2:z1:z0) to 128 bits. */ 269 + uint64_t v0, v1; 270 + poly128_t r0 = __pmull(z0, 0xc200000000000000ULL); 271 + v0 = z1 ^ __p128_hi(r0); 272 + v1 = z0 ^ __p128_lo(r0); 273 + 274 + poly128_t r1 = __pmull(v1, 0xc200000000000000ULL); 275 + v0 = z2 ^ v0 ^ __p128_hi(r1); 276 + v1 = z3 ^ v1 ^ __p128_lo(r1); 277 + 278 + return __from64(v1, v0); 279 + } 280 + 281 + #define __arm_keys 1 282 + 283 + static inline void __arm_derive (const uint8_t *key, uint8x16_t *m) { 284 + m[0] = __bswap128(vld1q_u8(key)); 285 + } 286 + 287 + static inline void __arm_ghash (uint8x16_t *m, uint8x16_t *hash, 288 + const uint8_t *src, size_t len) { 289 + uint8x16_t acc = __bswap128(*hash); 290 + uint8x16_t H = m[0]; 291 + 292 + while (len >= 16) { 293 + uint8x16_t block = __bswap128(vld1q_u8(src)); 294 + acc = __arm_gfmul(H, veorq_u8(acc, block)); 295 + src += 16; 296 + len -= 16; 297 + } 298 + if (len > 0) { 299 + uint8_t tmp[16] = { 0 }; 300 + memcpy(tmp, src, len); 301 + uint8x16_t block = __bswap128(vld1q_u8(tmp)); 302 + acc = __arm_gfmul(H, veorq_u8(acc, block)); 303 + } 304 + *hash = __bswap128(acc); 305 + } 306 + 307 + #endif /* __mc_ARM64CE__ */ 308 + 309 + /* ------------------------------------------------------------------ */ 310 + /* Dispatch */ 311 + /* ------------------------------------------------------------------ */ 312 + 191 313 CAMLprim value mc_ghash_key_size (__unit ()) { 192 314 value s; 315 + #if defined(__mc_ARM64CE__) 316 + _mc_switch_accel(arm_pmull, 317 + s = mc_ghash_key_size_generic(Val_unit), 318 + s = Val_int (__arm_keys * 16)) 319 + #else 193 320 _mc_switch_accel(pclmul, 194 321 s = mc_ghash_key_size_generic(Val_unit), 195 322 s = Val_int (__keys * 16)) 323 + #endif 196 324 return s; 197 325 } 198 326 199 327 CAMLprim value mc_ghash_init_key (value key, value m) { 328 + #if defined(__mc_ARM64CE__) 329 + _mc_switch_accel(arm_pmull, 330 + mc_ghash_init_key_generic(key, m), 331 + __arm_derive (_st_uint8 (key), (uint8x16_t *) Bp_val (m))) 332 + #else 200 333 _mc_switch_accel(pclmul, 201 334 mc_ghash_init_key_generic(key, m), 202 335 __derive ((__m128i *) _st_uint8 (key), (__m128i *) Bp_val (m))) 336 + #endif 203 337 return Val_unit; 204 338 } 205 339 206 340 CAMLprim value 207 341 mc_ghash (value k, value hash, value src, value off, value len) { 342 + #if defined(__mc_ARM64CE__) 343 + _mc_switch_accel(arm_pmull, 344 + mc_ghash_generic(k, hash, src, off, len), 345 + __arm_ghash ( (uint8x16_t *) Bp_val (k), (uint8x16_t *) Bp_val (hash), 346 + _st_uint8_off (src, off), Int_val (len) )) 347 + #else 208 348 _mc_switch_accel(pclmul, 209 349 mc_ghash_generic(k, hash, src, off, len), 210 350 __ghash ( (__m128i *) Bp_val (k), (__m128i *) Bp_val (hash), 211 351 (__m128i *) _st_uint8_off (src, off), Int_val (len) )) 352 + #endif 212 353 return Val_unit; 213 354 } 214 355 215 356 CAMLprim value mc_ghash_mode (__unit ()) { 216 357 value enabled = 0; 358 + #if defined(__mc_ARM64CE__) 359 + _mc_switch_accel(arm_pmull, 360 + enabled = 0, 361 + enabled = 1) 362 + #else 217 363 _mc_switch_accel(pclmul, 218 364 enabled = 0, 219 365 enabled = 1) 366 + #endif 220 367 return Val_int (enabled); 221 368 }

Configure Feed

Configure Feed